summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRemi Collet <fedora@famillecollet.com>2013-03-20 10:29:29 +0100
committerRemi Collet <fedora@famillecollet.com>2013-03-20 10:29:29 +0100
commit6deac027c98f5d99e1805f9ddc21ff2dbebe0fb7 (patch)
tree008990c48199f2d517fc9b1a4b47c6b162ec30ef
compat-icu36: new package (for EL-5)HEADmaster
-rw-r--r--Makefile4
-rw-r--r--canonicalize.patch11
-rw-r--r--compat-icu36.spec189
-rw-r--r--icu-3.4-multiarchdevel.patch70
-rwxr-xr-xicu-config387
-rw-r--r--icu.icu5365.dependantvowels.patch11
-rw-r--r--icu.icu5418.malayam.patch39
-rw-r--r--icu.icu5431.malayam.patch107
-rw-r--r--icu.icu5433.oriya.patch31
-rw-r--r--icu.icu5465.telegu.patch29
-rw-r--r--icu.icu5483.backport.patch874
-rw-r--r--icu.icu5488.assamese.patch11
-rw-r--r--icu.icu5500.devicetablecrash.patch11
-rw-r--r--icu.icu5501.sinhala.biggerexpand.patch11
-rw-r--r--icu.icu5506.multiplevowels.patch61
-rw-r--r--icu.icu5557.safety.patch14
-rw-r--r--icu.icu5594.gujarati.patch14
-rw-r--r--icu.icu5691.backport.patch730
-rw-r--r--icu.icu5797.backport.patch749
-rw-r--r--icu.icu6001.backport.patch741
-rw-r--r--icu.icu6002.backport.patch397
-rw-r--r--icu.icu6175.emptysegments.patch535
-rw-r--r--icu.icuXXXX.malayalam.bysyllable.patch250
-rw-r--r--icu.icuXXXX.rollbackabi.patch131
-rw-r--r--icu.icuXXXX.virama.prevnext.patch98
-rw-r--r--icu.rh429023.regexp.patch307
26 files changed, 5812 insertions, 0 deletions
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..1e65467
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,4 @@
+SRCDIR := $(shell pwd)
+NAME := $(shell basename $(SRCDIR))
+include ../common/Makefile
+
diff --git a/canonicalize.patch b/canonicalize.patch
new file mode 100644
index 0000000..3ff9c33
--- /dev/null
+++ b/canonicalize.patch
@@ -0,0 +1,11 @@
+--- source/common/uloc.c 2011-12-12 04:50:00.601092000 -0500
++++ source/common/uloc.c 2011-12-12 04:56:18.503570000 -0500
+@@ -1712,7 +1712,7 @@
+ /* Check for EURO variants. */
+ sawEuro = _deleteVariant(variant, variantSize, "EURO", 4);
+ len -= sawEuro;
+- if (sawEuro > 0 && name[len-1] == '_') { /* delete trailing '_' */
++ if (sawEuro > 0 && len > 0 && name[len-1] == '_') { /* delete trailing '_' */
+ --len;
+ }
+
diff --git a/compat-icu36.spec b/compat-icu36.spec
new file mode 100644
index 0000000..d8a820e
--- /dev/null
+++ b/compat-icu36.spec
@@ -0,0 +1,189 @@
+Name: compat-icu36
+Version: 3.6
+Release: 5.16.1
+Summary: International Components for Unicode
+
+Group: System Environment/Libraries
+License: X License
+URL: http://www.ibm.com/software/globalization/icu/
+Source0: ftp://ftp.software.ibm.com/software/globalization/icu/icu4c-3_6-src.tgz
+BuildRoot: %{_tmppath}/%{name}-%{version}-root
+
+BuildRequires: doxygen, autoconf
+Patch1: icu-3.4-multiarchdevel.patch
+Patch2: icu-config
+Patch3: icu.icu5365.dependantvowels.patch
+Patch4: icu.icu5418.malayam.patch
+Patch5: icu.icu5431.malayam.patch
+Patch6: icu.icu5433.oriya.patch
+Patch7: icu.icuXXXX.virama.prevnext.patch
+Patch8: icu.icu5465.telegu.patch
+Patch9: icu.icu5488.assamese.patch
+Patch10: icu.icu5500.devicetablecrash.patch
+Patch11: icu.icu5501.sinhala.biggerexpand.patch
+Patch12: icu.icu5557.safety.patch
+Patch13: icu.icu5594.gujarati.patch
+Patch14: icu.icu5506.multiplevowels.patch
+Patch15: icu.icuXXXX.malayalam.bysyllable.patch
+Patch16: icu.rh429023.regexp.patch
+Patch17: icu.icu5483.backport.patch
+Patch18: icu.icu5797.backport.patch
+Patch19: icu.icu6001.backport.patch
+Patch20: icu.icu6002.backport.patch
+Patch21: icu.icu6175.emptysegments.patch
+Patch22: icu.icu5691.backport.patch
+Patch23: icu.icuXXXX.rollbackabi.patch
+Patch24: canonicalize.patch
+Conflicts: icu
+
+%description
+The International Components for Unicode (ICU) libraries provide
+robust and full-featured Unicode services on a wide variety of
+platforms. ICU supports the most current version of the Unicode
+standard, and it provides support for supplementary Unicode
+characters (needed for GB 18030 repertoire support).
+As computing environments become more heterogeneous, software
+portability becomes more important. ICU lets you produce the same
+results across all the various platforms you support, without
+sacrificing performance. It offers great flexibility to extend and
+customize the supplied services.
+
+
+%package -n compat-libicu36
+Summary: International Components for Unicode - libraries
+Group: System Environment/Libraries
+
+%description -n compat-libicu36
+%{summary}.
+
+This package provides the ICU libraries for packages built
+against version %{version}.
+
+%package -n compat-libicu36-devel
+Summary: Development files for International Components for Unicode
+Group: Development/Libraries
+Requires: compat-libicu36 = %{version}-%{release}
+Requires: pkgconfig
+Conflicts: libicu-devel
+
+%description -n compat-libicu36-devel
+%{summary}.
+
+%package -n compat-libicu36-doc
+Summary: Documentation for International Components for Unicode
+Group: Documentation
+
+%description -n compat-libicu36-doc
+%{summary}.
+
+
+%prep
+%setup -q -n icu
+%patch1 -p1 -b .multiarchdevel
+%patch3 -p1 -b .dependantvowels
+%patch4 -p1 -b .icu5418.malayam.patch
+%patch5 -p1 -b .icu5431.malayam.patch
+%patch6 -p1 -b .icu5433.oriya.patch
+%patch7 -p1 -b .icuXXXX.virama.prevnext.patch
+%patch8 -p1 -b .icu5465.telegu.patch
+%patch9 -p1 -b .icu5488.assamese.patch
+%patch10 -p1 -b .icu5500.devicetablecrash.patch
+%patch11 -p1 -b .icu5501.sinhala.biggerexpand.patch
+%patch12 -p1 -b .icu5557.safety.patch
+%patch13 -p1 -b .icu5594.gujarati.patch
+%patch14 -p1 -b .icu5506.multiplevowels.patch
+%patch15 -p1 -b .icuXXXX.malayalam.bysyllable.patch
+%patch16 -p1 -b .rh429023.regexp.patch
+%patch17 -p1 -b .icu5483.backport.patch
+%patch18 -p1 -b .icu5797.backport.patch
+%patch19 -p1 -b .icu6001.backport.patch
+%patch20 -p1 -b .icu6002.backport.patch
+%patch21 -p1 -b .icu6175.emptysegments.patch
+%patch22 -p1 -b .icu5691.backport.patch
+%patch23 -p1 -b .icuXXXX.rollbackabi.patch
+%patch24 -p0 -b .canonicalize.patch
+
+%build
+cd source
+export CFLAGS="$RPM_OPT_FLAGS -fno-strict-aliasing"
+export CXXFLAGS="$RPM_OPT_FLAGS -fno-strict-aliasing"
+autoconf
+%configure --with-data-packaging=library --disable-samples
+#rhbz#654590
+sed -i -- "s/-nodefaultlibs -nostdlib//" config/mh-linux
+make # %{?_smp_mflags} # -j(X>1) may "break" man pages as of 3.2, b.f.u #2357
+make doc
+
+%install
+rm -rf $RPM_BUILD_ROOT source/__docs
+make -C source install DESTDIR=$RPM_BUILD_ROOT
+make -C source install-doc docdir=__docs
+chmod +x $RPM_BUILD_ROOT%{_libdir}/*.so.*
+cp %{PATCH2} $RPM_BUILD_ROOT%{_bindir}/icu-config
+chmod a+x $RPM_BUILD_ROOT%{_bindir}/icu-config
+sed -i s/\\\$\(THREADSCXXFLAGS\)// $RPM_BUILD_ROOT/%{_libdir}/pkgconfig/icu.pc
+sed -i s/\\\$\(THREADSCPPFLAGS\)/-D_REENTRANT/ $RPM_BUILD_ROOT/%{_libdir}/pkgconfig/icu.pc
+
+%check
+make -C source check
+
+
+%clean
+rm -rf $RPM_BUILD_ROOT
+
+
+%post -n compat-libicu36 -p /sbin/ldconfig
+
+%postun -n compat-libicu36 -p /sbin/ldconfig
+
+
+%files
+%defattr(-,root,root,-)
+%doc license.html readme.html
+%{_bindir}/derb
+%{_bindir}/genbrk
+%{_bindir}/gencnval
+%{_bindir}/genctd
+%{_bindir}/genrb
+%{_bindir}/makeconv
+%{_bindir}/pkgdata
+%{_bindir}/uconv
+%{_sbindir}/*
+%{_mandir}/man1/derb.1*
+%{_mandir}/man1/gencnval.1*
+%{_mandir}/man1/genrb.1*
+%{_mandir}/man1/genbrk.1*
+%{_mandir}/man1/genctd.1*
+%{_mandir}/man1/makeconv.1*
+%{_mandir}/man1/pkgdata.1*
+%{_mandir}/man1/uconv.1*
+%{_mandir}/man8/*.8*
+
+%files -n compat-libicu36
+%defattr(-,root,root,-)
+%{_libdir}/*.so.*
+
+%files -n compat-libicu36-devel
+%defattr(-,root,root,-)
+%{_bindir}/icu-config
+%{_mandir}/man1/icu-config.1*
+%{_includedir}/layout
+%{_includedir}/unicode
+%{_libdir}/*.so
+%{_libdir}/icu
+%{_libdir}/pkgconfig/icu.pc
+%dir %{_datadir}/icu
+%dir %{_datadir}/icu/3.6
+%{_datadir}/icu/3.6/mkinstalldirs
+%{_datadir}/icu/3.6/config
+%doc %{_datadir}/icu/3.6/license.html
+
+%files -n compat-libicu36-doc
+%defattr(-,root,root,-)
+%doc source/__docs/icu/html/*
+
+
+%changelog
+* Wed Mar 20 2013 Remi Collet <RPMS@famillecollet.com> - 3.6-5.16.1
+- new package from RHEL-5 spec of icu.
+
diff --git a/icu-3.4-multiarchdevel.patch b/icu-3.4-multiarchdevel.patch
new file mode 100644
index 0000000..a7839aa
--- /dev/null
+++ b/icu-3.4-multiarchdevel.patch
@@ -0,0 +1,70 @@
+--- icu/source/configure.in.orig 2006-05-02 12:10:31.000000000 +0100
++++ icu/source/configure.in 2006-05-02 15:06:07.000000000 +0100
+@@ -1011,6 +1011,7 @@
+ Makefile \
+ data/icupkg.inc \
+ config/Makefile.inc \
++ config/icu.pc \
+ data/Makefile \
+ stubdata/Makefile \
+ common/Makefile \
+--- /dev/null 2006-04-29 13:38:37.035974750 +0100
++++ icu/source/config/icu.pc.in 2006-05-02 15:03:14.000000000 +0100
+@@ -0,0 +1,46 @@
++prefix = @prefix@
++bindir = @bindir@
++exec_prefix = @exec_prefix@
++libdir = @libdir@
++includedir = @includedir@
++datadir = @datadir@
++sbindir = @sbindir@
++mandir = @mandir@
++sysconfdir = @sysconfdir@
++CFLAGS = @CFLAGS@
++CXXFLAGS = @CXXFLAGS@
++DEFS = @DEFS@
++UNICODE_VERSION=@UNICODE_VERSION@
++ICUPREFIX=icu
++ICULIBSUFFIX=@ICULIBSUFFIX@
++LIBICU=lib${ICUPREFIX}
++LIBCPPFLAGS=-D_REENTRANT
++CPPFLAGS=@CPPFLAGS@ ${LIBCPPFLAGS} -I${prefix}/include
++SHAREDLIBCPPFLAGS=-DPIC
++SHAREDLIBCXXFLAGS=-fPIC
++SHAREDLIBCFLAGS=-fPIC
++pkglibdir=${libdir}/@PACKAGE@${ICULIBSUFFIX}/@VERSION@
++pkgdatadir=${datadir}/@PACKAGE@${ICULIBSUFFIX}/@VERSION@
++ICUDATA_NAME = icudt@LIB_VERSION_MAJOR@@ICUDATA_CHAR@
++ICUPKGDATA_DIR=@libdir@
++ICUDATA_DIR=${pkgdatadir}
++SO=so
++ICULIBS_COMMON_LIB_NAME=${LIBICU}uc${ICULIBSUFFIX}.${SO}
++SHLIB_cc=cxx ${DEFS} ${CPPFLAGS} ${CXXFLAGS} @LDFLAGS@ -shared
++SHLIB_c=cc ${DEFS} ${CPPFLAGS} ${CFLAGS} @LDFLAGS@ -shared
++ICULIBS_LAYOUT = -l${ICUPREFIX}le${ICULIBSUFFIX} -l${ICUPREFIX}lx${ICULIBSUFFIX}
++ICULIBS_TOOLUTIL = -l${ICUPREFIX}tu${ICULIBSUFFIX}
++ICULIBS_OBSOLETE = -l${ICUPREFIX}obsolete${ICULIBSUFFIX}
++ICULIBS_ICUIO = -l${ICUPREFIX}io${ICULIBSUFFIX}
++ICULIBS_I18N = -l${ICUPREFIX}i18n${ICULIBSUFFIX}
++ICULIBS_COMMON = -l${ICUPREFIX}uc${ICULIBSUFFIX}
++ICULIBS_DATA = -l${ICUPREFIX}data${ICULIBSUFFIX}
++ICULIBS_LIBSONLY = ${ICULIBS_I18N} ${ICULIBS_COMMON} ${ICULIBS_DATA}
++ICULIBS_SYSTEMLIBS = @LIBS@
++ICULIBS_BASE = @LIBS@ -L${libdir}
++ICULIBS = ${ICULIBS_BASE} ${ICULIBS_I18N} ${ICULIBS_COMMON} ${ICULIBS_DATA}
++
++Name: @PACKAGE@
++Description: International Components for Unicode
++Version: @VERSION@
++Libs: @LDFLAGS@ ${ICULIBS} @LIBS@
+--- icu/source/Makefile.in.orig 2006-05-02 12:10:31.000000000 +0100
++++ icu/source/Makefile.in 2006-05-02 15:18:15.000000000 +0100
+@@ -125,6 +125,8 @@
+ @$(MKINSTALLDIRS) $(DESTDIR)$(sbindir)
+ $(INSTALL_DATA) @platform_make_fragment@ $(DESTDIR)$(pkgdatadir)/config/@platform_make_fragment_name@
+ $(INSTALL_SCRIPT) $(top_srcdir)/mkinstalldirs $(DESTDIR)$(pkgdatadir)/mkinstalldirs
++ @$(MKINSTALLDIRS) $(DESTDIR)$(libdir)/pkgconfig
++ $(INSTALL_DATA) $(top_srcdir)/config/icu.pc $(DESTDIR)$(libdir)/pkgconfig/icu.pc
+ $(INSTALL_DATA) $(top_srcdir)/../license.html $(DESTDIR)$(pkgdatadir)/license.html
+ $(INSTALL_SCRIPT) $(top_builddir)/config/icu-config $(DESTDIR)$(bindir)/icu-config
+ $(INSTALL_DATA) $(top_builddir)/config/Makefile.inc $(DESTDIR)$(pkglibdir)/Makefile.inc
diff --git a/icu-config b/icu-config
new file mode 100755
index 0000000..08f9ce8
--- /dev/null
+++ b/icu-config
@@ -0,0 +1,387 @@
+#!/bin/sh
+## -*-sh-*-
+#set -x
+# BEGIN of icu-config-top
+#******************************************************************************
+# Copyright (C) 1999-2004, International Business Machines
+# Corporation and others. All Rights Reserved.
+#******************************************************************************
+# This script is designed to aid configuration of ICU.
+# rpath links a library search path right into the binaries.
+
+
+### END of icu-config-top
+
+## Seed the prefix variables from the installed icu.pc.
+exec_prefix=$(pkg-config --variable=exec_prefix icu)
+execprefix=${exec_prefix}
+prefix=$(pkg-config --variable=prefix icu)
+
+
+loaddefs() # (re)reads the reported paths and versions from the installed icu.pc
+{
+LDLIBRARYPATH_ENVVAR=LD_LIBRARY_PATH
+bindir=$(pkg-config --variable=bindir icu)
+sbindir=$(pkg-config --variable=sbindir icu)
+libdir=$(pkg-config --variable=libdir icu)
+sysconfdir=$(pkg-config --variable=sysconfdir icu)
+mandir=$(pkg-config --variable=mandir icu)
+datadir=$(pkg-config --variable=datadir icu)
+pkglibdir=$(pkg-config --variable=pkglibdir icu)
+ICULIBS_COMMON_LIB_NAME=$(pkg-config --variable=ICULIBS_COMMON_LIB_NAME icu)
+UNICODE_VERSION=$(pkg-config --variable=UNICODE_VERSION icu)
+VERSION=$(pkg-config --modversion icu)
+SO=$(pkg-config --variable=SO icu)
+
+## -*-sh-*-
+## BEGIN of icu-config-bottom.
+## Copyright (c) 2002-2004, International Business Machines Corporation and
+## others. All Rights Reserved.
+
+ICUUC_FILE="${libdir}/${ICULIBS_COMMON_LIB_NAME}"
+
+# Fall back to the dll pkgdata mode when PKGDATA_MODE is unset or empty.
+if [ -z "$PKGDATA_MODE" ]; then
+ PKGDATA_MODE=dll
+fi
+
+}
+
+## The actual code of icu-config goes here.
+
+ME=$(basename $0)
+
+allflags() # one help line per option; shortusage also consumes this output
+{
+ printf '%s\n' " --bindir Print binary directory path (bin)"
+ printf '%s\n' " --cc Print C compiler used [CC]"
+ printf '%s\n' " --cflags Print C compiler flags [CFLAGS]"
+ printf '%s\n' " --cflags-dynamic Print additional C flags for"
+ printf '%s\n' " building shared libraries."
+ printf '%s\n' " --cppflags Print C Preprocessor flags [CPPFLAGS]"
+ printf '%s\n' " --cppflags-dynamic Print additional C Preprocessor flags for"
+ printf '%s\n' " building shared libraries."
+ printf '%s\n' " --cppflags-searchpath Print only -I include directives (-Iinclude)"
+ printf '%s\n' " --cxx Print C++ compiler used [CXX]"
+ printf '%s\n' " --cxxflags Print C++ compiler flags [CXXFLAGS]"
+ printf '%s\n' " --cxxflags-dynamic Print additional C++ flags for"
+ printf '%s\n' " building shared libraries."
+ printf '%s\n' " --detect-prefix Attempt to detect prefix based on PATH"
+ printf '%s\n' " --exec-prefix Print prefix for executables (/bin)"
+ printf '%s\n' " --exists Return with 0 status if ICU exists else fail"
+ printf '%s\n' " --help, -?, --usage Print this message"
+ printf '%s\n' " --icudata Print shortname of ICU data file (icudt21l)"
+ printf '%s\n' " --icudata-install-dir Print path to install data to - use as --install option to pkgdata(1)"
+ printf '%s\n' " --icudata-mode Print default ICU pkgdata mode (dll) - use as --mode option to pkgdata(1)."
+ printf '%s\n' " --icudatadir Print path to packaged archive data. Can set as [ICU_DATA]"
+ printf '%s\n' " --invoke Print commands to invoke an ICU program"
+ printf '%s\n' " --invoke=<prog> Print commands to invoke an ICU program named <prog> (ex: genrb)"
+ printf '%s\n' " --ldflags Print -L search path and -l libraries to link with ICU [LDFLAGS]. This is for the data, uc (common), and i18n libraries only. "
+ printf '%s\n' " --ldflags-layout Print ICU layout engine link directive. Use in addition to --ldflags"
+ printf '%s\n' " --ldflags-libsonly Same as --ldflags, but only the -l directives"
+ printf '%s\n' " --ldflags-searchpath Print only -L (search path) directive"
+ printf '%s\n' " --ldflags-system Print only system libs ICU links with (-lpthread, -lm)"
+ printf '%s\n' " --ldflags-icuio Print ICU icuio link directive. Use in addition to --ldflags "
+ printf '%s\n' " --ldflags-obsolete Print ICU obsolete link directive. Use in addition to --ldflags. (requires icuapps/obsolete to be built and installed.) "
+ printf '%s\n' " --mandir Print manpage (man) path"
+ printf '%s\n' " --prefix Print PREFIX to icu install (/usr/local)"
+ printf '%s\n' " --prefix=XXX Set prefix to XXX for remainder of command"
+ printf '%s\n' " --sbindir Print system binary path (sbin) "
+ printf '%s\n' " --shared-datadir Print shared data (share) path. This is NOT the ICU data dir."
+ printf '%s\n' " --shlib-c Print the command to compile and build C shared libraries with ICU"
+ printf '%s\n' " --shlib-cc Print the command to compile and build C++ shared libraries with ICU"
+ printf '%s\n' " --sysconfdir Print system config (etc) path"
+ printf '%s\n' " --unicode-version Print version of Unicode data used in ICU ($UNICODE_VERSION)"
+ printf '%s\n' " --version Print ICU version ($VERSION)"
+ printf '%s\n' " --incfile Print path to Makefile.inc (for -O option of pkgdata)"
+}
+
+## One-line synopsis: the first 25 columns of every allflags line, each wrapped in [ ].
+shortusage()
+{
+ echo "usage: ${ME} " $(allflags | cut -c-25 | sed -e 's%.*%[ & ]%')
+}
+
+
+usage() # full help: banner, make example, the allflags option table, legend, copyright
+{
+ echo "${ME}: icu-config: ICU configuration helper script"
+ echo
+ echo "The most commonly used options will be --cflags, --cxxflags, --cppflags, and --ldflags."
+ echo 'Example (in make): CPPFLAGS=$(shell icu-config --cppflags)'
+ echo ' LDFLAGS=$(shell icu-config --ldflags)'
+ echo " (etc).."
+ echo
+ echo "Usage:"
+ allflags
+
+ echo
+ echo " [Brackets] show MAKE variable equivalents, (parenthesis) show example output"
+ echo
+ echo "Copyright (c) 2002, International Business Machines Corporation and others. All Rights Reserved."
+}
+
+## Verify that the ICU common library exists at ${ICUUC_FILE}; otherwise complain on stderr and exit 2.
+sanity()
+{
+ if [ ! -f "${ICUUC_FILE}" ]; # quoted so a path containing whitespace cannot break the test
+ then
+ echo "### $ME: Can't find ${ICUUC_FILE} - ICU prefix is wrong." 1>&2
+ echo "### Try the --prefix= or --exec-prefix= options " 1>&2
+ echo "### or --detect-prefix" 1>&2
+ echo "### $ME: Exitting." 1>&2
+ exit 2
+ fi
+}
+
+## Main starts here: each command-line option is handled in order, one case arm per option.
+
+if [ $# -lt 1 ]; then
+ shortusage
+ exit 1
+fi
+
+
+# Load our variables from pkg-config (icu.pc)
+# ALWAYS load twice because of dependencies
+loaddefs
+loaddefs
+sanity
+
+while [ $# -gt 0 ];
+do
+ arg="$1"
+ var=`echo $arg | sed -e 's/^[^=]*=//'`
+# echo "### processing $arg" 1>&2
+ case "$arg" in
+
+ # undocumented.
+ --debug)
+ set -x
+ ;;
+
+ --so)
+ echo $SO
+ ;;
+
+ --bindir)
+ echo $bindir
+ ;;
+
+ --libdir)
+ echo $libdir
+ ;;
+
+ --exists)
+ sanity
+ ;;
+
+ --sbindir)
+ echo $sbindir
+ ;;
+
+ --invoke=*)
+ QUOT="'"
+ CMD="${var}"
+
+ # If it's not a locally executable command (1st choice) then
+ # search for it in the ICU directories.
+ if [ ! -x ${CMD} ]; then
+ if [ -x ${bindir}/${var} ]; then
+ CMD="${bindir}/${var}"
+ fi
+ if [ -x ${sbindir}/${var} ]; then
+ CMD="${sbindir}/${var}"
+ fi
+ fi
+
+ echo "env ${QUOT}${LDLIBRARYPATH_ENVVAR}=${libdir}:"'${'"${LDLIBRARYPATH_ENVVAR}"'}'${QUOT} ${CMD}
+ ;;
+
+ --invoke)
+ QUOT="'"
+ echo "env ${QUOT}${LDLIBRARYPATH_ENVVAR}=${libdir}:"'${'"${LDLIBRARYPATH_ENVVAR}"'}'${QUOT}
+ ;;
+
+ --cflags)
+ pkg-config --variable=CFLAGS icu
+ ;;
+
+ --cc)
+ echo cc
+ ;;
+
+ --cxx)
+ echo c++
+ ;;
+
+ --cxxflags)
+ pkg-config --variable=CXXFLAGS icu
+ ;;
+
+ --cppflags)
+ # Don't echo the -I. - it's unneeded. (The dot is escaped so only a literal "-I. " is removed.)
+ CPPFLAGS=`pkg-config --variable=CPPFLAGS icu`
+ echo $CPPFLAGS | sed -e 's/-I\. //'
+ ;;
+
+ --cppflags-searchpath)
+ echo -I${prefix}/include
+ ;;
+
+ --cppflags-dynamic)
+ pkg-config --variable=SHAREDLIBCPPFLAGS icu
+ ;;
+
+ --cxxflags-dynamic)
+ pkg-config --variable=SHAREDLIBCXXFLAGS icu
+ ;;
+
+ --cflags-dynamic)
+ pkg-config --variable=SHAREDLIBCFLAGS icu
+ ;;
+
+ --ldflags-system)
+ pkg-config --variable=ICULIBS_SYSTEMLIBS icu
+ ;;
+
+ --ldflags)
+ pkg-config --libs icu
+# $RPATH_LDFLAGS
+ ;;
+
+ --ldflags-libsonly)
+ pkg-config --variable=ICULIBS_LIBSONLY icu
+ ;;
+
+ --ldflags-icuio)
+ pkg-config --variable=ICULIBS_ICUIO icu
+ ;;
+
+ --ldflags-obsolete)
+ pkg-config --variable=ICULIBS_OBSOLETE icu
+ ;;
+
+ --ldflags-toolutil)
+ pkg-config --variable=ICULIBS_TOOLUTIL icu
+ ;;
+
+ --ldflags-layout)
+ pkg-config --variable=ICULIBS_LAYOUT icu
+ ;;
+
+ --ldflags-searchpath)
+ echo -L${libdir}
+ ;;
+
+ --detect-prefix)
+ HERE=`echo $0 | sed -e "s/$ME//g"`
+ if [ -f $HERE/../lib/${ICULIBS_COMMON_LIB_NAME} ]; then
+ prefix=$HERE/..
+ echo "## Using --prefix=${prefix}" 1>&2
+ fi
+ loaddefs
+ loaddefs
+ sanity
+ ;;
+
+ --exec-prefix)
+ echo $exec_prefix
+ ;;
+
+ --prefix)
+ echo $prefix
+ ;;
+
+ --prefix=*)
+ prefix=$var # NOTE(review): loaddefs re-reads libdir etc. from pkg-config, so this only changes --prefix / --cppflags-searchpath output
+ loaddefs
+ loaddefs
+ sanity
+ ;;
+
+ --sysconfdir)
+ echo $sysconfdir
+ ;;
+
+ --mandir)
+ echo $mandir
+ ;;
+
+ --shared-datadir)
+ echo $datadir
+ ;;
+
+ --incfile)
+ echo $pkglibdir/Makefile.inc
+ ;;
+
+ --icudata)
+ pkg-config --variable=ICUDATA_NAME icu
+ ;;
+
+ --icudata-mode)
+ echo $PKGDATA_MODE
+ ;;
+
+ --icudata-install-dir)
+ pkg-config --variable=ICUPKGDATA_DIR icu
+ ;;
+
+ --icudatadir)
+ pkg-config --variable=ICUDATA_DIR icu
+ ;;
+
+ --shlib-c)
+ pkg-config --variable=SHLIB_c icu
+ ;;
+
+ --shlib-cc)
+ pkg-config --variable=SHLIB_cc icu
+ ;;
+
+ --version)
+ echo $VERSION
+ ;;
+
+ --unicode-version)
+ echo $UNICODE_VERSION
+ ;;
+
+ --help)
+ usage
+ exit 0
+ ;;
+
+ --usage)
+ usage
+ exit 0
+ ;;
+
+# --enable-rpath=*)
+# ENABLE_RPATH=$var
+# loaddefs
+# ;;
+
+ -\?) # escaped: a bare ? is a glob and would treat every one-character option as a help request
+ usage
+ exit 0
+ ;;
+
+ *)
+ echo ${ME}: ERROR Unknown Option $arg 1>&2
+ echo 1>&2
+ shortusage 1>&2
+ echo "### $ME: Exitting." 1>&2
+ exit 1;
+ ;;
+ esac
+ shift
+done
+
+# Check once before we quit (will check last used prefix)
+sanity
+## END of icu-config-bottom
+
+exit 0
+
diff --git a/icu.icu5365.dependantvowels.patch b/icu.icu5365.dependantvowels.patch
new file mode 100644
index 0000000..5708018
--- /dev/null
+++ b/icu.icu5365.dependantvowels.patch
@@ -0,0 +1,11 @@
+--- icu/source/layout/IndicReordering.cpp.orig 2006-09-05 17:01:15.000000000 +0100
++++ icu/source/layout/IndicReordering.cpp 2006-09-05 17:01:19.000000000 +0100
+@@ -377,7 +377,7 @@
+ {-1, 6, 1, -1, -1, -1, -1, -1, -1, 5, 9, 5, 5, 4, 12}, // 2 - consonant with nukta
+ {-1, 6, 1, -1, -1, -1, -1, -1, 2, 5, 9, 5, 5, 4, 12}, // 3 - consonant
+ {-1, -1, -1, -1, -1, -1, 3, 2, -1, -1, -1, -1, -1, -1, 7}, // 4 - consonant virama
+- {-1, 6, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, // 5 - dependent vowels
++ {-1, 6, 1, -1, -1, -1, -1, -1, -1, 5, -1, -1, -1, -1, -1}, // 5 - dependent vowels
+ {-1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, // 6 - vowel mark
+ {-1, -1, -1, -1, -1, -1, 3, 2, -1, -1, -1, -1, -1, -1, -1}, // 7 - consonant virama ZWJ, consonant ZWJ virama
+ {-1, 6, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4, -1}, // 8 - independent vowels that can take a virama
diff --git a/icu.icu5418.malayam.patch b/icu.icu5418.malayam.patch
new file mode 100644
index 0000000..03fbe63
--- /dev/null
+++ b/icu.icu5418.malayam.patch
@@ -0,0 +1,39 @@
+--- icu/source/layout/IndicClassTables.cpp.orig 2006-08-23 01:12:40.000000000 +0100
++++ icu/source/layout/IndicClassTables.cpp 2006-09-25 09:06:38.000000000 +0100
+@@ -173,6 +173,19 @@
+ _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx // 0CE0 - 0CEF
+ };
+
++#if 1
++//use the pango char class table here
++static const IndicClassTable::CharClass mlymCharClasses[] =
++{
++ _xx, _xx, _mp, _mp, _xx, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _xx, _iv, _iv, /* 0D00 - 0D0F */
++ _iv, _xx, _iv, _iv, _iv, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, /* 0D10 - 0D1F */
++ _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _xx, _ct, _ct, _ct, _ct, _ct, _pb, /* 0D20 - 0D2F */
++ _pb, _cn, _ct, _ct, _ct, _pb, _ct, _ct, _ct, _ct, _xx, _xx, _xx, _xx, _dr, _dr, /* 0D30 - 0D3F */
++ _dr, _dr, _dr, _dr, _xx, _xx, _dl, _dl, _dl, _xx, _s1, _s2, _s3, _vr, _xx, _xx, /* 0D40 - 0D4F */
++ _xx, _xx, _xx, _xx, _xx, _xx, _xx, _dr, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, /* 0D50 - 0D5F */
++ _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx /* 0D60 - 0D6F */
++};
++#else
+ // FIXME: this is correct for old-style Malayalam (MAL) but not for reformed Malayalam (MLR)
+ // FIXME: should there be a REPH for old-style Malayalam?
+ static const IndicClassTable::CharClass mlymCharClasses[] =
+@@ -185,6 +198,7 @@
+ _xx, _xx, _xx, _xx, _xx, _xx, _xx, _m2, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 0D50 - 0D5F
+ _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx // 0D60 - 0D6F
+ };
++#endif
+
+ static const IndicClassTable::CharClass sinhCharClasses[] =
+ {
+@@ -232,7 +246,7 @@
+ #define TAML_SCRIPT_FLAGS (SF_MPRE_FIXUP | SF_NO_POST_BASE_LIMIT | SF_FILTER_ZERO_WIDTH)
+ #define TELU_SCRIPT_FLAGS (SF_MATRAS_AFTER_BASE | SF_FILTER_ZERO_WIDTH | 3)
+ #define KNDA_SCRIPT_FLAGS (SF_MATRAS_AFTER_BASE | SF_FILTER_ZERO_WIDTH | 3)
+-#define MLYM_SCRIPT_FLAGS (SF_MPRE_FIXUP | SF_NO_POST_BASE_LIMIT | SF_FILTER_ZERO_WIDTH)
++#define MLYM_SCRIPT_FLAGS (SF_MPRE_FIXUP | SF_NO_POST_BASE_LIMIT)
+ #define SINH_SCRIPT_FLAGS (SF_MPRE_FIXUP | SF_NO_POST_BASE_LIMIT)
+
+ //
diff --git a/icu.icu5431.malayam.patch b/icu.icu5431.malayam.patch
new file mode 100644
index 0000000..48a549d
--- /dev/null
+++ b/icu.icu5431.malayam.patch
@@ -0,0 +1,107 @@
+--- icu.orig/source/layout/IndicReordering.cpp 2006-12-21 09:24:42.000000000 +0000
++++ icu/source/layout/IndicReordering.cpp 2006-12-21 09:16:15.000000000 +0000
+@@ -50,6 +50,14 @@
+ #define distFeatureMask 0x00010000UL
+ #define initFeatureMask 0x00008000UL
+
++// TODO: Find better names for these!
++#define tagArray4 (loclFeatureMask | nuktFeatureMask | akhnFeatureMask | vatuFeatureMask | presFeatureMask | blwsFeatureMask | abvsFeatureMask | pstsFeatureMask | halnFeatureMask | blwmFeatureMask | abvmFeatureMask | distFeatureMask)
++#define tagArray3 (pstfFeatureMask | tagArray4)
++#define tagArray2 (halfFeatureMask | tagArray3)
++#define tagArray1 (blwfFeatureMask | tagArray2)
++#define tagArray0 (rphfFeatureMask | tagArray1)
++
++
+ class IndicReorderingOutput : public UMemory {
+ private:
+ le_int32 fOutIndex;
+@@ -154,6 +162,27 @@
+ fSMabove = fSMbelow = 0;
+ }
+
++ void swapChars(int a, int b) // exchanges the output entries at fOutIndex+a and fOutIndex+b (offsets relative to the write position)
++ {
++ LEErrorCode success = LE_NO_ERROR; // passed to every storage call below but never inspected here
++ LEUnicode temp_char;
++ le_uint32 temp_index;
++ FeatureMask temp_tag;
++
++ temp_char = fOutChars[fOutIndex + b]; // save slot b: character, char index and aux data
++ temp_index = fGlyphStorage.getCharIndex(fOutIndex + b, success);
++ temp_tag = fGlyphStorage.getAuxData(fOutIndex + b, success);
++
++ fOutChars[fOutIndex + b] = fOutChars[fOutIndex + a]; // copy slot a's character and char index into slot b
++ le_uint32 toswap = fGlyphStorage.getCharIndex(fOutIndex + a, success);
++ fGlyphStorage.setCharIndex(fOutIndex + b, toswap, success);
++ fGlyphStorage.setAuxData(fOutIndex + b, tagArray3, success); // NOTE(review): slot b gets tagArray3, not slot a's previous aux data - confirm this asymmetry is intended
++
++ fOutChars[fOutIndex + a] = temp_char; // slot a receives everything saved from slot b
++ fGlyphStorage.setCharIndex(fOutIndex + a, temp_index, success);
++ fGlyphStorage.setAuxData(fOutIndex + a, temp_tag, success);
++ }
++
+ void writeChar(LEUnicode ch, le_uint32 charIndex, FeatureMask charFeatures)
+ {
+ LEErrorCode success = LE_NO_ERROR;
+@@ -335,13 +364,6 @@
+ C_DOTTED_CIRCLE = 0x25CC
+ };
+
+-// TODO: Find better names for these!
+-#define tagArray4 (loclFeatureMask | nuktFeatureMask | akhnFeatureMask | vatuFeatureMask | presFeatureMask | blwsFeatureMask | abvsFeatureMask | pstsFeatureMask | halnFeatureMask | blwmFeatureMask | abvmFeatureMask | distFeatureMask)
+-#define tagArray3 (pstfFeatureMask | tagArray4)
+-#define tagArray2 (halfFeatureMask | tagArray3)
+-#define tagArray1 (blwfFeatureMask | tagArray2)
+-#define tagArray0 (rphfFeatureMask | tagArray1)
+-
+ static const FeatureMap featureMap[] =
+ {
+ {loclFeatureTag, loclFeatureMask},
+@@ -629,6 +651,21 @@
+ output.writeChar(chars[i], i, tagArray4);
+ }
+
++ /* for the special conjunction of Cons+0x0d4d+0x0d31 or Cons+0x0d4d+0x0d30 of Malayalam */
++ if ((baseConsonant - 2 >= 0) &&
++ (chars[baseConsonant - 1] == 0x0d4d) &&
++ ((chars[baseConsonant] == 0x0d31) ||
++ (chars[baseConsonant] == 0x0d30)) &&
++ ((chars[baseConsonant - 2] >= 0x0d15) &&
++ (chars[baseConsonant - 2] <= 0x0d39))) {
++ if (baseConsonant < 3 || chars[baseConsonant - 3] != 0x0d4d) {
++ output.swapChars(-1, -3);
++
++ if (mpreFixups)
++ mpreFixups->reduce();
++ }
++ }
++
+ if ((classTable->scriptFlags & SF_MATRAS_AFTER_BASE) != 0) {
+ output.writeMbelow();
+ output.writeSMbelow(); // FIXME: there are no SMs in these scripts...
+--- icu.orig/source/layout/MPreFixups.h 2006-11-10 09:42:47.000000000 +0000
++++ icu/source/layout/MPreFixups.h 2006-12-21 09:13:47.000000000 +0000
+@@ -31,6 +31,8 @@
+
+ void apply(LEGlyphStorage &glyphStorage);
+
++ void reduce();
++
+ private:
+ FixupData *fFixupData;
+ le_int32 fFixupCount;
+--- icu.orig/source/layout/MPreFixups.cpp 2006-11-10 09:42:47.000000000 +0000
++++ icu/source/layout/MPreFixups.cpp 2006-12-21 09:16:33.000000000 +0000
+@@ -40,6 +40,12 @@
+ }
+ }
+
++void MPreFixups::reduce() // decrements fFixupCount, never below zero
++{
++ if (fFixupCount > 0)
++ fFixupCount--;
++}
++
+ void MPreFixups::apply(LEGlyphStorage &glyphStorage)
+ {
+ for (le_int32 fixup = 0; fixup < fFixupCount; fixup += 1) {
diff --git a/icu.icu5433.oriya.patch b/icu.icu5433.oriya.patch
new file mode 100644
index 0000000..f35f5a2
--- /dev/null
+++ b/icu.icu5433.oriya.patch
@@ -0,0 +1,31 @@
+diff -ru icu.orig/source/layout/IndicClassTables.cpp icu/source/layout/IndicClassTables.cpp
+--- icu.orig/source/layout/IndicClassTables.cpp 2006-10-03 14:27:47.000000000 +0100
++++ icu/source/layout/IndicClassTables.cpp 2006-10-03 14:30:07.000000000 +0100
+@@ -120,6 +120,19 @@
+ _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx // 0AE0 - 0AEF
+ };
+
++#if 1
++static const IndicClassTable::CharClass oryaCharClasses[] =
++{
++ _xx, _ma, _mp, _mp, _xx, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _xx, _xx, _iv, /* 0B00 - 0B0F */
++ _iv, _xx, _xx, _iv, _iv, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _ct, _bb, /* 0B10 - 0B1F */
++ _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _xx, _bb, _bb, _bb, _bb, _bb, _pb, /* 0B20 - 0B2F */
++ _rb, _xx, _bb, _bb, _xx, _bb, _bb, _bb, _bb, _bb, _xx, _xx, _nu, _xx, _dr, _da, /* 0B30 - 0B3F */
++ _dr, _db, _db, _db, _xx, _xx, _xx, _dl, _s1, _xx, _xx, _s2, _s3, _vr, _xx, _xx, /* 0B40 - 0B4F */
++ _xx, _xx, _xx, _xx, _xx, _xx, _da, _dr, _xx, _xx, _xx, _xx, _cn, _cn, _xx, _pb, /* 0B50 - 0B5F */
++ _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, /* 0B60 - 0B6F */
++ _xx, _bb /* 0B70 - 0B71 */
++};
++#else
+ static const IndicClassTable::CharClass oryaCharClasses[] =
+ {
+ _xx, _ma, _mp, _mp, _xx, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _xx, _xx, _iv, // 0B00 - 0B0F
+@@ -131,6 +144,7 @@
+ _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 0B60 - 0B6F
+ _xx, _ct // 0B70 - 0B71
+ };
++#endif
+
+ static const IndicClassTable::CharClass tamlCharClasses[] =
+ {
diff --git a/icu.icu5465.telegu.patch b/icu.icu5465.telegu.patch
new file mode 100644
index 0000000..7e80103
--- /dev/null
+++ b/icu.icu5465.telegu.patch
@@ -0,0 +1,29 @@
+--- icu.orig/source/layout/IndicClassTables.cpp 2007-02-05 14:44:17.000000000 +0000
++++ icu/source/layout/IndicClassTables.cpp 2007-02-05 14:47:49.000000000 +0000
+@@ -145,6 +145,7 @@
+ };
+
+ // FIXME: Should some of the bb's be pb's? (KA, NA, MA, YA, VA, etc. (approx 13))
++#if 0
+ static const IndicClassTable::CharClass teluCharClasses[] =
+ {
+ _xx, _mp, _mp, _mp, _xx, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _xx, _iv, _iv, // 0C00 - 0C0F
+@@ -155,6 +156,18 @@
+ _xx, _xx, _xx, _xx, _xx, _da, _m2, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 0C50 - 0C5F
+ _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx // 0C60 - 0C6F
+ };
++#else
++static const IndicClassTable::CharClass teluCharClasses[] =
++{
++ _xx, _mp, _mp, _mp, _xx, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _xx, _iv, _iv, /* 0C00 - 0C0F */
++ _iv, _xx, _iv, _iv, _iv, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, /* 0C10 - 0C1F */
++ _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _xx, _bb, _bb, _bb, _bb, _bb, _bb, /* 0C20 - 0C2F */
++ _bb, _bb, _bb, _bb, _xx, _bb, _bb, _bb, _bb, _bb, _xx, _xx, _xx, _xx, _da, _da, /* 0C30 - 0C3F */
++ _da, _dr, _dr, _dr, _dr, _xx, _da, _da, _s1, _xx, _da, _da, _da, _vr, _xx, _xx, /* 0C40 - 0C4F */
++ _xx, _xx, _xx, _xx, _xx, _da, _db, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, /* 0C50 - 0C5F */
++ _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx /* 0C60 - 0C6F */
++};
++#endif
+
+ // U+CC3 and U+CC4 are _lm here not _dr since the Kannada rendering
+ // rules want them below and to the right of the entire cluster
diff --git a/icu.icu5483.backport.patch b/icu.icu5483.backport.patch
new file mode 100644
index 0000000..039dee2
--- /dev/null
+++ b/icu.icu5483.backport.patch
@@ -0,0 +1,874 @@
+diff -ru icu.orig/source/common/ucnv2022.c icu/source/common/ucnv2022.c
+--- icu.orig/source/common/ucnv2022.c 2009-06-02 11:48:38.000000000 +0100
++++ icu/source/common/ucnv2022.c 2009-06-02 12:30:29.000000000 +0100
+@@ -84,6 +84,26 @@
+ #define V_TAB 0x0B
+ #define SPACE 0x20
+
++enum {
++ HWKANA_START=0xff61,
++ HWKANA_END=0xff9f
++};
++
++/*
++ * 94-character sets with native byte values A1..FE are encoded in ISO 2022
++ * as bytes 21..7E. (Subtract 0x80.)
++ * 96-character sets with native byte values A0..FF are encoded in ISO 2022
++ * as bytes 20..7F. (Subtract 0x80.)
++ * Do not encode C1 control codes with native bytes 80..9F
++ * as bytes 00..1F (C0 control codes).
++ */
++enum {
++ GR94_START=0xa1,
++ GR94_END=0xfe,
++ GR96_START=0xa0,
++ GR96_END=0xff
++};
++
+ /*
+ * ISO 2022 control codes must not be converted from Unicode
+ * because they would mess up the byte stream.
+@@ -981,22 +1001,27 @@
+
+
+ /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
+- * any future change in _MBCSFromUChar32() function should be reflected in
+- * this macro
++ * any future change in _MBCSFromUChar32() function should be reflected here.
++ * @return number of bytes in *value; negative number if fallback; 0 if no mapping
+ */
+-static U_INLINE void
++static U_INLINE int32_t
+ MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
+ UChar32 c,
+ uint32_t* value,
+ UBool useFallback,
+- int32_t *length,
+ int outputType)
+ {
+ const int32_t *cx;
+ const uint16_t *table;
+ uint32_t stage2Entry;
+ uint32_t myValue;
++ int32_t length;
+ const uint8_t *p;
++ /*
++ * TODO(markus): Use and require new, faster MBCS conversion table structures.
++ * Use internal version of ucnv_open() that verifies that the new structures are available,
++ * else U_INTERNAL_PROGRAM_ERROR.
++ */
+ /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
+ if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
+ table=sharedData->mbcs.fromUnicodeTable;
+@@ -1005,51 +1030,60 @@
+ if(outputType==MBCS_OUTPUT_2){
+ myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
+ if(myValue<=0xff) {
+- *length=1;
++ length=1;
+ } else {
+- *length=2;
++ length=2;
+ }
+ } else /* outputType==MBCS_OUTPUT_3 */ {
+ p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
+ myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
+ if(myValue<=0xff) {
+- *length=1;
++ length=1;
+ } else if(myValue<=0xffff) {
+- *length=2;
++ length=2;
+ } else {
+- *length=3;
++ length=3;
+ }
+ }
++ /*
++ * TODO(markus): Use Shift-JIS table for JIS X 0208, to save mapping table space.
++ * Pass in parameter for type of output bytes, for validation and shifting:
++ * - Direct: Pass bytes through, but forbid control codes 00-1F (except SI/SO/ESC) and space 20?
++ * (Need to allow some (TAB/LF/CR) or most of them for ASCII and maybe JIS X 0201.)
++ * - A1-FE: Subtract 80 after range check.
++ * - SJIS: Shift DBCS result to 21-7E x 21-7E.
++ */
+ /* is this code point assigned, or do we use fallbacks? */
+- if( (stage2Entry&(1<<(16+(c&0xf))))!=0 ||
+- (FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0)
+- ) {
++ if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
++ /* assigned */
++ *value=myValue;
++ return length;
++ } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
+ /*
+ * We allow a 0 byte output if the "assigned" bit is set for this entry.
+ * There is no way with this data structure for fallback output
+ * to be a zero byte.
+ */
+- /* assigned */
+ *value=myValue;
+- return;
++ return -length;
+ }
+ }
+
+ cx=sharedData->mbcs.extIndexes;
+ if(cx!=NULL) {
+- *length=ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
+- return;
++ return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
+ }
+
+ /* unassigned */
+- *length=0;
++ return 0;
+ }
+
+ /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
+- * any future change in _MBCSSingleFromUChar32() function should be reflected in
+- * this macro
++ * any future change in _MBCSSingleFromUChar32() function should be reflected here.
++ * @param retval pointer to output byte
++ * @return 1 roundtrip byte 0 no mapping -1 fallback byte
+ */
+-static U_INLINE void
++static U_INLINE int32_t
+ MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
+ UChar32 c,
+ uint32_t* retval,
+@@ -1059,20 +1093,21 @@
+ int32_t value;
+ /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
+ if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
+- *retval=(uint16_t)-1;
+- return;
++ return 0;
+ }
+ /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
+ table=sharedData->mbcs.fromUnicodeTable;
+ /* get the byte for the output */
+ value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
+ /* is this code point assigned, or do we use fallbacks? */
+- if(useFallback ? value>=0x800 : value>=0xc00) {
+- value &=0xff;
++ *retval=(uint32_t)(value&0xff);
++ if(value>=0xf00) {
++ return 1; /* roundtrip */
++ } else if(useFallback ? value>=0x800 : value>=0xc00) {
++ return -1; /* fallback taken */
+ } else {
+- value= -1;
++ return 0; /* no mapping */
+ }
+- *retval=(uint16_t) value;
+ }
+
+ #ifdef U_ENABLE_GENERIC_ISO_2022
+@@ -1316,6 +1351,7 @@
+
+ static void
+ UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
++ UConverter *cnv = args->converter;
+ UConverterDataISO2022 *converterData;
+ ISO2022State *pFromU2022State;
+ uint8_t *target = (uint8_t *) args->target;
+@@ -1335,14 +1371,13 @@
+ int8_t cs, g;
+
+ /* set up the state */
+- converterData = (UConverterDataISO2022*)args->converter->extraInfo;
++ converterData = (UConverterDataISO2022*)cnv->extraInfo;
+ pFromU2022State = &converterData->fromU2022State;
+- useFallback = args->converter->useFallback;
+
+ choiceCount = 0;
+
+ /* check if the last codepoint of previous buffer was a lead surrogate*/
+- if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) {
++ if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
+ goto getTrail;
+ }
+
+@@ -1361,26 +1396,26 @@
+ if(UTF_IS_SECOND_SURROGATE(trail)) {
+ source++;
+ sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
+- args->converter->fromUChar32=0x00;
++ cnv->fromUChar32=0x00;
+ /* convert this supplementary code point */
+ /* exit this condition tree */
+ } else {
+ /* this is an unmatched lead code unit (1st surrogate) */
+ /* callback(illegal) */
+ *err=U_ILLEGAL_CHAR_FOUND;
+- args->converter->fromUChar32=sourceChar;
++ cnv->fromUChar32=sourceChar;
+ break;
+ }
+ } else {
+ /* no more input */
+- args->converter->fromUChar32=sourceChar;
++ cnv->fromUChar32=sourceChar;
+ break;
+ }
+ } else {
+ /* this is an unmatched trail code unit (2nd surrogate) */
+ /* callback(illegal) */
+ *err=U_ILLEGAL_CHAR_FOUND;
+- args->converter->fromUChar32=sourceChar;
++ cnv->fromUChar32=sourceChar;
+ break;
+ }
+ }
+@@ -1389,7 +1424,7 @@
+ if(IS_2022_CONTROL(sourceChar)) {
+ /* callback(illegal) */
+ *err=U_ILLEGAL_CHAR_FOUND;
+- args->converter->fromUChar32=sourceChar;
++ cnv->fromUChar32=sourceChar;
+ break;
+ }
+
+@@ -1407,9 +1442,10 @@
+
+ /* JIS7/8: try single-byte half-width Katakana before JISX208 */
+ if(converterData->version == 3 || converterData->version == 4) {
+- choices[choiceCount++] = cs = (int8_t)HWKANA_7BIT;
+- csm &= ~CSM(cs);
++ choices[choiceCount++] = (int8_t)HWKANA_7BIT;
+ }
++ /* Do not try single-byte half-width Katakana for other versions. */
++ csm &= ~CSM(HWKANA_7BIT);
+
+ /* try the current G0 charset */
+ choices[choiceCount++] = cs = pFromU2022State->cs[0];
+@@ -1432,86 +1468,134 @@
+ }
+
+ cs = g = 0;
++ /*
++ * len==0: no mapping found yet
++ * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
++ * len>0: found a roundtrip result, done
++ */
+ len = 0;
++ /*
++ * We will turn off useFallback after finding a fallback,
++ * but we still get fallbacks from PUA code points as usual.
++ * Therefore, we will also need to check that we don't overwrite
++ * an early fallback with a later one.
++ */
++ useFallback = cnv->useFallback;
+
+- for(i = 0; i < choiceCount && len == 0; ++i) {
+- cs = choices[i];
+- switch(cs) {
++ for(i = 0; i < choiceCount && len <= 0; ++i) {
++ uint32_t value;
++ int32_t len2;
++ int8_t cs0 = choices[i];
++ switch(cs0) {
+ case ASCII:
+ if(sourceChar <= 0x7f) {
+ targetValue = (uint32_t)sourceChar;
+ len = 1;
++ cs = cs0;
++ g = 0;
+ }
+ break;
+ case ISO8859_1:
+- if(0x80 <= sourceChar && sourceChar <= 0xff) {
++ if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
+ targetValue = (uint32_t)sourceChar - 0x80;
+ len = 1;
++ cs = cs0;
+ g = 2;
+ }
+ break;
+ case HWKANA_7BIT:
+- if((uint32_t)(0xff9f-sourceChar)<=(0xff9f-0xff61)) {
+- targetValue = (uint32_t)(sourceChar - (0xff61 - 0x21));
+- len = 1;
+-
++ if((uint32_t)(HWKANA_END-sourceChar)<=(HWKANA_END-HWKANA_START)) {
+ if(converterData->version==3) {
+ /* JIS7: use G1 (SO) */
+- pFromU2022State->cs[1] = cs; /* do not output an escape sequence */
++ /* Shift U+FF61..U+FF9F to bytes 21..5F. */
++ targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
++ len = 1;
++ pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
+ g = 1;
+ } else if(converterData->version==4) {
+ /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
+- int8_t cs0;
+-
+- targetValue += 0x80;
++ /* Shift U+FF61..U+FF9F to bytes A1..DF. */
++ targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
++ len = 1;
+
+- cs0 = pFromU2022State->cs[0];
+- if(IS_JP_DBCS(cs0)) {
++ cs = pFromU2022State->cs[0];
++ if(IS_JP_DBCS(cs)) {
+ /* switch from a DBCS charset to JISX201 */
+ cs = (int8_t)JISX201;
+- } else {
+- /* stay in the current G0 charset */
+- cs = cs0;
+ }
++ /* else stay in the current G0 charset */
++ g = 0;
+ }
++ /* else do not use HWKANA_7BIT with other versions */
+ }
+ break;
+ case JISX201:
+ /* G0 SBCS */
+- MBCS_SINGLE_FROM_UCHAR32(
+- converterData->myConverterArray[cs],
+- sourceChar, &targetValue,
+- useFallback);
+- if(targetValue <= 0x7f) {
+- len = 1;
++ len2 = MBCS_SINGLE_FROM_UCHAR32(
++ converterData->myConverterArray[cs0],
++ sourceChar, &value,
++ useFallback);
++ if(len2 != 0 && !(len2 < 0 && len != 0) && value <= 0x7f) {
++ targetValue = value;
++ len = len2;
++ cs = cs0;
++ g = 0;
++ useFallback = FALSE;
+ }
+ break;
+ case ISO8859_7:
+ /* G0 SBCS forced to 7-bit output */
+- MBCS_SINGLE_FROM_UCHAR32(
+- converterData->myConverterArray[cs],
+- sourceChar, &targetValue,
+- useFallback);
+- if(0x80 <= targetValue && targetValue <= 0xff) {
+- targetValue -= 0x80;
+- len = 1;
++ len2 = MBCS_SINGLE_FROM_UCHAR32(
++ converterData->myConverterArray[cs0],
++ sourceChar, &value,
++ useFallback);
++ if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
++ targetValue = value - 0x80;
++ len = len2;
++ cs = cs0;
+ g = 2;
++ useFallback = FALSE;
+ }
+ break;
+ default:
+ /* G0 DBCS */
+- MBCS_FROM_UCHAR32_ISO2022(
+- converterData->myConverterArray[cs],
+- sourceChar, &targetValue,
+- useFallback, &len, MBCS_OUTPUT_2);
+- if(len != 2) {
+- len = 0;
++ len2 = MBCS_FROM_UCHAR32_ISO2022(
++ converterData->myConverterArray[cs0],
++ sourceChar, &value,
++ useFallback, MBCS_OUTPUT_2);
++ if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
++ if(cs0 == KSC5601) {
++ /*
++ * Check for valid bytes for the encoding scheme.
++ * This is necessary because the sub-converter (windows-949)
++ * has a broader encoding scheme than is valid for 2022.
++ *
++ * Check that the result is a 2-byte value with each byte in the range A1..FE
++ * (strict EUC-KR DBCS) before accepting it and subtracting 0x80 from each byte
++ * to move it to the ISO 2022 range 21..7E.
++ */
++ if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
++ (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
++ ) {
++ value -= 0x8080; /* shift down to 21..7e byte range */
++ } else {
++ break; /* not valid for ISO 2022 */
++ }
++ }
++ targetValue = value;
++ len = len2;
++ cs = cs0;
++ g = 0;
++ useFallback = FALSE;
+ }
+ break;
+ }
+ }
+
+- if(len > 0) {
++ if(len != 0) {
++ if(len < 0) {
++ len = -len; /* fallback */
++ }
+ outLen = 0; /* count output bytes */
+
+ /* write SI if necessary (only for JIS7) */
+@@ -1560,7 +1644,7 @@
+ * then this is an error
+ */
+ *err = U_INVALID_CHAR_FOUND;
+- args->converter->fromUChar32=sourceChar;
++ cnv->fromUChar32=sourceChar;
+ break;
+ }
+
+@@ -1586,7 +1670,7 @@
+ }
+ } else {
+ fromUWriteUInt8(
+- args->converter,
++ cnv,
+ buffer, outLen,
+ &target, (const char *)targetLimit,
+ &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
+@@ -1615,7 +1699,7 @@
+ */
+ if( U_SUCCESS(*err) &&
+ (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
+- args->flush && source>=sourceLimit && args->converter->fromUChar32==0
++ args->flush && source>=sourceLimit && cnv->fromUChar32==0
+ ) {
+ int32_t sourceIndex;
+
+@@ -1654,7 +1738,7 @@
+ }
+
+ fromUWriteUInt8(
+- args->converter,
++ cnv,
+ buffer, outLen,
+ &target, (const char *)targetLimit,
+ &offsets, sourceIndex,
+@@ -1777,7 +1861,7 @@
+ !IS_JP_DBCS(cs)
+ ) {
+ /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
+- targetUniChar = mySourceChar + (0xff61 - 0xa1);
++ targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
+
+ /* return from a single-shift state to the previous one */
+ if(pToU2022State->g >= 2) {
+@@ -1818,7 +1902,7 @@
+ case HWKANA_7BIT:
+ if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
+ /* 7-bit halfwidth Katakana */
+- targetUniChar = mySourceChar + (0xff61 - 0x21);
++ targetUniChar = mySourceChar + (HWKANA_START - 0x21);
+ }
+ break;
+ default:
+@@ -1965,9 +2049,10 @@
+ break;
+ }
+
+- /* length= ucnv_MBCSFromUChar32(converterData->currentConverter->sharedData,
+- sourceChar,&targetByteUnit,args->converter->useFallback);*/
+- MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,&length,MBCS_OUTPUT_2);
++ length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
++ if(length < 0) {
++ length = -length; /* fallback */
++ }
+ /* only DBCS or SBCS characters are expected*/
+ /* DB characters with high bit set to 1 are expected */
+ if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){
+@@ -2449,7 +2534,7 @@
+
+ static void
+ UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
+-
++ UConverter *cnv = args->converter;
+ UConverterDataISO2022 *converterData;
+ ISO2022State *pFromU2022State;
+ uint8_t *target = (uint8_t *) args->target;
+@@ -2466,14 +2551,13 @@
+ UBool useFallback;
+
+ /* set up the state */
+- converterData = (UConverterDataISO2022*)args->converter->extraInfo;
++ converterData = (UConverterDataISO2022*)cnv->extraInfo;
+ pFromU2022State = &converterData->fromU2022State;
+- useFallback = args->converter->useFallback;
+
+ choiceCount = 0;
+
+ /* check if the last codepoint of previous buffer was a lead surrogate*/
+- if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) {
++ if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
+ goto getTrail;
+ }
+
+@@ -2492,26 +2576,26 @@
+ if(UTF_IS_SECOND_SURROGATE(trail)) {
+ source++;
+ sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
+- args->converter->fromUChar32=0x00;
++ cnv->fromUChar32=0x00;
+ /* convert this supplementary code point */
+ /* exit this condition tree */
+ } else {
+ /* this is an unmatched lead code unit (1st surrogate) */
+ /* callback(illegal) */
+ *err=U_ILLEGAL_CHAR_FOUND;
+- args->converter->fromUChar32=sourceChar;
++ cnv->fromUChar32=sourceChar;
+ break;
+ }
+ } else {
+ /* no more input */
+- args->converter->fromUChar32=sourceChar;
++ cnv->fromUChar32=sourceChar;
+ break;
+ }
+ } else {
+ /* this is an unmatched trail code unit (2nd surrogate) */
+ /* callback(illegal) */
+ *err=U_ILLEGAL_CHAR_FOUND;
+- args->converter->fromUChar32=sourceChar;
++ cnv->fromUChar32=sourceChar;
+ break;
+ }
+ }
+@@ -2522,7 +2606,7 @@
+ if(IS_2022_CONTROL(sourceChar)) {
+ /* callback(illegal) */
+ *err=U_ILLEGAL_CHAR_FOUND;
+- args->converter->fromUChar32=sourceChar;
++ cnv->fromUChar32=sourceChar;
+ break;
+ }
+
+@@ -2545,7 +2629,6 @@
+ }
+ else{
+ /* convert U+0080..U+10ffff */
+- UConverterSharedData *cnv;
+ int32_t i;
+ int8_t cs, g;
+
+@@ -2593,17 +2676,41 @@
+ }
+
+ cs = g = 0;
++ /*
++ * len==0: no mapping found yet
++ * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
++ * len>0: found a roundtrip result, done
++ */
+ len = 0;
++ /*
++ * We will turn off useFallback after finding a fallback,
++ * but we still get fallbacks from PUA code points as usual.
++ * Therefore, we will also need to check that we don't overwrite
++ * an early fallback with a later one.
++ */
++ useFallback = cnv->useFallback;
+
+- for(i = 0; i < choiceCount && len == 0; ++i) {
+- cs = choices[i];
+- if(cs > 0) {
+- if(cs > CNS_11643_0) {
+- cnv = converterData->myConverterArray[CNS_11643];
+- MBCS_FROM_UCHAR32_ISO2022(cnv,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_3);
+- if(len==3) {
+- cs = (int8_t)(CNS_11643_0 + (targetValue >> 16) - 0x80);
+- len = 2;
++ for(i = 0; i < choiceCount && len <= 0; ++i) {
++ int8_t cs0 = choices[i];
++ if(cs0 > 0) {
++ uint32_t value;
++ int32_t len2;
++ if(cs0 > CNS_11643_0) {
++ len2 = MBCS_FROM_UCHAR32_ISO2022(
++ converterData->myConverterArray[CNS_11643],
++ sourceChar,
++ &value,
++ useFallback,
++ MBCS_OUTPUT_3);
++ if(len2 == 3 || (len2 == -3 && len == 0)) {
++ targetValue = value;
++ cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
++ if(len2 >= 0) {
++ len = 2;
++ } else {
++ len = -2;
++ useFallback = FALSE;
++ }
+ if(cs == CNS_11643_1) {
+ g = 1;
+ } else if(cs == CNS_11643_2) {
+@@ -2617,15 +2724,25 @@
+ }
+ } else {
+ /* GB2312_1 or ISO-IR-165 */
+- cnv = converterData->myConverterArray[cs];
+- MBCS_FROM_UCHAR32_ISO2022(cnv,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_2);
+- g = 1; /* used if len == 2 */
++ len2 = MBCS_FROM_UCHAR32_ISO2022(
++ converterData->myConverterArray[cs0],
++ sourceChar,
++ &value,
++ useFallback,
++ MBCS_OUTPUT_2);
++ if(len2 == 2 || (len2 == -2 && len == 0)) {
++ targetValue = value;
++ len = len2;
++ cs = cs0;
++ g = 1;
++ useFallback = FALSE;
++ }
+ }
+ }
+ }
+
+- if(len > 0) {
+- len = 0; /* count output bytes; it must have been len == 2 */
++ if(len != 0) {
++ len = 0; /* count output bytes; it must have been abs(len) == 2 */
+
+ /* write the designation sequence if necessary */
+ if(cs != pFromU2022State->cs[g]) {
+@@ -2670,7 +2787,7 @@
+ * then this is an error
+ */
+ *err = U_INVALID_CHAR_FOUND;
+- args->converter->fromUChar32=sourceChar;
++ cnv->fromUChar32=sourceChar;
+ break;
+ }
+ }
+@@ -2691,7 +2808,7 @@
+ }
+ } else {
+ fromUWriteUInt8(
+- args->converter,
++ cnv,
+ buffer, len,
+ &target, (const char *)targetLimit,
+ &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
+@@ -2720,7 +2837,7 @@
+ */
+ if( U_SUCCESS(*err) &&
+ pFromU2022State->g!=0 &&
+- args->flush && source>=sourceLimit && args->converter->fromUChar32==0
++ args->flush && source>=sourceLimit && cnv->fromUChar32==0
+ ) {
+ int32_t sourceIndex;
+
+@@ -2748,7 +2865,7 @@
+ }
+
+ fromUWriteUInt8(
+- args->converter,
++ cnv,
+ SHIFT_IN_STR, 1,
+ &target, (const char *)targetLimit,
+ &offsets, sourceIndex,
+@@ -3146,7 +3263,7 @@
+ }
+ if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {
+ /* include half-width Katakana for JP */
+- sa->addRange(sa->set, 0xff61, 0xff9f);
++ sa->addRange(sa->set, HWKANA_START, HWKANA_END);
+ }
+ break;
+ case 'c':
+diff -ru icu.orig/source/common/ucnv_ext.c icu/source/common/ucnv_ext.c
+--- icu.orig/source/common/ucnv_ext.c 2009-06-02 11:48:38.000000000 +0100
++++ icu/source/common/ucnv_ext.c 2009-06-02 12:14:20.000000000 +0100
+@@ -551,6 +551,12 @@
+ return 0;
+ }
+
++ /*
++ * Tests for (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0:
++ * Do not interpret values with reserved bits used, for forward compatibility,
++ * and do not even remember intermediate results with reserved bits used.
++ */
++
+ if(UCNV_EXT_TO_U_IS_PARTIAL(value)) {
+ /* partial match, enter the loop below */
+ index=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value);
+@@ -575,7 +581,8 @@
+ value=*fromUSectionValues++;
+ if( value!=0 &&
+ (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
+- FROM_U_USE_FALLBACK(useFallback, firstCP))
++ FROM_U_USE_FALLBACK(useFallback, firstCP)) &&
++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0
+ ) {
+ /* remember longest match so far */
+ matchValue=value;
+@@ -613,8 +620,9 @@
+ /* partial match, continue */
+ index=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value);
+ } else {
+- if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
+- FROM_U_USE_FALLBACK(useFallback, firstCP)
++ if( (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
++ FROM_U_USE_FALLBACK(useFallback, firstCP)) &&
++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0
+ ) {
+ /* full match, stop with result */
+ matchValue=value;
+@@ -632,8 +640,9 @@
+ return 0;
+ }
+ } else /* result from firstCP trie lookup */ {
+- if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
+- FROM_U_USE_FALLBACK(useFallback, firstCP)
++ if( (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
++ FROM_U_USE_FALLBACK(useFallback, firstCP)) &&
++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0
+ ) {
+ /* full match, stop with result */
+ matchValue=value;
+@@ -644,20 +653,18 @@
+ }
+ }
+
+- if(matchValue&UCNV_EXT_FROM_U_RESERVED_MASK) {
+- /* do not interpret values with reserved bits used, for forward compatibility */
+- return 0;
+- }
+-
+ /* return result */
+ if(matchValue==UCNV_EXT_FROM_U_SUBCHAR1) {
+ return 1; /* assert matchLength==2 */
+ }
+
+- *pMatchValue=UCNV_EXT_FROM_U_MASK_ROUNDTRIP(matchValue);
++ *pMatchValue=matchValue;
+ return matchLength;
+ }
+
++/*
++ * @param value fromUnicode mapping table value; ignores roundtrip and reserved bits
++ */
+ static U_INLINE void
+ ucnv_extWriteFromU(UConverter *cnv, const int32_t *cx,
+ uint32_t value,
+@@ -792,6 +799,10 @@
+ }
+ }
+
++/*
++ * Used by ISO 2022 implementation.
++ * @return number of bytes in *pValue; negative number if fallback; 0 for no mapping
++ */
+ U_CFUNC int32_t
+ ucnv_extSimpleMatchFromU(const int32_t *cx,
+ UChar32 cp, uint32_t *pValue,
+@@ -809,13 +820,15 @@
+ if(match>=2) {
+ /* write result for simple, single-character conversion */
+ int32_t length;
+-
++ int isRoundtrip;
++
++ isRoundtrip=UCNV_EXT_FROM_U_IS_ROUNDTRIP(value);
+ length=UCNV_EXT_FROM_U_GET_LENGTH(value);
+ value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value);
+
+ if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) {
+ *pValue=value;
+- return length;
++ return isRoundtrip ? length : -length;
+ #if 0 /* not currently used */
+ } else if(length==4) {
+ /* de-serialize a 4-byte result */
+@@ -825,7 +838,7 @@
+ ((uint32_t)result[1]<<16)|
+ ((uint32_t)result[2]<<8)|
+ result[3];
+- return 4;
++ return isRoundtrip ? 4 : -4;
+ #endif
+ }
+ }
+diff -ru icu.orig/source/common/ucnv_ext.h icu/source/common/ucnv_ext.h
+--- icu.orig/source/common/ucnv_ext.h 2009-06-02 11:48:38.000000000 +0100
++++ icu/source/common/ucnv_ext.h 2009-06-02 12:14:20.000000000 +0100
+@@ -452,7 +452,7 @@
+ #define UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) (((value)&UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)!=0)
+ #define UCNV_EXT_FROM_U_MASK_ROUNDTRIP(value) ((value)&~UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)
+
+-/* use after masking off the roundtrip flag */
++/* get length; masks away all other bits */
+ #define UCNV_EXT_FROM_U_GET_LENGTH(value) (int32_t)(((value)>>UCNV_EXT_FROM_U_LENGTH_SHIFT)&UCNV_EXT_MAX_BYTES)
+
+ /* get bytes or bytes index */
+diff -ru icu.orig/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c
+--- icu.orig/source/common/ucnvmbcs.c 2009-06-02 11:48:38.000000000 +0100
++++ icu/source/common/ucnvmbcs.c 2009-06-02 12:14:20.000000000 +0100
+@@ -3785,7 +3785,8 @@
+
+ cx=sharedData->mbcs.extIndexes;
+ if(cx!=NULL) {
+- return ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback);
++ length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback);
++ return length>=0 ? length : -length; /* return abs(length); */
+ }
+
+ /* unassigned */
+diff -ru icu.orig/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt
+--- icu.orig/source/test/testdata/conversion.txt 2009-06-02 11:48:26.000000000 +0100
++++ icu/source/test/testdata/conversion.txt 2009-06-02 12:14:20.000000000 +0100
+@@ -495,6 +495,46 @@
+ }
+ { "UTF-16BE", :bin{ 00 }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ 00 } }
+ { "UTF-16BE", :bin{ d800dc }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ d800dc } }
++ // Verify that mappings that would result in byte values outside 20..7F (for SBCS)
++ // or 21..7E (for DBCS) are not used.
++ // ibm-9005_X110-2007.ucm (ISO 8859-7, <ESC>.F=1b2e46):
++ // <U009F> \x9F |0 (also in ISO 8859-1)
++ // <U0387> \xB7 |1
++ // windows-949-2000 (KSC_5601, <ESC>$(C=1b242843):
++ // <UC829> \xA0\xA1 |0
++ // <UD4FE> \xC0\x41 |0
++ // <UD79D> \xC8\xFE |0
++ {
++ "JIS8", // =ISO_2022,locale=ja,version=4
++ "\u009f\u0387\uc829\ud4fe\ud79d",
++ :bin{ 1a1b2e461b4e371a1a1b242843487e1b2842 },
++ :intvector{ 0,1,1,1,1,1,1,2,3,4,4,4,4,4,4,4,4,4 },
++ :int{1}, :int{1}, "", "?", ""
++ }
++ // Ticket 5483: ISO 2022 converter incorrectly using fallback mapping
++ // Verify that a roundtrip mapping is used even when a fallback mapping is
++ // available in the current state.
++ // U+FF61 is handled in code
++ // jisx-208.ucm (<ESC>$B=1b2442):
++ // <U30FE> \x21\x34 |0
++ // <UFF5D> \x21\x51 |0 and
++ // ibm-897_P100-1995.ucm (JIS X 0201, <ESC>(J=1b284a):
++ // <UFF5D> \x7D |1
++ // ibm-9005_X110-2007.ucm (ISO 8859-7, <ESC>.F=1b2e46):
++ // <U03D5> \xF6 |1
++ // <U2015> \xAF |0
++ // <UFF5D> \x7D |1 (not legal for ISO 2022)
++ // windows-949-2000 (KSC_5601, <ESC>$(C=1b242843):
++ // <UAC00> \xB0\xA1 |0
++ // <UFF5D> \xA3\xFD |0
++ // <U223C> \xA1\xAD |0 (in extension table)
++ {
++ "JIS8", // =ISO_2022,locale=ja,version=4
++ "a\uff61\u03d5\uff5d\uac00\u223c\uff5d\u30fe\uff5d", // Make it switch to ISO-8859-7, KSC 5601 and JIS X 0208.
++ :bin{ 61a11b2e461b4e761b244221511b2428433021212d237d1b2442213421511b2842 },
++ :intvector{ 0,1,2,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,4,5,5,6,6,7,7,7,7,7,8,8,8,8,8 },
++ :int{1}, :int{1}, "", "?", ""
++ }
+
+ // e4b8 is a partial sequence
+ { "UTF-8", :bin{ 31e4ba8ce4b8 }, "1\u4e8c", :intvector{ 0, 1 }, :int{1}, :int{0}, "truncated", ".", :bin{ e4b8 } }
diff --git a/icu.icu5488.assamese.patch b/icu.icu5488.assamese.patch
new file mode 100644
index 0000000..8b5d773
--- /dev/null
+++ b/icu.icu5488.assamese.patch
@@ -0,0 +1,11 @@
+--- icu.orig/source/layout/IndicClassTables.cpp 2006-10-18 09:05:20.000000000 +0100
++++ icu/source/layout/IndicClassTables.cpp 2006-11-01 09:26:58.000000000 +0000
+@@ -94,7 +94,7 @@
+ _dr, _db, _db, _db, _db, _xx, _xx, _l1, _dl, _xx, _xx, _s1, _s2, _vr, _xx, _xx, // 09C0 - 09CF
+ _xx, _xx, _xx, _xx, _xx, _xx, _xx, _m2, _xx, _xx, _xx, _xx, _cn, _cn, _xx, _cn, // 09D0 - 09DF
+ _iv, _iv, _dv, _dv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 09E0 - 09EF
+- _ct, _ct, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx // 09F0 - 09FA
++ _rv, _ct, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx /* 09F0 - 09FA */
+ };
+
+ static const IndicClassTable::CharClass punjCharClasses[] =
diff --git a/icu.icu5500.devicetablecrash.patch b/icu.icu5500.devicetablecrash.patch
new file mode 100644
index 0000000..16ea5b7
--- /dev/null
+++ b/icu.icu5500.devicetablecrash.patch
@@ -0,0 +1,11 @@
+--- icu.orig/source/layout/DeviceTables.cpp 2006-10-18 09:05:20.000000000 +0100
++++ icu/source/layout/DeviceTables.cpp 2006-11-08 09:08:09.000000000 +0000
+@@ -22,7 +22,7 @@
+ le_uint16 format = SWAPW(deltaFormat) - 1;
+ le_int16 result = 0;
+
+- if (ppem >= start && ppem <= SWAPW(endSize)) {
++ if (ppem >= start && ppem <= SWAPW(endSize) && format < sizeof(fieldBits)/sizeof(fieldBits[0])) {
+ le_uint16 sizeIndex = ppem - start;
+ le_uint16 bits = fieldBits[format];
+ le_uint16 count = 16 / bits;
diff --git a/icu.icu5501.sinhala.biggerexpand.patch b/icu.icu5501.sinhala.biggerexpand.patch
new file mode 100644
index 0000000..6013780
--- /dev/null
+++ b/icu.icu5501.sinhala.biggerexpand.patch
@@ -0,0 +1,11 @@
+--- icu.orig/source/layout/IndicClassTables.cpp 2006-10-18 09:05:20.000000000 +0100
++++ icu/source/layout/IndicClassTables.cpp 2006-11-08 11:20:55.000000000 +0000
+@@ -284,7 +284,7 @@
+
+ static const IndicClassTable mlymClassTable = {0x0D00, 0x0D6F, 3, MLYM_SCRIPT_FLAGS, mlymCharClasses, mlymSplitTable};
+
+-static const IndicClassTable sinhClassTable = {0x0D80, 0x0DF4, 3, SINH_SCRIPT_FLAGS, sinhCharClasses, sinhSplitTable};
++static const IndicClassTable sinhClassTable = {0x0D80, 0x0DF4, 4, SINH_SCRIPT_FLAGS, sinhCharClasses, sinhSplitTable};
+
+ //
+ // IndicClassTable addresses
diff --git a/icu.icu5506.multiplevowels.patch b/icu.icu5506.multiplevowels.patch
new file mode 100644
index 0000000..a58ec64
--- /dev/null
+++ b/icu.icu5506.multiplevowels.patch
@@ -0,0 +1,61 @@
+diff -ur icu.orig/source/layout/IndicReordering.cpp icu/source/layout/IndicReordering.cpp
+--- icu.orig/source/layout/IndicReordering.cpp 2006-11-10 09:42:44.000000000 +0000
++++ icu/source/layout/IndicReordering.cpp 2006-11-10 09:47:05.000000000 +0000
+@@ -395,7 +395,7 @@
+ {-1, 6, 1, -1, -1, -1, -1, -1, -1, 5, 9, 5, 5, 4, 12}, // 2 - consonant with nukta
+ {-1, 6, 1, -1, -1, -1, -1, -1, 2, 5, 9, 5, 5, 4, 12}, // 3 - consonant
+ {-1, -1, -1, -1, -1, -1, 3, 2, -1, -1, -1, -1, -1, -1, 7}, // 4 - consonant virama
+- {-1, 6, 1, -1, -1, -1, -1, -1, -1, 5, -1, -1, -1, -1, -1}, // 5 - dependent vowels
++ {-1, 6, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, // 5 - dependent vowels
+ {-1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, // 6 - vowel mark
+ {-1, -1, -1, -1, -1, -1, 3, 2, -1, -1, -1, -1, -1, -1, -1}, // 7 - consonant virama ZWJ, consonant ZWJ virama
+ {-1, 6, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4, -1}, // 8 - independent vowels that can take a virama
+@@ -423,6 +423,48 @@
+
+ state = stateTable[state][charClass & CF_CLASS_MASK];
+
++ /*for the components of split matra*/
++ if ((charCount >= cursor + 3) &&
++ (chars[cursor] == 0x0DD9 && chars[cursor + 1] == 0x0DCF && chars[cursor + 2] == 0x0DCA)) { /*for 3 split matra of Sinhala*/
++ return cursor + 3;
++ }
++ else if ((charCount >= cursor + 3) &&
++ (chars[cursor] == 0x0CC6 && chars[cursor + 1] == 0x0CC2 && chars[cursor + 2] == 0x0CD5)) { /*for 3 split matra of Kannada*/
++ return cursor + 3;
++ }
++ /*for 2 split matra*/
++ else if (charCount >= cursor + 2) {
++ /*for Bengali*/
++ if ((chars[cursor] == 0x09C7 && chars[cursor + 1] == 0x09BE) ||
++ (chars[cursor] == 0x09C7 && chars[cursor + 1] == 0x09D7) ||
++ /*for Oriya*/
++ (chars[cursor] == 0x0B47 && chars[cursor + 1] == 0x0B3E) ||
++ (chars[cursor] == 0x0B47 && chars[cursor + 1] == 0x0B56) ||
++ (chars[cursor] == 0x0B47 && chars[cursor + 1] == 0x0B57) ||
++ /*for Tamil*/
++ (chars[cursor] == 0x0BC6 && chars[cursor + 1] == 0x0BBE) ||
++ (chars[cursor] == 0x0BC6 && chars[cursor + 1] == 0x0BD7) ||
++ (chars[cursor] == 0x0BC7 && chars[cursor + 1] == 0x0BBE) ||
++ /*for Malayalam*/
++ (chars[cursor] == 0x0D46 && chars[cursor + 1] == 0x0D3E) ||
++ (chars[cursor] == 0x0D46 && chars[cursor + 1] == 0x0D57) ||
++ (chars[cursor] == 0x0D47 && chars[cursor + 1] == 0x0D3E) ||
++ /*for Sinhala*/
++ (chars[cursor] == 0x0DD9 && chars[cursor + 1] == 0x0DCA) ||
++ (chars[cursor] == 0x0DD9 && chars[cursor + 1] == 0x0DCF) ||
++ (chars[cursor] == 0x0DD9 && chars[cursor + 1] == 0x0DDF) ||
++ (chars[cursor] == 0x0DDC && chars[cursor + 1] == 0x0DCA) ||
++ /*for Telugu*/
++ (chars[cursor] == 0x0C46 && chars[cursor + 1] == 0x0C56) ||
++ /*for Kannada*/
++ (chars[cursor] == 0x0CBF && chars[cursor + 1] == 0x0CD5) ||
++ (chars[cursor] == 0x0CC6 && chars[cursor + 1] == 0x0CD5) ||
++ (chars[cursor] == 0x0CC6 && chars[cursor + 1] == 0x0CD6) ||
++ (chars[cursor] == 0x0CC6 && chars[cursor + 1] == 0x0CC2) ||
++ (chars[cursor] == 0x0CCA && chars[cursor + 1] == 0x0CD5))
++ return cursor + 2;
++ }
++
+ if (state < 0) {
+ break;
+ }
diff --git a/icu.icu5557.safety.patch b/icu.icu5557.safety.patch
new file mode 100644
index 0000000..682caa1
--- /dev/null
+++ b/icu.icu5557.safety.patch
@@ -0,0 +1,14 @@
+--- icu.orig/source/layout/CoverageTables.cpp 2007-01-09 12:57:41.000000000 +0000
++++ icu/source/layout/CoverageTables.cpp 2007-01-09 12:59:09.000000000 +0000
+@@ -44,6 +44,11 @@
+ le_uint16 count = SWAPW(glyphCount);
+ le_uint8 bit = OpenTypeUtilities::highBit(count);
+ le_uint16 power = 1 << bit;
++
++ if (count == 0) {
++ return -1;
++ }
++
+ le_uint16 extra = count - power;
+ le_uint16 probe = power;
+ le_uint16 index = 0;
diff --git a/icu.icu5594.gujarati.patch b/icu.icu5594.gujarati.patch
new file mode 100644
index 0000000..b21418d
--- /dev/null
+++ b/icu.icu5594.gujarati.patch
@@ -0,0 +1,14 @@
+--- icu.orig/source/layout/IndicClassTables.cpp 2007-02-09 14:26:04.000000000 +0000
++++ icu/source/layout/IndicClassTables.cpp 2007-02-13 15:41:52.000000000 +0000
+@@ -117,7 +117,11 @@
+ _rv, _xx, _ct, _ct, _xx, _ct, _ct, _ct, _ct, _ct, _xx, _xx, _nu, _xx, _dr, _dl, // 0AB0 - 0ABF
+ _dr, _db, _db, _db, _db, _da, _xx, _da, _da, _dr, _xx, _dr, _dr, _vr, _xx, _xx, // 0AC0 - 0ACF
+ _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 0AD0 - 0ADF
++#if 1
++ _iv, _xx, _db, _db, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx // 0AE0 - 0AEF
++#else
+ _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx // 0AE0 - 0AEF
++#endif
+ };
+
+ #if 1
diff --git a/icu.icu5691.backport.patch b/icu.icu5691.backport.patch
new file mode 100644
index 0000000..906ecd3
--- /dev/null
+++ b/icu.icu5691.backport.patch
@@ -0,0 +1,730 @@
+diff -ru icu.6175/source/common/ucnv2022.c icu/source/common/ucnv2022.c
+--- icu.6175/source/common/ucnv2022.c 2009-06-02 15:47:31.000000000 +0100
++++ icu/source/common/ucnv2022.c 2009-06-02 16:03:15.000000000 +0100
+@@ -754,6 +754,7 @@
+ UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
+ uint32_t key = myData2022->key;
+ int32_t offset = 0;
++ int8_t initialToULength = _this->toULength;
+ char c;
+
+ value = VALID_NON_TERMINAL_2022;
+@@ -806,7 +807,6 @@
+ return;
+ } else if (value == INVALID_2022 ) {
+ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
+- return;
+ } else /* value == VALID_TERMINAL_2022 */ {
+ switch(var){
+ #ifdef U_ENABLE_GENERIC_ISO_2022
+@@ -938,6 +938,35 @@
+ }
+ if(U_SUCCESS(*err)) {
+ _this->toULength = 0;
++ } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
++ if(_this->toULength>1) {
++ /*
++ * Ticket 5691: consistent illegal sequences:
++ * - We include at least the first byte (ESC) in the illegal sequence.
++ * - If any of the non-initial bytes could be the start of a character,
++ * we stop the illegal sequence before the first one of those.
++ * In escape sequences, all following bytes are "printable", that is,
++ * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
++ * they are valid single/lead bytes.
++ * For simplicity, we always only report the initial ESC byte as the
++ * illegal sequence and back out all other bytes we looked at.
++ */
++ /* Back out some bytes. */
++ int8_t backOutDistance=_this->toULength-1;
++ int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
++ if(backOutDistance<=bytesFromThisBuffer) {
++ /* same as initialToULength<=1 */
++ *source-=backOutDistance;
++ } else {
++ /* Back out bytes from the previous buffer: Need to replay them. */
++ _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
++ /* same as -(initialToULength-1) */
++ /* preToULength is negative! */
++ uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
++ *source-=bytesFromThisBuffer;
++ }
++ _this->toULength=1;
++ }
+ } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
+ _this->toUCallbackReason = UCNV_UNASSIGNED;
+ }
+@@ -1973,6 +2002,7 @@
+ mySourceChar = args->converter->toUBytes[0];
+ args->converter->toULength = 0;
+ cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
++ targetUniChar = missingCharMarker;
+ goto getTrailByte;
+ }
+
+@@ -2102,17 +2132,44 @@
+ default:
+ /* G0 DBCS */
+ if(mySource < mySourceLimit) {
+- char trailByte;
++ int leadIsOk, trailIsOk;
++ uint8_t trailByte;
+ getTrailByte:
+- trailByte = *mySource++;
+- if(cs == JISX208) {
+- _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf);
+- } else {
+- tempBuf[0] = (char)mySourceChar;
+- tempBuf[1] = trailByte;
++ trailByte = (uint8_t)*mySource;
++ /*
++ * Ticket 5691: consistent illegal sequences:
++ * - We include at least the first byte in the illegal sequence.
++ * - If any of the non-initial bytes could be the start of a character,
++ * we stop the illegal sequence before the first one of those.
++ *
++ * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
++ * an ESC/SO/SI, we report only the first byte as the illegal sequence.
++ * Otherwise we convert or report the pair of bytes.
++ */
++ leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
++ trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
++ if (leadIsOk && trailIsOk) {
++ ++mySource;
++ uint32_t tmpSourceChar = (mySourceChar << 8) | trailByte;
++ if(cs == JISX208) {
++ _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
++ mySourceChar = tmpSourceChar;
++ } else {
++ /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
++ mySourceChar = tmpSourceChar;
++ if (cs == KSC5601) {
++ tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
++ }
++ tempBuf[0] = (char)(tmpSourceChar >> 8);
++ tempBuf[1] = (char)(tmpSourceChar);
++ }
++ targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
++ } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
++ /* report a pair of illegal bytes if the second byte is not a DBCS starter */
++ ++mySource;
++ /* add another bit so that the code below writes 2 bytes in case of error */
++ mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
+ }
+- mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
+- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
+ } else {
+ args->converter->toUBytes[0] = (uint8_t)mySourceChar;
+ args->converter->toULength = 1;
+@@ -2254,7 +2311,12 @@
+ }
+ /* only DBCS or SBCS characters are expected*/
+ /* DB characters with high bit set to 1 are expected */
+- if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){
++ if( length > 2 || length==0 ||
++ (length == 1 && targetByteUnit > 0x7f) ||
++ (length == 2 &&
++ ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
++ (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
++ ) {
+ targetByteUnit=missingCharMarker;
+ }
+ if (targetByteUnit != missingCharMarker){
+@@ -2583,17 +2645,34 @@
+ myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
+ if(myData->toU2022State.g == 1) {
+ if(mySource < mySourceLimit) {
+- char trailByte;
++ int leadIsOk, trailIsOk;
++ uint8_t trailByte;
+ getTrailByte:
+- trailByte = *mySource++;
+- tempBuf[0] = (char)(mySourceChar + 0x80);
+- tempBuf[1] = (char)(trailByte + 0x80);
+- mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
+- if((mySourceChar & 0x8080) == 0) {
++ targetUniChar = missingCharMarker;
++ trailByte = (uint8_t)*mySource;
++ /*
++ * Ticket 5691: consistent illegal sequences:
++ * - We include at least the first byte in the illegal sequence.
++ * - If any of the non-initial bytes could be the start of a character,
++ * we stop the illegal sequence before the first one of those.
++ *
++ * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
++ * an ESC/SO/SI, we report only the first byte as the illegal sequence.
++ * Otherwise we convert or report the pair of bytes.
++ */
++ leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
++ trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
++ if (leadIsOk && trailIsOk) {
++ ++mySource;
++ tempBuf[0] = (char)(mySourceChar + 0x80);
++ tempBuf[1] = (char)(trailByte + 0x80);
+ targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
+- } else {
+- /* illegal bytes > 0x7f */
+- targetUniChar = missingCharMarker;
++ mySourceChar = (mySourceChar << 8) | trailByte;
++ } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
++ /* report a pair of illegal bytes if the second byte is not a DBCS starter */
++ ++mySource;
++ /* add another bit so that the code below writes 2 bytes in case of error */
++ mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
+ }
+ } else {
+ args->converter->toUBytes[0] = (uint8_t)mySourceChar;
+@@ -2601,8 +2680,10 @@
+ break;
+ }
+ }
+- else{
++ else if(mySourceChar <= 0x7f) {
+ targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
++ } else {
++ targetUniChar = 0xffff;
+ }
+ if(targetUniChar < 0xfffe){
+ if(args->offsets) {
+@@ -3099,6 +3180,7 @@
+ /* continue with a partial double-byte character */
+ mySourceChar = args->converter->toUBytes[0];
+ args->converter->toULength = 0;
++ targetUniChar = missingCharMarker;
+ goto getTrailByte;
+ }
+
+@@ -3178,29 +3260,50 @@
+ UConverterSharedData *cnv;
+ StateEnum tempState;
+ int32_t tempBufLen;
+- char trailByte;
++ int leadIsOk, trailIsOk;
++ uint8_t trailByte;
+ getTrailByte:
+- trailByte = *mySource++;
+- tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
+- if(tempState > CNS_11643_0) {
+- cnv = myData->myConverterArray[CNS_11643];
+- tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
+- tempBuf[1] = (char) (mySourceChar);
+- tempBuf[2] = trailByte;
+- tempBufLen = 3;
+-
+- }else{
+- cnv = myData->myConverterArray[tempState];
+- tempBuf[0] = (char) (mySourceChar);
+- tempBuf[1] = trailByte;
+- tempBufLen = 2;
++ trailByte = (uint8_t)*mySource;
++ /*
++ * Ticket 5691: consistent illegal sequences:
++ * - We include at least the first byte in the illegal sequence.
++ * - If any of the non-initial bytes could be the start of a character,
++ * we stop the illegal sequence before the first one of those.
++ *
++ * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
++ * an ESC/SO/SI, we report only the first byte as the illegal sequence.
++ * Otherwise we convert or report the pair of bytes.
++ */
++ leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
++ trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
++ if (leadIsOk && trailIsOk) {
++ ++mySource;
++ tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
++ if(tempState >= CNS_11643_0) {
++ cnv = myData->myConverterArray[CNS_11643];
++ tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
++ tempBuf[1] = (char) (mySourceChar);
++ tempBuf[2] = (char) trailByte;
++ tempBufLen = 3;
++
++ }else{
++ cnv = myData->myConverterArray[tempState];
++ tempBuf[0] = (char) (mySourceChar);
++ tempBuf[1] = (char) trailByte;
++ tempBufLen = 2;
++ }
++ targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
++ mySourceChar = (mySourceChar << 8) | trailByte;
++ } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
++ /* report a pair of illegal bytes if the second byte is not a DBCS starter */
++ ++mySource;
++ /* add another bit so that the code below writes 2 bytes in case of error */
++ mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
+ }
+- mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
+ if(pToU2022State->g>=2) {
+ /* return from a single-shift state to the previous one */
+ pToU2022State->g=pToU2022State->prevG;
+ }
+- targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
+ } else {
+ args->converter->toUBytes[0] = (uint8_t)mySourceChar;
+ args->converter->toULength = 1;
+diff -ru icu.6175/source/common/ucnvhz.c icu/source/common/ucnvhz.c
+--- icu.6175/source/common/ucnvhz.c 2009-06-02 15:47:31.000000000 +0100
++++ icu/source/common/ucnvhz.c 2009-06-02 15:57:18.000000000 +0100
+@@ -196,10 +196,30 @@
+ /* if the first byte is equal to TILDE and the trail byte
+ * is not a valid byte then it is an error condition
+ */
+- mySourceChar = 0x7e00 | mySourceChar;
+- targetUniChar = 0xffff;
++ /*
++ * Ticket 5691: consistent illegal sequences:
++ * - We include at least the first byte in the illegal sequence.
++ * - If any of the non-initial bytes could be the start of a character,
++ * we stop the illegal sequence before the first one of those.
++ */
+ myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
+- break;
++ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
++ args->converter->toUBytes[0] = UCNV_TILDE;
++ if( myData->isStateDBCS ?
++ (0x21 <= mySourceChar && mySourceChar <= 0x7e) :
++ mySourceChar <= 0x7f
++ ) {
++ /* The current byte could be the start of a character: Back it out. */
++ args->converter->toULength = 1;
++ --mySource;
++ } else {
++ /* Include the current byte in the illegal sequence. */
++ args->converter->toUBytes[1] = mySourceChar;
++ args->converter->toULength = 2;
++ }
++ args->target = myTarget;
++ args->source = mySource;
++ return;
+ }
+ } else if(myData->isStateDBCS) {
+ if(args->converter->toUnicodeStatus == 0x00){
+@@ -215,19 +235,36 @@
+ }
+ else{
+ /* trail byte */
++ int leadIsOk, trailIsOk;
+ uint32_t leadByte = args->converter->toUnicodeStatus & 0xff;
+- if( (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21) &&
+- (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21)
+- ) {
++ targetUniChar = 0xffff;
++ /*
++ * Ticket 5691: consistent illegal sequences:
++ * - We include at least the first byte in the illegal sequence.
++ * - If any of the non-initial bytes could be the start of a character,
++ * we stop the illegal sequence before the first one of those.
++ *
++ * In HZ DBCS, if the second byte is in the 21..7e range,
++ * we report only the first byte as the illegal sequence.
++ * Otherwise we convert or report the pair of bytes.
++ */
++ leadIsOk = (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21);
++ trailIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
++ if (leadIsOk && trailIsOk) {
+ tempBuf[0] = (char) (leadByte+0x80) ;
+ tempBuf[1] = (char) (mySourceChar+0x80);
+ targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
+ tempBuf, 2, args->converter->useFallback);
++ mySourceChar= (leadByte << 8) | mySourceChar;
++ } else if (trailIsOk) {
++ /* report a single illegal byte and continue with the following DBCS starter byte */
++ --mySource;
++ mySourceChar = (int32_t)leadByte;
+ } else {
+- targetUniChar = 0xffff;
++ /* report a pair of illegal bytes if the second byte is not a DBCS starter */
++ /* add another bit so that the code below writes 2 bytes in case of error */
++ mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
+ }
+- /* add another bit so that the code below writes 2 bytes in case of error */
+- mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
+ args->converter->toUnicodeStatus =0x00;
+ }
+ }
+diff -ru icu.6175/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c
+--- icu.6175/source/common/ucnvmbcs.c 2009-06-02 15:47:31.000000000 +0100
++++ icu/source/common/ucnvmbcs.c 2009-06-02 15:56:07.000000000 +0100
+@@ -1697,6 +1697,65 @@
+ pArgs->offsets=offsets;
+ }
+
++static UBool
++hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) { /* TRUE if 'state', or a state reachable from it via transition entries, has a final entry whose action is not MBCS_STATE_ILLEGAL */
++ const int32_t *row=stateTable[state]; /* the 256 per-byte entries for this state */
++ int32_t b, entry;
++ /* First test for final entries in this state for some commonly valid byte values. */
++ entry=row[0xa1]; /* shortcut only: the full scan below covers this byte again */
++ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
++ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
++ ) {
++ return TRUE;
++ }
++ entry=row[0x41]; /* shortcut only: the full scan below covers this byte again */
++ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
++ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
++ ) {
++ return TRUE;
++ }
++ /* Then test for final entries in this state. */
++ for(b=0; b<=0xff; ++b) { /* all final entries are checked before any recursion is attempted */
++ entry=row[b];
++ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
++ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
++ ) {
++ return TRUE;
++ }
++ }
++ /* Then recurse for transition entries. */
++ for(b=0; b<=0xff; ++b) { /* NOTE(review): no cycle guard here; presumably state tables never loop back to an earlier state - confirm */
++ entry=row[b];
++ if( MBCS_ENTRY_IS_TRANSITION(entry) &&
++ hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry))
++ ) {
++ return TRUE;
++ }
++ }
++ return FALSE; /* neither this state nor any state reached from it has a non-illegal final entry */
++}
++
++/*
++ * Is byte b a single/lead byte in this state? Returns TRUE if so, FALSE otherwise.
++ * Recurse for transition states, because here we don't want to say that
++ * b is a lead byte if all byte sequences that start with b are illegal.
++ */
++static UBool
++isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {
++ const int32_t *row=stateTable[state];
++ int32_t entry=row[b]; /* the table entry for byte b in the given state */
++ if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */
++ return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry)); /* counts as a lead byte only if some continuation is not illegal */
++ } else { /* final entry: b completes a sequence by itself */
++ uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
++ if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
++ return FALSE; /* SI/SO are illegal for DBCS-only conversion */
++ } else {
++ return action!=MBCS_STATE_ILLEGAL; /* every other final action counts as a valid single byte */
++ }
++ }
++}
++
+ U_CFUNC void
+ ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
+ UErrorCode *pErrorCode) {
+@@ -2052,6 +2111,34 @@
+ sourceIndex=nextSourceIndex;
+ } else if(U_FAILURE(*pErrorCode)) {
+ /* callback(illegal) */
++ if(byteIndex>1) {
++ /*
++ * Ticket 5691: consistent illegal sequences:
++ * - We include at least the first byte in the illegal sequence.
++ * - If any of the non-initial bytes could be the start of a character,
++ * we stop the illegal sequence before the first one of those.
++ */
++ UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
++ int8_t i;
++ for(i=1;
++ i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);
++ ++i) {}
++ if(i<byteIndex) {
++ /* Back out some bytes. */
++ int8_t backOutDistance=byteIndex-i;
++ int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
++ byteIndex=i; /* length of reported illegal byte sequence */
++ if(backOutDistance<=bytesFromThisBuffer) {
++ source-=backOutDistance;
++ } else {
++ /* Back out bytes from the previous buffer: Need to replay them. */
++ cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
++ /* preToULength is negative! */
++ uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
++ source=(const uint8_t *)pArgs->source;
++ }
++ }
++ }
+ break;
+ } else /* unassigned sequences indicated with byteIndex>0 */ {
+ /* try an extension mapping */
+@@ -2062,7 +2149,7 @@
+ &offsets, sourceIndex,
+ pArgs->flush,
+ pErrorCode);
+- sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs->source);
++ sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source);
+
+ if(U_FAILURE(*pErrorCode)) {
+ /* not mappable or buffer overflow */
+@@ -2353,15 +2440,37 @@
+
+ if(c<0) {
+ if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
+- *pErrorCode=U_TRUNCATED_CHAR_FOUND;
+- }
+- if(U_FAILURE(*pErrorCode)) {
+ /* incomplete character byte sequence */
+ uint8_t *bytes=cnv->toUBytes;
+ cnv->toULength=(int8_t)(source-lastSource);
+ do {
+ *bytes++=*lastSource++;
+ } while(lastSource<source);
++ *pErrorCode=U_TRUNCATED_CHAR_FOUND;
++ } else if(U_FAILURE(*pErrorCode)) {
++ /* callback(illegal) */
++ /*
++ * Ticket 5691: consistent illegal sequences:
++ * - We include at least the first byte in the illegal sequence.
++ * - If any of the non-initial bytes could be the start of a character,
++ * we stop the illegal sequence before the first one of those.
++ */
++ UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
++ uint8_t *bytes=cnv->toUBytes;
++ *bytes++=*lastSource++; /* first byte */
++ if(lastSource==source) {
++ cnv->toULength=1;
++ } else /* lastSource<source: multi-byte character */ {
++ int8_t i;
++ for(i=1;
++ lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);
++ ++i
++ ) {
++ *bytes++=*lastSource++;
++ }
++ cnv->toULength=i;
++ source=lastSource;
++ }
+ } else {
+ /* no output because of empty input or only state changes */
+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+diff -ru icu.6175/source/test/cintltst/nccbtst.c icu/source/test/cintltst/nccbtst.c
+--- icu.6175/source/test/cintltst/nccbtst.c 2009-06-02 15:47:18.000000000 +0100
++++ icu/source/test/cintltst/nccbtst.c 2009-06-02 15:47:38.000000000 +0100
+@@ -2497,13 +2497,13 @@
+
+
+ static const uint8_t text943[] = {
+- 0x82, 0xa9, 0x82, 0x20, /*0xc8,*/ 0x61, 0x8a, 0xbf, 0x8e, 0x9a };
+- static const UChar toUnicode943sub[] = { 0x304b, 0xfffd, /*0xff88,*/ 0x0061, 0x6f22, 0x5b57};
+- static const UChar toUnicode943skip[]= { 0x304b, /*0xff88,*/ 0x0061, 0x6f22, 0x5b57};
++ 0x82, 0xa9, 0x82, 0x20, 0x61, 0x8a, 0xbf, 0x8e, 0x9a };
++ static const UChar toUnicode943sub[] = { 0x304b, 0x1a, 0x20, 0x0061, 0x6f22, 0x5b57 };
++ static const UChar toUnicode943skip[]= { 0x304b, 0x20, 0x0061, 0x6f22, 0x5b57 };
+ static const UChar toUnicode943stop[]= { 0x304b};
+
+- static const int32_t fromIBM943Offssub[] = {0, 2, 4, 5, 7};
+- static const int32_t fromIBM943Offsskip[] = { 0, 4, 5, 7};
++ static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 7 };
++ static const int32_t fromIBM943Offsskip[] = { 0, 3, 4, 5, 7 };
+ static const int32_t fromIBM943Offsstop[] = { 0};
+
+ gInBufferSize = inputsize;
+@@ -2537,9 +2537,9 @@
+ {
+ static const uint8_t sampleText[] = {
+ 0x82, 0xa9, 0x61, 0x62, 0x63 , 0x82,
+- 0xff, /*0x82, 0xa9,*/ 0x32, 0x33};
+- static const UChar toUnicode943sub[] = {0x304b, 0x0061, 0x0062, 0x0063, 0xfffd,/*0x304b,*/ 0x0032, 0x0033};
+- static const int32_t fromIBM943Offssub[] = {0, 2, 3, 4, 5, 7, 8};
++ 0xff, 0x32, 0x33};
++ static const UChar toUnicode943sub[] = { 0x304b, 0x0061, 0x0062, 0x0063, 0x1a, 0x1a, 0x0032, 0x0033 };
++ static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 6, 7, 8 };
+ /*checking illegal value for ibm-943 with substitute*/
+ gInBufferSize = inputsize;
+ gOutBufferSize = outputsize;
+diff -ru icu.6175/source/test/cintltst/nucnvtst.c icu/source/test/cintltst/nucnvtst.c
+--- icu.6175/source/test/cintltst/nucnvtst.c 2009-06-02 15:47:18.000000000 +0100
++++ icu/source/test/cintltst/nucnvtst.c 2009-06-02 15:47:38.000000000 +0100
+@@ -2606,7 +2606,7 @@
+ TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source");
+ /*Test for the condition where there is an invalid character*/
+ {
+- static const uint8_t source2[]={0xa1, 0x01};
++ static const uint8_t source2[]={0xa1, 0x80};
+ TestNextUCharError(cnv, (const char*)source2, (const char*)source2+sizeof(source2), U_ZERO_ERROR, "an invalid character");
+ }
+ /*Test for the condition where we have a truncated char*/
+@@ -3899,11 +3899,11 @@
+ TestISO_2022_KR() {
+ /* test input */
+ static const uint16_t in[]={
+- 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F66,0x9F67,0x9F6A,0x000A,0x000D
+- ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC02,0xAC04
++ 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F67,0x9F6A,0x000A,0x000D
++ ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC04
+ ,0xAC07,0xAC08,0xAC09,0x0025,0x0026,0x0027,0x000A,0x000D,0x0028,0x0029
+ ,0x002A,0x002B,0x002C,0x002D,0x002E,0x53C3,0x53C8,0x53C9,0x53CA,0x53CB
+- ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53DF,0x53E1,0x53E2
++ ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53E1,0x53E2
+ ,0x53E3,0x53E4,0x000A,0x000D};
+ const UChar* uSource;
+ const UChar* uSourceLimit;
+diff -ru icu.6175/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt
+--- icu.6175/source/test/testdata/conversion.txt 2009-06-02 15:47:18.000000000 +0100
++++ icu/source/test/testdata/conversion.txt 2009-06-02 15:57:41.000000000 +0100
+@@ -48,12 +48,144 @@
+ toUnicode {
+ Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
+ Cases {
++ // Test ticket 5691: consistent illegal sequences
++ // The following test cases are for illegal character byte sequences.
++ //
++ // Unfortunately, we cannot use the Shift-JIS examples from the ticket
++ // comments because our Shift-JIS table is Windows-compatible and
++ // therefore has no illegal single bytes. Same for GBK.
++ // Instead, we use the stricter GB 18030 also for 2-byte examples.
++ // The byte sequences are generally slightly different from the ticket
++ // comment, simply using assigned characters rather than just
++ // theoretically valid sequences.
++ {
++ "gb18030",
++ :bin{ 618140813c81ff7a },
++ "a\u4e02\\x81<\\x81\\xFFz",
++ :intvector{ 0,1,3,3,3,3,4,5,5,5,5,5,5,5,5,7 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ {
++ "EUC-JP",
++ :bin{ 618fb0a98fb03c8f3cb0a97a },
++ "a\u4e28\\x8F\\xB0<\\x8F<\u9022z",
++ :intvector{ 0,1,4,4,4,4,5,5,5,5,6,7,7,7,7,8,9,11 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ {
++ "gb18030",
++ :bin{ 618130fc318130fc8181303c3e813cfc817a },
++ "a\u05ed\\x810\u9f07\\x810<>\\x81<\u9f07z",
++ :intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ {
++ "UTF-8",
++ :bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a },
++ "a\U00040042\\xF1\\x80\\x81<\\xF1\\x80\\x81\\xFF\\xF1\\x80\\xFF<\\xF1\\xFF<>z",
++ :intvector{ 0,1,1,5,5,5,5,5,5,5,5,5,5,5,5,8,9,9,9,9,9,9,9,9,9,9,9,9,12,12,12,12,13,13,13,13,13,13,13,13,15,15,15,15,16,17,17,17,17,18,18,18,18,19,20,21 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ {
++ "ISO-2022-JP",
++ :bin{ 1b24424141af4142affe41431b2842 },
++ "\u758f\\xAF\u758e\\xAF\\xFE\u790e",
++ :intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ {
++ "ibm-25546",
++ :bin{ 411b242943420e4141af4142affe41430f5a },
++ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",
++ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ {
++ "ISO-2022-KR",
++ :bin{ 411b242943420e4141af4142affe41430f5a },
++ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",
++ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ {
++ "ISO-2022-CN",
++ :bin{ 411b242941420e4141af4142affe41430f5a },
++ "AB\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",
++ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ {
++ "HZ",
++ :bin{ 417e7b4141af4142affe41437e7d5a },
++ "A\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",
++ :intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ // Test ticket 5691: consistent illegal sequences
++ // The following test cases are for illegal escape/designator/shift sequences.
++ //
++ // ISO-2022-JP and -CN with illegal escape sequences.
++ {
++ "ISO-2022-JP",
++ :bin{ 611b24201b244241411b283f1b28427a },
++ "a\\x1B$ \u758f\\x1B\u2538z",
++ :intvector{ 0,1,1,1,1,2,3,7,9,9,9,9,10,15 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ {
++ "ISO-2022-CN",
++ :bin{ 611b2429201b2429410e41410f7a },
++ "a\\x1B$) \u4eaez",
++ :intvector{ 0,1,1,1,1,2,3,4,10,13 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ // Test ticket 5691: ISO-2022-JP-2 with illegal single-shift SS2 and SS3 sequences.
++ // The first ESC N comes before its designator sequence, the last sequence is ESC+space.
++ {
++ "ISO-2022-JP-2",
++ :bin{ 4e1b4e4e1b2e414e1b4e4e4e1b204e },
++ "N\\x1BNNN\xceN\\x1B N",
++ :intvector{ 0,1,1,1,1,2,3,7,10,11,12,12,12,12,13,14 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ {
++ "ISO-2022-CN-EXT",
++ :bin{ 4e1b4e4e1b242a484e1b4e4e4e4e1b204e },
++ "N\\x1BNNN\u8f0eN\\x1B N",
++ :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ {
++ "ISO-2022-CN-EXT",
++ :bin{ 4f1b4f4f1b242b494f1b4f4f4f4f1b204f },
++ "O\\x1BOOO\u492bO\\x1B O",
++ :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ // Test ticket 5691: HZ with illegal tilde sequences.
++ {
++ "HZ",
++ :bin{ 417e20427e21437e80447e7b41417e207e41427e7f41437e7d5a },
++ "A\\x7E B\\x7E!C\\x7E\\x80D\u4eae\\x7E\\x20\\x7E\u8c05\\x7E\\x7F\u64a9Z",
++ :intvector{ 0,1,1,1,1,2,3,4,4,4,4,5,6,7,7,7,7,7,7,7,7,9, // SBCS
++ 12,14,14,14,14,14,14,14,14,16,16,16,16,17,19,19,19,19,19,19,19,19,21, // DBCS
++ 25 }, // SBCS
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ // Test ticket 5691: Example from Peter Edberg.
++ {
++ "ISO-2022-JP",
++ :bin{ 1b244230212f7e742630801b284a621b2458631b2842648061 },
++ "\u4e9c\ufffd\u7199\ufffdb\ufffd$Xcd\ufffda",
++ :intvector{ 3,5,7,9,14,15,16,17,18,22,23,24 },
++ :int{1}, :int{0}, "", "?", :bin{""}
++ }
+ // test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e
+ {
+ "HZ",
+- :bin{ 7e7b21212120217e217f772100007e217e7d207e7e807e0a2b },
+- "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd ~\ufffd+",
+- :intvector{ 2,4,6,8,10,12,14,18,19,21,24 },
++ :bin{ 7e7b21212120217e217f772100007e217e7e7d207e7e807e0a2b },
++ "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd\u3013 ~\ufffd+",
++ :intvector{ 2,4,6,8,10,12,14,15,19,20,22,25 },
+ :int{1}, :int{1}, "", "?", :bin{""}
+ }
+ // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
+@@ -61,8 +193,8 @@
+ {
+ "ISO-2022-JP",
+ :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 },
+- "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",
+- :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 },
++ "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",
++ :intvector{ 3,4,5,9,11,12,14,16,17,19,21,23,25,27 },
+ :int{1}, :int{1}, "", "?", :bin{""}
+ }
+ // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets()
+@@ -341,7 +473,7 @@
+ {
+ "ISO-2022-CN-EXT",
+ :bin{ 411b4e2121 }, "\x41", :intvector{ 0 },
+- :int{1}, :int{1}, "illesc", ".", :bin{ 1b4e }
++ :int{1}, :int{1}, "illesc", ".", :bin{ 1b }
+ }
+ // G3 designator: recognized, but not supported for -CN (only for -CN-EXT)
+ {
diff --git a/icu.icu5797.backport.patch b/icu.icu5797.backport.patch
new file mode 100644
index 0000000..39e3f77
--- /dev/null
+++ b/icu.icu5797.backport.patch
@@ -0,0 +1,749 @@
+diff -ru icu.5483/source/common/ucnv2022.c icu/source/common/ucnv2022.c
+--- icu.5483/source/common/ucnv2022.c 2009-06-02 12:47:41.000000000 +0100
++++ icu/source/common/ucnv2022.c 2009-06-02 13:18:23.000000000 +0100
+@@ -473,8 +473,7 @@
+ if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
+ myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode);
+ }
+- myConverterData->myConverterArray[JISX201] = ucnv_loadSharedData("JISX0201", NULL, errorCode);
+- myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("jisx-208", NULL, errorCode);
++ myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("Shift-JIS", NULL, errorCode);
+ if(jpCharsetMasks[version]&CSM(JISX212)) {
+ myConverterData->myConverterArray[JISX212] = ucnv_loadSharedData("jisx-212", NULL, errorCode);
+ }
+@@ -1045,14 +1044,6 @@
+ length=3;
+ }
+ }
+- /*
+- * TODO(markus): Use Shift-JIS table for JIS X 0208, to save mapping table space.
+- * Pass in parameter for type of output bytes, for validation and shifting:
+- * - Direct: Pass bytes through, but forbid control codes 00-1F (except SI/SO/ESC) and space 20?
+- * (Need to allow some (TAB/LF/CR) or most of them for ASCII and maybe JIS X 0201.)
+- * - A1-FE: Subtract 80 after range check.
+- * - SJIS: Shift DBCS result to 21-7E x 21-7E.
+- */
+ /* is this code point assigned, or do we use fallbacks? */
+ if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
+ /* assigned */
+@@ -1110,6 +1101,23 @@
+ }
+ }
+
++/*
++ * Check that the result is a 2-byte value with each byte in the range A1..FE
++ * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
++ * to move it to the ISO 2022 range 21..7E.
++ * Return 0 if out of range.
++ */
++static U_INLINE uint32_t
++_2022FromGR94DBCS(uint32_t value) {
++ if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
++ (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
++ ) {
++ return value - 0x8080; /* shift down to 21..7e byte range */
++ } else {
++ return 0; /* not valid for ISO 2022 */
++ }
++}
++
+ #ifdef U_ENABLE_GENERIC_ISO_2022
+
+ /**********************************************************************************
+@@ -1238,7 +1246,7 @@
+ }
+ else{
+ cnv->toUBytes[0] =(char) sourceChar;
+- cnv->toULength = 2;
++ cnv->toULength = 1;
+ }
+
+ if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
+@@ -1332,6 +1340,181 @@
+ 3 /* length of <ESC>(I HWKANA_7BIT */
+ };
+
++/* Map 00..7F to Unicode according to JIS X 0201. */
++static U_INLINE uint32_t
++jisx201ToU(uint32_t value) {
++ if(value < 0x5c) {
++ return value;
++ } else if(value == 0x5c) {
++ return 0xa5;
++ } else if(value == 0x7e) {
++ return 0x203e;
++ } else /* value <= 0x7f */ {
++ return value;
++ }
++}
++
++/* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */
++static U_INLINE uint32_t
++jisx201FromU(uint32_t value) {
++ if(value<=0x7f) {
++ if(value!=0x5c && value!=0x7e) {
++ return value;
++ }
++ } else if(value==0xa5) {
++ return 0x5c;
++ } else if(value==0x203e) {
++ return 0x7e;
++ }
++ return 0xfffe;
++}
++
++/*
++ * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
++ * to JIS X 0208, and convert it to a pair of 21..7E bytes.
++ * Return 0 if the byte pair is out of range.
++ */
++static U_INLINE uint32_t
++_2022FromSJIS(uint32_t value) {
++ uint8_t trail;
++
++ if(value > 0xEFFC) {
++ return 0; /* beyond JIS X 0208 */
++ }
++
++ trail = (uint8_t)value;
++
++ value &= 0xff00; /* lead byte */
++ if(value <= 0x9f00) {
++ value -= 0x7000;
++ } else /* 0xe000 <= value <= 0xef00 */ {
++ value -= 0xb000;
++ }
++ value <<= 1;
++
++ if(trail <= 0x9e) {
++ value -= 0x100;
++ if(trail <= 0x7e) {
++ value |= trail - 0x1f;
++ } else {
++ value |= trail - 0x20;
++ }
++ } else /* trail <= 0xfc */ {
++ value |= trail - 0x7e;
++ }
++ return value;
++}
++
++/*
++ * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
++ * If either byte is outside 21..7E make sure that the result is not valid
++ * for Shift-JIS so that the converter catches it.
++ * Some invalid byte values already turn into equally invalid Shift-JIS
++ * byte values and need not be tested explicitly.
++ */
++static U_INLINE void
++_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
++ if(c1&1) {
++ ++c1;
++ if(c2 <= 0x5f) {
++ c2 += 0x1f;
++ } else if(c2 <= 0x7e) {
++ c2 += 0x20;
++ } else {
++ c2 = 0; /* invalid */
++ }
++ } else {
++ if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
++ c2 += 0x7e;
++ } else {
++ c2 = 0; /* invalid */
++ }
++ }
++ c1 >>= 1;
++ if(c1 <= 0x2f) {
++ c1 += 0x70;
++ } else if(c1 <= 0x3f) {
++ c1 += 0xb0;
++ } else {
++ c1 = 0; /* invalid */
++ }
++ bytes[0] = (char)c1;
++ bytes[1] = (char)c2;
++}
++
++/*
++ * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
++ * Katakana.
++ * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
++ * because Shift-JIS roundtrips half-width Katakana to single bytes.
++ * These were the only fallbacks in ICU's jisx-208.ucm file.
++ */
++static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
++ 0x2123, /* U+FF61 */
++ 0x2156,
++ 0x2157,
++ 0x2122,
++ 0x2126,
++ 0x2572,
++ 0x2521,
++ 0x2523,
++ 0x2525,
++ 0x2527,
++ 0x2529,
++ 0x2563,
++ 0x2565,
++ 0x2567,
++ 0x2543,
++ 0x213C, /* U+FF70 */
++ 0x2522,
++ 0x2524,
++ 0x2526,
++ 0x2528,
++ 0x252A,
++ 0x252B,
++ 0x252D,
++ 0x252F,
++ 0x2531,
++ 0x2533,
++ 0x2535,
++ 0x2537,
++ 0x2539,
++ 0x253B,
++ 0x253D,
++ 0x253F, /* U+FF80 */
++ 0x2541,
++ 0x2544,
++ 0x2546,
++ 0x2548,
++ 0x254A,
++ 0x254B,
++ 0x254C,
++ 0x254D,
++ 0x254E,
++ 0x254F,
++ 0x2552,
++ 0x2555,
++ 0x2558,
++ 0x255B,
++ 0x255E,
++ 0x255F, /* U+FF90 */
++ 0x2560,
++ 0x2561,
++ 0x2562,
++ 0x2564,
++ 0x2566,
++ 0x2568,
++ 0x2569,
++ 0x256A,
++ 0x256B,
++ 0x256C,
++ 0x256D,
++ 0x256F,
++ 0x2573,
++ 0x212B,
++ 0x212C /* U+FF9F */
++};
++
+ /*
+ * The iteration over various code pages works this way:
+ * i) Get the currentState from myConverterData->currentState
+@@ -1504,7 +1687,7 @@
+ }
+ break;
+ case HWKANA_7BIT:
+- if((uint32_t)(HWKANA_END-sourceChar)<=(HWKANA_END-HWKANA_START)) {
++ if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
+ if(converterData->version==3) {
+ /* JIS7: use G1 (SO) */
+ /* Shift U+FF61..U+FF9F to bytes 21..5F. */
+@@ -1531,13 +1714,34 @@
+ break;
+ case JISX201:
+ /* G0 SBCS */
+- len2 = MBCS_SINGLE_FROM_UCHAR32(
++ value = jisx201FromU(sourceChar);
++ if(value <= 0x7f) {
++ targetValue = value;
++ len = 1;
++ cs = cs0;
++ g = 0;
++ useFallback = FALSE;
++ }
++ break;
++ case JISX208:
++ /* G0 DBCS from Shift-JIS table */
++ len2 = MBCS_FROM_UCHAR32_ISO2022(
+ converterData->myConverterArray[cs0],
+ sourceChar, &value,
+- useFallback);
+- if(len2 != 0 && !(len2 < 0 && len != 0) && value <= 0x7f) {
+- targetValue = value;
+- len = len2;
++ useFallback, MBCS_OUTPUT_2);
++ if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
++ value = _2022FromSJIS(value);
++ if(value != 0) {
++ targetValue = value;
++ len = len2;
++ cs = cs0;
++ g = 0;
++ useFallback = FALSE;
++ }
++ } else if(len == 0 && useFallback &&
++ (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
++ targetValue = hwkana_fb[sourceChar - HWKANA_START];
++ len = -2;
+ cs = cs0;
+ g = 0;
+ useFallback = FALSE;
+@@ -1569,17 +1773,10 @@
+ * Check for valid bytes for the encoding scheme.
+ * This is necessary because the sub-converter (windows-949)
+ * has a broader encoding scheme than is valid for 2022.
+- *
+- * Check that the result is a 2-byte value with each byte in the range A1..FE
+- * (strict EUC-KR DBCS) before accepting it and subtracting 0x80 from each byte
+- * to move it to the ISO 2022 range 21..7E.
+ */
+- if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
+- (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
+- ) {
+- value -= 0x8080; /* shift down to 21..7e byte range */
+- } else {
+- break; /* not valid for ISO 2022 */
++ value = _2022FromGR94DBCS(value);
++ if(value == 0) {
++ break;
+ }
+ }
+ targetValue = value;
+@@ -1755,7 +1952,7 @@
+ static void
+ UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
+ UErrorCode* err){
+- char tempBuf[3];
++ char tempBuf[2];
+ const char *mySource = (char *) args->source;
+ UChar *myTarget = args->target;
+ const char *mySourceLimit = args->sourceLimit;
+@@ -1893,10 +2090,7 @@
+ break;
+ case JISX201:
+ if(mySourceChar <= 0x7f) {
+- targetUniChar =
+- _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
+- myData->myConverterArray[cs],
+- mySourceChar);
++ targetUniChar = jisx201ToU(mySourceChar);
+ }
+ break;
+ case HWKANA_7BIT:
+@@ -1910,8 +2104,13 @@
+ if(mySource < mySourceLimit) {
+ char trailByte;
+ getTrailByte:
+- tempBuf[0] = (char) (mySourceChar);
+- tempBuf[1] = trailByte = *mySource++;
++ trailByte = *mySource++;
++ if(cs == JISX208) {
++ _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf);
++ } else {
++ tempBuf[0] = (char)mySourceChar;
++ tempBuf[1] = trailByte;
++ }
+ mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
+ targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
+ } else {
+@@ -3254,6 +3453,9 @@
+ /* open a set and initialize it with code points that are algorithmically round-tripped */
+ switch(cnvData->locale[0]){
+ case 'j':
++ /* include JIS X 0201 which is hardcoded */
++ sa->add(sa->set, 0xa5);
++ sa->add(sa->set, 0x203e);
+ if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
+ /* include Latin-1 for some variants of JP */
+ sa->addRange(sa->set, 0, 0xff);
+@@ -3262,6 +3464,11 @@
+ sa->addRange(sa->set, 0, 0x7f);
+ }
+ if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {
++ /*
++ * TODO(markus): If and when ucnv_getUnicodeSet() supports fallbacks,
++ * we need to include half-width Katakana for all JP variants because
++ * JIS X 0208 has hardcoded fallbacks for them.
++ */
+ /* include half-width Katakana for JP */
+ sa->addRange(sa->set, HWKANA_START, HWKANA_END);
+ }
+@@ -3281,15 +3488,7 @@
+ break;
+ }
+
+- /*
+- * Version-specific for CN:
+- * CN version 0 does not map CNS planes 3..7 although
+- * they are all available in the CNS conversion table;
+- * CN version 1 does map them all.
+- * The two versions create different Unicode sets.
+- */
+- for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
+- if(cnvData->myConverterArray[i]!=NULL) {
++#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
+ if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
+ cnvData->version==0 && i==CNS_11643
+ ) {
+@@ -3299,9 +3498,33 @@
+ sa, UCNV_ROUNDTRIP_SET,
+ 0, 0x81, 0x82,
+ pErrorCode);
++ }
++#endif
++
++ for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
++ UConverterSetFilter filter;
++ if(cnvData->myConverterArray[i]!=NULL) {
++ if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
++ cnvData->version==0 && i==CNS_11643
++ ) {
++ /*
++ * Version-specific for CN:
++ * CN version 0 does not map CNS planes 3..7 although
++ * they are all available in the CNS conversion table;
++ * CN version 1 (-EXT) does map them all.
++ * The two versions create different Unicode sets.
++ */
++ filter=UCNV_SET_FILTER_2022_CN;
++ } else if(cnvData->locale[0]=='j' && i==JISX208) {
++ /*
++ * Only add code points that map to Shift-JIS codes
++ * corresponding to JIS X 0208.
++ */
++ filter=UCNV_SET_FILTER_SJIS;
+ } else {
+- ucnv_MBCSGetUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, pErrorCode);
++ filter=UCNV_SET_FILTER_NONE;
+ }
++ ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
+ }
+ }
+
+diff -ru icu.5483/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c
+--- icu.5483/source/common/ucnvmbcs.c 2009-06-02 12:47:41.000000000 +0100
++++ icu/source/common/ucnvmbcs.c 2009-06-02 12:48:08.000000000 +0100
+@@ -340,6 +340,8 @@
+
+ /* Miscellaneous ------------------------------------------------------------ */
+
++#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
++
+ /* similar to ucnv_MBCSGetNextUChar() but recursive */
+ static void
+ _getUnicodeSetForBytes(const UConverterSharedData *sharedData,
+@@ -432,11 +434,14 @@
+ pErrorCode);
+ }
+
++#endif
++
+ U_CFUNC void
+-ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
+- const USetAdder *sa,
+- UConverterUnicodeSet which,
+- UErrorCode *pErrorCode) {
++ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
++ const USetAdder *sa,
++ UConverterUnicodeSet which,
++ UConverterSetFilter filter,
++ UErrorCode *pErrorCode) {
+ const UConverterMBCSTable *mbcsTable;
+ const uint16_t *table;
+
+@@ -490,50 +495,26 @@
+ c+=1024; /* empty stage 2 block */
+ }
+ }
+- } else if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY) {
+- /* ignore single-byte results */
++ } else {
+ const uint32_t *stage2;
+- const uint16_t *stage3, *results;
+-
+- results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
+-
+- for(st1=0; st1<maxStage1; ++st1) {
+- st2=table[st1];
+- if(st2>(maxStage1>>1)) {
+- stage2=(const uint32_t *)table+st2;
+- for(st2=0; st2<64; ++st2) {
+- if((st3=stage2[st2])!=0) {
+- /* read the stage 3 block */
+- stage3=results+16*(uint32_t)(uint16_t)st3;
++ const uint8_t *stage3, *bytes;
++ uint32_t st3Multiplier;
++ uint32_t value;
+
+- /* get the roundtrip flags for the stage 3 block */
+- st3>>=16;
++ bytes=mbcsTable->fromUnicodeBytes;
+
+- /*
+- * Add code points for which the roundtrip flag is set.
+- * Once we get a set for fallback mappings, we have to check
+- * non-roundtrip stage 3 results for whether they are 0.
+- * See ucnv_MBCSFromUnicodeWithOffsets() for details.
+- *
+- * Ignore single-byte results (<0x100).
+- */
+- do {
+- if((st3&1)!=0 && *stage3>=0x100) {
+- sa->add(sa->set, c);
+- }
+- st3>>=1;
+- ++stage3;
+- } while((++c&0xf)!=0);
+- } else {
+- c+=16; /* empty stage 3 block */
+- }
+- }
+- } else {
+- c+=1024; /* empty stage 2 block */
+- }
++ switch(mbcsTable->outputType) {
++ case MBCS_OUTPUT_3:
++ case MBCS_OUTPUT_4_EUC:
++ st3Multiplier=3;
++ break;
++ case MBCS_OUTPUT_4:
++ st3Multiplier=4;
++ break;
++ default:
++ st3Multiplier=2;
++ break;
+ }
+- } else {
+- const uint32_t *stage2;
+
+ for(st1=0; st1<maxStage1; ++st1) {
+ st2=table[st1];
+@@ -541,6 +522,9 @@
+ stage2=(const uint32_t *)table+st2;
+ for(st2=0; st2<64; ++st2) {
+ if((st3=stage2[st2])!=0) {
++ /* read the stage 3 block */
++ stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3;
++
+ /* get the roundtrip flags for the stage 3 block */
+ st3>>=16;
+
+@@ -550,12 +534,49 @@
+ * non-roundtrip stage 3 results for whether they are 0.
+ * See ucnv_MBCSFromUnicodeWithOffsets() for details.
+ */
+- do {
+- if(st3&1) {
+- sa->add(sa->set, c);
+- }
+- st3>>=1;
+- } while((++c&0xf)!=0);
++ switch(filter) {
++ case UCNV_SET_FILTER_NONE:
++ do {
++ if(st3&1) {
++ sa->add(sa->set, c);
++ }
++ st3>>=1;
++ } while((++c&0xf)!=0);
++ break;
++ case UCNV_SET_FILTER_DBCS_ONLY:
++ /* Ignore single-byte results (<0x100). */
++ do {
++ if((st3&1)!=0 && *((const uint16_t *)stage3)>=0x100) {
++ sa->add(sa->set, c);
++ }
++ st3>>=1;
++ stage3+=2; /* +=st3Multiplier */
++ } while((++c&0xf)!=0);
++ break;
++ case UCNV_SET_FILTER_2022_CN:
++ /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
++ do {
++ if((st3&1)!=0 && ((value=*stage3)==0x81 || value==0x82)) {
++ sa->add(sa->set, c);
++ }
++ st3>>=1;
++ stage3+=3; /* +=st3Multiplier */
++ } while((++c&0xf)!=0);
++ break;
++ case UCNV_SET_FILTER_SJIS:
++ /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
++ do {
++ if((st3&1)!=0 && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
++ sa->add(sa->set, c);
++ }
++ st3>>=1;
++ stage3+=2; /* +=st3Multiplier */
++ } while((++c&0xf)!=0);
++ break;
++ default:
++ *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
++ return;
++ }
+ } else {
+ c+=16; /* empty stage 3 block */
+ }
+@@ -569,6 +590,19 @@
+ ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode);
+ }
+
++U_CFUNC void
++ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
++ const USetAdder *sa,
++ UConverterUnicodeSet which,
++ UErrorCode *pErrorCode) {
++ ucnv_MBCSGetFilteredUnicodeSetForUnicode(
++ sharedData, sa, which,
++ sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ?
++ UCNV_SET_FILTER_DBCS_ONLY :
++ UCNV_SET_FILTER_NONE,
++ pErrorCode);
++}
++
+ static void
+ ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
+ const USetAdder *sa,
+diff -ru icu.5483/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h
+--- icu.5483/source/common/ucnvmbcs.h 2009-06-02 12:47:41.000000000 +0100
++++ icu/source/common/ucnvmbcs.h 2009-06-02 12:48:08.000000000 +0100
+@@ -363,6 +363,7 @@
+ ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
+ UErrorCode *pErrorCode);
+
++#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
+ /*
+ * Internal function returning a UnicodeSet for toUnicode() conversion.
+ * Currently only used for ISO-2022-CN, and only handles roundtrip mappings.
+@@ -377,6 +378,7 @@
+ UConverterUnicodeSet which,
+ uint8_t state, int32_t lowByte, int32_t highByte,
+ UErrorCode *pErrorCode);
++#endif
+
+ /*
+ * Internal function returning a UnicodeSet for toUnicode() conversion.
+@@ -388,9 +390,30 @@
+ */
+ U_CFUNC void
+ ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
+- const USetAdder *sa,
+- UConverterUnicodeSet which,
+- UErrorCode *pErrorCode);
++ const USetAdder *sa,
++ UConverterUnicodeSet which,
++ UErrorCode *pErrorCode);
++
++typedef enum UConverterSetFilter {
++ UCNV_SET_FILTER_NONE,
++ UCNV_SET_FILTER_DBCS_ONLY,
++ UCNV_SET_FILTER_2022_CN,
++ UCNV_SET_FILTER_SJIS,
++ UCNV_SET_FILTER_COUNT
++} UConverterSetFilter;
++
++/*
++ * Same as ucnv_MBCSGetUnicodeSetForUnicode() but
++ * the set can be filtered by encoding scheme.
++ * Used by stateful converters which share regular conversion tables
++ * but only use a subset of their mappings.
++ */
++U_CFUNC void
++ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
++ const USetAdder *sa,
++ UConverterUnicodeSet which,
++ UConverterSetFilter filter,
++ UErrorCode *pErrorCode);
+
+ #endif
+
+diff -ru icu.5483/source/test/cintltst/nucnvtst.c icu/source/test/cintltst/nucnvtst.c
+--- icu.5483/source/test/cintltst/nucnvtst.c 2009-06-02 12:47:25.000000000 +0100
++++ icu/source/test/cintltst/nucnvtst.c 2009-06-02 12:58:02.000000000 +0100
+@@ -3202,7 +3202,7 @@
+ 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x000D, 0x000A,
+ 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x000D, 0x000A,
+ 0x3005, 0x3006, 0x3007, 0x30FC, 0x2015, 0x2010, 0xFF0F, 0x005C, 0x000D, 0x000A,
+- 0x301C, 0x2016, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A,
++ 0x3013, 0x2018, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A,
+ 0x201D, 0x3014, 0x000D, 0x000A,
+ 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A,
+ 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A,
+@@ -3730,7 +3730,7 @@
+ 0x52C8, 0x52CC, 0x52CF, 0x52D1, 0x52D4, 0x52D6, 0x52DB, 0x52DC, 0x000D, 0x000A,
+ 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x000D, 0x000A,
+ 0x3005, 0x3006, 0x3007, 0x30FC, 0x2015, 0x2010, 0xFF0F, 0x005C, 0x000D, 0x000A,
+- 0x301C, 0x2016, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A,
++ 0x3013, 0x2018, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A,
+ 0x201D, 0x000D, 0x000A,
+ 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A,
+ 0x4F94, 0x4F97, 0x52BA, 0x52BB, 0x52BD, 0x52C0, 0x52C4, 0x52C6, 0x000D, 0x000A,
+diff -ru icu.5483/source/test/cintltst/udatatst.c icu/source/test/cintltst/udatatst.c
+--- icu.5483/source/test/cintltst/udatatst.c 2009-06-02 12:47:25.000000000 +0100
++++ icu/source/test/cintltst/udatatst.c 2009-06-02 13:09:15.000000000 +0100
+@@ -1260,6 +1260,11 @@
+ {"gb18030", "cnv", ucnv_swap},
+ /* MBCS conversion table file with extension */
+ {"*test4x", "cnv", ucnv_swap},
++ /*
++ * MBCS conversion table file without extension,
++ * to test swapping and preflighting of UTF-8-friendly mbcsIndex[].
++ */
++ {"jisx-212", "cnv", ucnv_swap},
+ #endif
+
+ #if !UCONFIG_NO_CONVERSION
+diff -ru icu.5483/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt
+--- icu.5483/source/test/testdata/conversion.txt 2009-06-02 12:47:25.000000000 +0100
++++ icu/source/test/testdata/conversion.txt 2009-06-02 12:49:51.000000000 +0100
+@@ -48,6 +48,15 @@
+ toUnicode {
+ Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
+ Cases {
++ // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
++ // using the Shift-JIS table for JIS X 0208 (ticket #5797)
++ {
++ "ISO-2022-JP",
++ :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 },
++ "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",
++ :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 },
++ :int{1}, :int{1}, "", "?", :bin{""}
++ }
+ // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets()
+ {
+ "ISO-8859-3",
+@@ -495,6 +504,15 @@
+ }
+ { "UTF-16BE", :bin{ 00 }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ 00 } }
+ { "UTF-16BE", :bin{ d800dc }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ d800dc } }
++ // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
++ // using the Shift-JIS table for JIS X 0208 (ticket #5797)
++ {
++ "ISO-2022-JP",
++ "\u203e\xa5\u4e00\ufa10\u6f3e\u0391",
++ :bin{ 1b284a7e5c1b2442306c222e5f2126211b2842 },
++ :intvector{ 0,0,0,0,1,2,2,2,2,2,3,3,4,4,5,5,5,5,5 },
++ :int{1}, :int{0}, "", "?=\u3013", "" // U+3013 Geta Mark converts to 222e
++ }
+ // Verify that mappings that would result in byte values outside 20..7F (for SBCS)
+ // or 21..7E (for DBCS) are not used.
+ // ibm-9005_X110-2007.ucm (ISO 8859-7, <ESC>.F=1b2e46):
+@@ -1273,13 +1291,13 @@
+ // versions of ISO-2022-JP
+ {
+ "ISO-2022-JP",
+- "[\x00-\x0d\x10-\x1a\x1c-\x7f\u0391-\u03a1\uff61-\uff9f\u4e00\u4e01\uffe5]",
+- "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\uffe6-\U0010ffff]",
++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e00\u4e01\uffe5]",
++ "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\ufa2d\uffe6-\U0010ffff]",
+ :int{0}
+ }
+ {
+ "ISO-2022-JP-2",
+- "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0385-\u038a\u0390-\u03a1\uff61-\uff9f\u4e00-\u4e05\uffe6]",
++ "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]",
+ "[\x0e\x0f\x1b\uffe7-\U0010ffff]",
+ :int{0}
+ }
diff --git a/icu.icu6001.backport.patch b/icu.icu6001.backport.patch
new file mode 100644
index 0000000..11b2ee3
--- /dev/null
+++ b/icu.icu6001.backport.patch
@@ -0,0 +1,741 @@
+diff -ru icu.5797/source/common/ucnv2022.c icu/source/common/ucnv2022.c
+--- icu.5797/source/common/ucnv2022.c 2009-06-02 14:45:31.000000000 +0100
++++ icu/source/common/ucnv2022.c 2009-06-02 15:05:10.000000000 +0100
+@@ -3399,11 +3399,19 @@
+ /* include ASCII for JP */
+ sa->addRange(sa->set, 0, 0x7f);
+ }
+- if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {
++ if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
+ /*
+- * TODO(markus): If and when ucnv_getUnicodeSet() supports fallbacks,
+- * we need to include half-width Katakana for all JP variants because
+- * JIS X 0208 has hardcoded fallbacks for them.
++ * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
++ * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
++ * use half-width Katakana.
++ * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
++ * half-width Katakana via the ESC ( I sequence.
++ * However, we only emit (fromUnicode) half-width Katakana according to the
++ * definition of each variant.
++ *
++ * When including fallbacks,
++ * we need to include half-width Katakana Unicode code points for all JP variants because
++ * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
+ */
+ /* include half-width Katakana for JP */
+ sa->addRange(sa->set, HWKANA_START, HWKANA_END);
+@@ -3457,6 +3465,12 @@
+ * corresponding to JIS X 0208.
+ */
+ filter=UCNV_SET_FILTER_SJIS;
++ } else if(i==KSC5601) {
++ /*
++ * Some of the KSC 5601 tables (convrtrs.txt has this alias on multiple tables)
++ * are broader than GR94.
++ */
++ filter=UCNV_SET_FILTER_GR94DBCS;
+ } else {
+ filter=UCNV_SET_FILTER_NONE;
+ }
+@@ -3472,6 +3486,9 @@
+ sa->remove(sa->set, 0x0e);
+ sa->remove(sa->set, 0x0f);
+ sa->remove(sa->set, 0x1b);
++
++ /* ISO 2022 converters do not convert C1 controls either */
++ sa->removeRange(sa->set, 0x80, 0x9f);
+ }
+
+ static const UConverterImpl _ISO2022Impl={
+diff -ru icu.5797/source/common/ucnv_ext.c icu/source/common/ucnv_ext.c
+--- icu.5797/source/common/ucnv_ext.c 2009-06-02 14:45:31.000000000 +0100
++++ icu/source/common/ucnv_ext.c 2009-06-02 15:12:21.000000000 +0100
+@@ -946,7 +946,7 @@
+ ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
+ const int32_t *cx,
+ const USetAdder *sa,
+- UConverterUnicodeSet which,
++ UBool useFallback,
+ int32_t minLength,
+ UChar32 c,
+ UChar s[UCNV_EXT_MAX_UCHARS], int32_t length,
+@@ -966,7 +966,7 @@
+ value=*fromUSectionValues++;
+
+ if( value!=0 &&
+- UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) &&
++ (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || useFallback) &&
+ UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
+ ) {
+ if(c>=0) {
+@@ -987,12 +987,14 @@
+ /* no mapping, do nothing */
+ } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
+ ucnv_extGetUnicodeSetString(
+- sharedData, cx, sa, which, minLength,
++ sharedData, cx, sa, useFallback, minLength,
+ U_SENTINEL, s, length+1,
+ (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
+ pErrorCode);
+- } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
+- UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
++ } else if((useFallback ?
++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :
++ ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
++ UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&
+ UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
+ ) {
+ sa->addString(sa->set, s, length+1);
+@@ -1004,6 +1006,7 @@
+ ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
+ const USetAdder *sa,
+ UConverterUnicodeSet which,
++ UConverterSetFilter filter,
+ UErrorCode *pErrorCode) {
+ const int32_t *cx;
+ const uint16_t *stage12, *stage3, *ps2, *ps3;
+@@ -1011,6 +1014,7 @@
+
+ uint32_t value;
+ int32_t st1, stage1Length, st2, st3, minLength;
++ UBool useFallback;
+
+ UChar s[UCNV_EXT_MAX_UCHARS];
+ UChar32 c;
+@@ -1027,12 +1031,20 @@
+
+ stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH];
+
++ useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
++
+ /* enumerate the from-Unicode trie table */
+ c=0; /* keep track of the current code point while enumerating */
+
+- if(sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) {
++ if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ||
++ filter==UCNV_SET_FILTER_DBCS_ONLY ||
++ filter==UCNV_SET_FILTER_SJIS ||
++ filter==UCNV_SET_FILTER_GR94DBCS
++ ) {
+ /* DBCS-only, ignore single-byte results */
+ minLength=2;
++ } else if(filter==UCNV_SET_FILTER_2022_CN) {
++ minLength=3;
+ } else {
+ minLength=1;
+ }
+@@ -1064,14 +1076,41 @@
+ length=0;
+ U16_APPEND_UNSAFE(s, length, c);
+ ucnv_extGetUnicodeSetString(
+- sharedData, cx, sa, which, minLength,
++ sharedData, cx, sa, useFallback, minLength,
+ c, s, length,
+ (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
+ pErrorCode);
+- } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
+- UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
++ } else if((useFallback ?
++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :
++ ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
++ UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&
+ UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
+ ) {
++ switch(filter) {
++ case UCNV_SET_FILTER_2022_CN:
++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) {
++ continue;
++ }
++ break;
++ case UCNV_SET_FILTER_SJIS:
++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (value=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) {
++ continue;
++ }
++ break;
++ case UCNV_SET_FILTER_GR94DBCS:
++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
++ (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value)) - 0xa1a1)<=(0xfefe - 0xa1a1) &&
++ (uint8_t)(value - 0xa1)<=(0xfe - 0xa1))) {
++ continue;
++ }
++ break;
++ default:
++ /*
++ * UCNV_SET_FILTER_NONE,
++ * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength
++ */
++ break;
++ }
+ sa->add(sa->set, c);
+ }
+ } while((++c&0xf)!=0);
+diff -ru icu.5797/source/common/ucnv_ext.h icu/source/common/ucnv_ext.h
+--- icu.5797/source/common/ucnv_ext.h 2009-06-02 14:45:31.000000000 +0100
++++ icu/source/common/ucnv_ext.h 2009-06-02 15:05:10.000000000 +0100
+@@ -382,10 +382,20 @@
+ UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
+ UErrorCode *pErrorCode);
+
++/*
++ * Add code points and strings to the set according to the extension mappings.
++ * Limitation on the UConverterSetFilter:
++ * The filters currently assume that they are used with 1:1 mappings.
++ * They only apply to single input code points, and then they pass through
++ * only mappings with single-charset-code results.
++ * For example, the Shift-JIS filter only works for 2-byte results and tests
++ * that those 2 bytes are in the JIS X 0208 range of Shift-JIS.
++ */
+ U_CFUNC void
+ ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
+ const USetAdder *sa,
+ UConverterUnicodeSet which,
++ UConverterSetFilter filter,
+ UErrorCode *pErrorCode);
+
+ /* toUnicode helpers -------------------------------------------------------- */
+diff -ru icu.5797/source/common/ucnvhz.c icu/source/common/ucnvhz.c
+--- icu.5797/source/common/ucnvhz.c 2009-06-02 14:45:31.000000000 +0100
++++ icu/source/common/ucnvhz.c 2009-06-02 15:05:10.000000000 +0100
+@@ -528,6 +528,7 @@
+ sa->add(sa->set, 0x7e);
+
+ /* add all of the code points that the sub-converter handles */
++ /* ucnv_MBCSGetFilteredUnicodeSetForUnicode(((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, sa, which, UCNV_SET_FILTER_GR94DBCS, pErrorCode); */
+ ((UConverterDataHZ*)cnv->extraInfo)->
+ gbConverter->sharedData->impl->
+ getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter,
+diff -ru icu.5797/source/common/ucnv_lmb.c icu/source/common/ucnv_lmb.c
+--- icu.5797/source/common/ucnv_lmb.c 2009-06-02 14:45:31.000000000 +0100
++++ icu/source/common/ucnv_lmb.c 2009-06-02 15:09:13.000000000 +0100
+@@ -536,7 +536,7 @@
+ NULL,\
+ NULL,\
+ _LMBCSSafeClone,\
+- _LMBCSGetUnicodeSet\
++ ucnv_getCompleteUnicodeSet\
+ };\
+ static const UConverterStaticData _LMBCSStaticData##n={\
+ sizeof(UConverterStaticData),\
+@@ -662,15 +662,14 @@
+ return &newLMBCS->cnv;
+ }
+
+-static void
+-_LMBCSGetUnicodeSet(const UConverter *cnv,
+- const USetAdder *sa,
+- UConverterUnicodeSet which,
+- UErrorCode *pErrorCode) {
+- /* all but U+F6xx, see LMBCS explanation above (search for F6xx) */
+- sa->addRange(sa->set, 0, 0xf5ff);
+- sa->addRange(sa->set, 0xf700, 0x10ffff);
+-}
++/*
++ * There used to be a _LMBCSGetUnicodeSet() function here (up to svn revision 20117)
++ * which added all code points except for U+F6xx
++ * because those cannot be represented in the Unicode group.
++ * However, it turns out that windows-950 has roundtrips for all of U+F6xx
++ * which means that LMBCS can convert all Unicode code points after all.
++ * We now simply use ucnv_getCompleteUnicodeSet().
++ */
+
+ /*
+ Here's the basic helper function that we use when converting from
+diff -ru icu.5797/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c
+--- icu.5797/source/common/ucnvmbcs.c 2009-06-02 14:45:31.000000000 +0100
++++ icu/source/common/ucnvmbcs.c 2009-06-02 15:12:40.000000000 +0100
+@@ -463,9 +463,23 @@
+
+ if(mbcsTable->outputType==MBCS_OUTPUT_1) {
+ const uint16_t *stage2, *stage3, *results;
++ uint16_t minValue;
+
+ results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
+
++ /*
++ * Set a threshold variable for selecting which mappings to use.
++ * See ucnv_MBCSSingleFromBMPWithOffsets() and
++ * MBCS_SINGLE_RESULT_FROM_U() for details.
++ */
++ if(which==UCNV_ROUNDTRIP_SET) {
++ /* use only roundtrips */
++ minValue=0xf00;
++ } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
++ /* use all roundtrip and fallback results */
++ minValue=0x800;
++ }
++
+ for(st1=0; st1<maxStage1; ++st1) {
+ st2=table[st1];
+ if(st2>maxStage1) {
+@@ -475,15 +489,8 @@
+ /* read the stage 3 block */
+ stage3=results+st3;
+
+- /*
+- * Add code points for which the roundtrip flag is set.
+- * Once we get a set for fallback mappings, we have to use
+- * a threshold variable with a value of 0x800.
+- * See ucnv_MBCSSingleFromBMPWithOffsets() and
+- * MBCS_SINGLE_RESULT_FROM_U() for details.
+- */
+ do {
+- if(*stage3++>=0xf00) {
++ if(*stage3++>=minValue) {
+ sa->add(sa->set, c);
+ }
+ } while((++c&0xf)!=0);
+@@ -500,9 +507,12 @@
+ const uint8_t *stage3, *bytes;
+ uint32_t st3Multiplier;
+ uint32_t value;
++ UBool useFallback;
+
+ bytes=mbcsTable->fromUnicodeBytes;
+
++ useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
++
+ switch(mbcsTable->outputType) {
+ case MBCS_OUTPUT_3:
+ case MBCS_OUTPUT_4_EUC:
+@@ -529,9 +539,8 @@
+ st3>>=16;
+
+ /*
+- * Add code points for which the roundtrip flag is set.
+- * Once we get a set for fallback mappings, we have to check
+- * non-roundtrip stage 3 results for whether they are 0.
++ * Add code points for which the roundtrip flag is set,
++ * or which map to non-zero bytes if we use fallbacks.
+ * See ucnv_MBCSFromUnicodeWithOffsets() for details.
+ */
+ switch(filter) {
+@@ -539,6 +548,23 @@
+ do {
+ if(st3&1) {
+ sa->add(sa->set, c);
++ stage3+=st3Multiplier;
++ } else if(useFallback) {
++ uint8_t b=0;
++ switch(st3Multiplier) {
++ case 4:
++ b|=*stage3++;
++ case 3:
++ b|=*stage3++;
++ case 2:
++ b|=stage3[0]|stage3[1];
++ stage3+=2;
++ default:
++ break;
++ }
++ if(b!=0) {
++ sa->add(sa->set, c);
++ }
+ }
+ st3>>=1;
+ } while((++c&0xf)!=0);
+@@ -546,7 +572,7 @@
+ case UCNV_SET_FILTER_DBCS_ONLY:
+ /* Ignore single-byte results (<0x100). */
+ do {
+- if((st3&1)!=0 && *((const uint16_t *)stage3)>=0x100) {
++ if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) {
+ sa->add(sa->set, c);
+ }
+ st3>>=1;
+@@ -556,7 +582,7 @@
+ case UCNV_SET_FILTER_2022_CN:
+ /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
+ do {
+- if((st3&1)!=0 && ((value=*stage3)==0x81 || value==0x82)) {
++ if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) {
+ sa->add(sa->set, c);
+ }
+ st3>>=1;
+@@ -566,7 +592,20 @@
+ case UCNV_SET_FILTER_SJIS:
+ /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
+ do {
+- if((st3&1)!=0 && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
++ if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
++ sa->add(sa->set, c);
++ }
++ st3>>=1;
++ stage3+=2; /* +=st3Multiplier */
++ } while((++c&0xf)!=0);
++ break;
++ case UCNV_SET_FILTER_GR94DBCS:
++ /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */
++ do {
++ if( ((st3&1)!=0 || useFallback) &&
++ (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) &&
++ (uint8_t)(value - 0xa1)<=(0xfe - 0xa1)
++ ) {
+ sa->add(sa->set, c);
+ }
+ st3>>=1;
+@@ -587,7 +626,7 @@
+ }
+ }
+
+- ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode);
++ ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);
+ }
+
+ U_CFUNC void
+diff -ru icu.5797/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h
+--- icu.5797/source/common/ucnvmbcs.h 2009-06-02 14:45:31.000000000 +0100
++++ icu/source/common/ucnvmbcs.h 2009-06-02 15:05:10.000000000 +0100
+@@ -399,6 +399,7 @@
+ UCNV_SET_FILTER_DBCS_ONLY,
+ UCNV_SET_FILTER_2022_CN,
+ UCNV_SET_FILTER_SJIS,
++ UCNV_SET_FILTER_GR94DBCS,
+ UCNV_SET_FILTER_COUNT
+ } UConverterSetFilter;
+
+diff -ru icu.5797/source/common/ucnv_set.c icu/source/common/ucnv_set.c
+--- icu.5797/source/common/ucnv_set.c 2009-06-02 14:45:31.000000000 +0100
++++ icu/source/common/ucnv_set.c 2009-06-02 15:05:10.000000000 +0100
+@@ -1,7 +1,7 @@
+ /*
+ *******************************************************************************
+ *
+-* Copyright (C) 2003-2005, International Business Machines
++* Copyright (C) 2003-2007, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *
+ *******************************************************************************
+@@ -52,7 +52,8 @@
+ uset_add,
+ uset_addRange,
+ uset_addString,
+- uset_remove
++ uset_remove,
++ uset_removeRange
+ };
+ sa.set=setFillIn;
+
+diff -ru icu.5797/source/common/unicode/ucnv.h icu/source/common/unicode/ucnv.h
+--- icu.5797/source/common/unicode/ucnv.h 2009-06-02 14:45:30.000000000 +0100
++++ icu/source/common/unicode/ucnv.h 2009-06-02 15:05:10.000000000 +0100
+@@ -870,6 +870,8 @@
+ typedef enum UConverterUnicodeSet {
+ /** Select the set of roundtrippable Unicode code points. @stable ICU 2.6 */
+ UCNV_ROUNDTRIP_SET,
++ /** Select the set of Unicode code points with roundtrip or fallback mappings. @draft ICU 4.0 */
++ UCNV_ROUNDTRIP_AND_FALLBACK_SET,
+ /** Number of UConverterUnicodeSet selectors. @stable ICU 2.6 */
+ UCNV_SET_COUNT
+ } UConverterUnicodeSet;
+@@ -878,11 +880,16 @@
+ /**
+ * Returns the set of Unicode code points that can be converted by an ICU converter.
+ *
+- * The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET):
++ * Returns one of several kinds of set:
++ *
++ * 1. UCNV_ROUNDTRIP_SET
++ *
+ * The set of all Unicode code points that can be roundtrip-converted
+- * (converted without any data loss) with the converter.
++ * (converted without any data loss) with the converter (ucnv_fromUnicode()).
+ * This set will not include code points that have fallback mappings
+ * or are only the result of reverse fallback mappings.
++ * This set will also not include PUA code points with fallbacks, although
++ * ucnv_fromUnicode() will always use those mappings despite ucnv_setFallback().
+ * See UTR #22 "Character Mapping Markup Language"
+ * at http://www.unicode.org/reports/tr22/
+ *
+@@ -893,6 +900,12 @@
+ * by comparing its roundtrip set with the set of ExemplarCharacters from
+ * ICU's locale data or other sources
+ *
++ * 2. UCNV_ROUNDTRIP_AND_FALLBACK_SET
++ *
++ * The set of all Unicode code points that can be converted with the converter (ucnv_fromUnicode())
++ * when fallbacks are turned on (see ucnv_setFallback()).
++ * This set includes all code points with roundtrips and fallbacks (but not reverse fallbacks).
++ *
+ * In the future, there may be more UConverterUnicodeSet choices to select
+ * sets with different properties.
+ *
+diff -ru icu.5797/source/common/uset_imp.h icu/source/common/uset_imp.h
+--- icu.5797/source/common/uset_imp.h 2009-06-02 14:45:31.000000000 +0100
++++ icu/source/common/uset_imp.h 2009-06-02 15:05:10.000000000 +0100
+@@ -36,6 +36,9 @@
+ typedef void U_CALLCONV
+ USetRemove(USet *set, UChar32 c);
+
++typedef void U_CALLCONV
++USetRemoveRange(USet *set, UChar32 start, UChar32 end);
++
+ /**
+ * Interface for adding items to a USet, to keep low-level code from
+ * statically depending on the USet implementation.
+@@ -47,6 +50,7 @@
+ USetAddRange *addRange;
+ USetAddString *addString;
+ USetRemove *remove;
++ USetRemoveRange *removeRange;
+ };
+ typedef struct USetAdder USetAdder;
+
+diff -ru icu.5797/source/test/intltest/convtest.cpp icu/source/test/intltest/convtest.cpp
+--- icu.5797/source/test/intltest/convtest.cpp 2009-06-02 14:45:18.000000000 +0100
++++ icu/source/test/intltest/convtest.cpp 2009-06-02 15:09:31.000000000 +0100
+@@ -59,6 +59,7 @@
+ case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break;
+ case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break;
+ case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break;
++ case 3: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break;
+ default: name=""; break; //needed to end loop
+ }
+ }
+@@ -454,6 +455,183 @@
+ }
+ }
+
++U_CDECL_BEGIN
++static void U_CALLCONV
++getUnicodeSetCallback(const void *context,
++ UConverterFromUnicodeArgs *fromUArgs,
++ const UChar* codeUnits,
++ int32_t length,
++ UChar32 codePoint,
++ UConverterCallbackReason reason,
++ UErrorCode *pErrorCode) {
++ if(reason<=UCNV_IRREGULAR) {
++ ((UnicodeSet *)context)->remove(codePoint); // the converter cannot convert this code point
++ *pErrorCode=U_ZERO_ERROR; // skip
++ } // else ignore the reset, close and clone calls.
++}
++U_CDECL_END
++
++// Compare ucnv_getUnicodeSet() with the set of characters that can be converted.
++void
++ConversionTest::TestGetUnicodeSet2() {
++ // Build a string with all code points.
++ UChar32 cpLimit;
++ int32_t s0Length;
++ if(quick) {
++ cpLimit=s0Length=0x10000; // BMP only
++ } else {
++ cpLimit=0x110000;
++ s0Length=0x10000+0x200000; // BMP + surrogate pairs
++ }
++ UChar *s0=new UChar[s0Length];
++ if(s0==NULL) {
++ return;
++ }
++ UChar *s=s0;
++ UChar32 c;
++ UChar c2;
++ // low BMP
++ for(c=0; c<=0xd7ff; ++c) {
++ *s++=(UChar)c;
++ }
++ // trail surrogates
++ for(c=0xdc00; c<=0xdfff; ++c) {
++ *s++=(UChar)c;
++ }
++ // lead surrogates
++ // (after trails so that there is not even one surrogate pair in between)
++ for(c=0xd800; c<=0xdbff; ++c) {
++ *s++=(UChar)c;
++ }
++ // high BMP
++ for(c=0xe000; c<=0xffff; ++c) {
++ *s++=(UChar)c;
++ }
++ // supplementary code points = surrogate pairs
++ if(cpLimit==0x110000) {
++ for(c=0xd800; c<=0xdbff; ++c) {
++ for(c2=0xdc00; c2<=0xdfff; ++c2) {
++ *s++=(UChar)c;
++ *s++=c2;
++ }
++ }
++ }
++
++ static const char *const cnvNames[]={
++ "UTF-8",
++ "UTF-7",
++ "UTF-16",
++ "US-ASCII",
++ "ISO-8859-1",
++ "windows-1252",
++ "Shift-JIS",
++ "ibm-1390", // EBCDIC_STATEFUL table
++ "ibm-16684", // DBCS-only extension table based on EBCDIC_STATEFUL table
++ // "HZ", TODO(markus): known bug, the set incorrectly contains [\u02CA\u02CB\u02D9\u2010\u2013\u2015...]
++ "ISO-2022-JP",
++ "JIS7",
++ "ISO-2022-CN",
++ "ISO-2022-CN-EXT",
++ "LMBCS"
++ };
++ char buffer[1024];
++ int32_t i;
++ for(i=0; i<LENGTHOF(cnvNames); ++i) {
++ UErrorCode errorCode=U_ZERO_ERROR;
++ UConverter *cnv=cnv_open(cnvNames[i], errorCode);
++ if(U_FAILURE(errorCode)) {
++ errln("failed to open converter %s - %s", cnvNames[i], u_errorName(errorCode));
++ continue;
++ }
++ UnicodeSet expected;
++ ucnv_setFromUCallBack(cnv, getUnicodeSetCallback, &expected, NULL, NULL, &errorCode);
++ if(U_FAILURE(errorCode)) {
++ errln("failed to set the callback on converter %s - %s", cnvNames[i], u_errorName(errorCode));
++ ucnv_close(cnv);
++ continue;
++ }
++ UConverterUnicodeSet which;
++ for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUnicodeSet)((int)which+1)) {
++ if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
++ ucnv_setFallback(cnv, TRUE);
++ }
++ expected.add(0, cpLimit-1);
++ s=s0;
++ UBool flush;
++ do {
++ char *t=buffer;
++ flush=(UBool)(s==s0+s0Length);
++ ucnv_fromUnicode(cnv, &t, buffer+sizeof(buffer), (const UChar **)&s, s0+s0Length, NULL, flush, &errorCode);
++ if(U_FAILURE(errorCode)) {
++ if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
++ errorCode=U_ZERO_ERROR;
++ continue;
++ } else {
++ break; // unexpected error, should not occur
++ }
++ }
++ } while(!flush);
++ UnicodeSet set;
++ ucnv_getUnicodeSet(cnv, (USet *)&set, which, &errorCode);
++ if(cpLimit<0x110000) {
++ set.remove(cpLimit, 0x10ffff);
++ }
++ if(which==UCNV_ROUNDTRIP_SET) {
++ // ignore PUA code points because they will be converted even if they
++ // are fallbacks and when other fallbacks are turned off,
++ // but ucnv_getUnicodeSet(UCNV_ROUNDTRIP_SET) delivers true roundtrips
++ expected.remove(0xe000, 0xf8ff);
++ expected.remove(0xf0000, 0xffffd);
++ expected.remove(0x100000, 0x10fffd);
++ set.remove(0xe000, 0xf8ff);
++ set.remove(0xf0000, 0xffffd);
++ set.remove(0x100000, 0x10fffd);
++ }
++ if(set!=expected) {
++ // First try to see if we have different sets because ucnv_getUnicodeSet()
++ // added strings: The above conversion method does not tell us what strings might be convertible.
++ // Remove strings from the set and compare again.
++ // Unfortunately, there are no good, direct set methods for finding out whether there are strings
++ // in the set, nor for enumerating or removing just them.
++ // Intersect all code points with the set. The intersection will not contain strings.
++ UnicodeSet temp(0, 0x10ffff);
++ temp.retainAll(set);
++ set=temp;
++ }
++ if(set!=expected) {
++ UnicodeSet diffSet;
++ UnicodeString out;
++
++ // are there items that must be in the set but are not?
++ (diffSet=expected).removeAll(set);
++ if(!diffSet.isEmpty()) {
++ diffSet.toPattern(out, TRUE);
++ if(out.length()>100) {
++ out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));
++ }
++ errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d",
++ cnvNames[i], which);
++ errln(out);
++ }
++
++ // are there items that must not be in the set but are?
++ (diffSet=set).removeAll(expected);
++ if(!diffSet.isEmpty()) {
++ diffSet.toPattern(out, TRUE);
++ if(out.length()>100) {
++ out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));
++ }
++ errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - which set: %d",
++ cnvNames[i], which);
++ errln(out);
++ }
++ }
++ }
++ }
++
++ delete [] s0;
++}
++
+ // open testdata or ICU data converter ------------------------------------- ***
+
+ UConverter *
+diff -ru icu.5797/source/test/intltest/convtest.h icu/source/test/intltest/convtest.h
+--- icu.5797/source/test/intltest/convtest.h 2009-06-02 14:45:18.000000000 +0100
++++ icu/source/test/intltest/convtest.h 2009-06-02 15:05:10.000000000 +0100
+@@ -64,6 +64,7 @@
+ void TestToUnicode();
+ void TestFromUnicode();
+ void TestGetUnicodeSet();
++ void TestGetUnicodeSet2();
+
+ private:
+ UBool
+diff -ru icu.5797/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt
+--- icu.5797/source/test/testdata/conversion.txt 2009-06-02 14:45:18.000000000 +0100
++++ icu/source/test/testdata/conversion.txt 2009-06-02 15:25:04.000000000 +0100
+@@ -1198,16 +1198,29 @@
+ // versions of ISO-2022-JP
+ {
+ "ISO-2022-JP",
+- "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e00\u4e01\uffe5]",
+- "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\ufa2d\uffe6-\U0010ffff]",
++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2015\u203e\u4e00\u4e01\uffe5]",
++ "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u2014\u301c\u4e02\u4e27-\u4e29\u4fe0\u663b\u9eb5\ufa0e-\ufa2d\uff61-\uff9f\uffe4\uffe6-\U0010ffff]",
+ :int{0}
+ }
+ {
+ "ISO-2022-JP-2",
+- "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]",
+- "[\x0e\x0f\x1b\uffe7-\U0010ffff]",
++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uffe6]",
++ "[\x0e\x0f\x1b\uff61-\uff9f\uffe4\uffe7-\U0010ffff]",
+ :int{0}
+ }
++ {
++ "JIS7",
++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uff61-\uff9f\uffe6]",
++ "[\x0e\x0f\x1b\uffe4\uffe7-\U0010ffff]",
++ :int{0}
++ }
++ // with fallbacks
++ {
++ "ISO-2022-JP",
++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2014\u2015\u203e\u301c\u4e00\u4e01\u4fe0\u9eb5\uff61-\uff9f\uffe5]",
++ "[\x0e\x0f\x1b\xa6\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\u663b\ufa0e-\ufa2d\uffe4\uffe6-\U0010ffff]",
++ :int{1}
++ }
+
+ // versions of ISO-2022-CN
+ {
+@@ -1223,6 +1236,14 @@
+ :int{0}
+ }
+
++ // LMBCS
++ {
++ "LMBCS",
++ "[\x00-\U0010ffff]",
++ "[]",
++ :int{0}
++ }
++
+ // DBCS-only
+ {
+ "ibm-971",
diff --git a/icu.icu6002.backport.patch b/icu.icu6002.backport.patch
new file mode 100644
index 0000000..51f0d75
--- /dev/null
+++ b/icu.icu6002.backport.patch
@@ -0,0 +1,397 @@
+diff -ru icu.6001/source/common/ucnv_ext.c icu/source/common/ucnv_ext.c
+--- icu.6001/source/common/ucnv_ext.c 2009-06-02 15:29:01.000000000 +0100
++++ icu/source/common/ucnv_ext.c 2009-06-02 15:29:18.000000000 +0100
+@@ -1036,15 +1036,13 @@
+ /* enumerate the from-Unicode trie table */
+ c=0; /* keep track of the current code point while enumerating */
+
+- if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ||
+- filter==UCNV_SET_FILTER_DBCS_ONLY ||
+- filter==UCNV_SET_FILTER_SJIS ||
+- filter==UCNV_SET_FILTER_GR94DBCS
++ if(filter==UCNV_SET_FILTER_2022_CN) {
++ minLength=3;
++ } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ||
++ filter!=UCNV_SET_FILTER_NONE
+ ) {
+ /* DBCS-only, ignore single-byte results */
+ minLength=2;
+- } else if(filter==UCNV_SET_FILTER_2022_CN) {
+- minLength=3;
+ } else {
+ minLength=1;
+ }
+@@ -1104,6 +1102,13 @@
+ continue;
+ }
+ break;
++ case UCNV_SET_FILTER_HZ:
++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
++ (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
++ (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {
++ continue;
++ }
++ break;
+ default:
+ /*
+ * UCNV_SET_FILTER_NONE,
+diff -ru icu.6001/source/common/ucnvhz.c icu/source/common/ucnvhz.c
+--- icu.6001/source/common/ucnvhz.c 2009-06-02 15:29:01.000000000 +0100
++++ icu/source/common/ucnvhz.c 2009-06-02 15:29:15.000000000 +0100
+@@ -72,7 +72,7 @@
+ cnv->extraInfo = uprv_malloc(sizeof(UConverterDataHZ));
+ if(cnv->extraInfo != NULL){
+ uprv_memset(cnv->extraInfo, 0, sizeof(UConverterDataHZ));
+- ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("ibm-1386",errorCode);
++ ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("GBK",errorCode);
+ }
+ else {
+ *errorCode = U_MEMORY_ALLOCATION_ERROR;
+@@ -141,7 +141,7 @@
+ UChar *myTarget = args->target;
+ const char *mySourceLimit = args->sourceLimit;
+ UChar32 targetUniChar = 0x0000;
+- UChar mySourceChar = 0x0000;
++ int32_t mySourceChar = 0x0000;
+ UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo);
+ tempBuf[0]=0;
+ tempBuf[1]=0;
+@@ -156,90 +156,71 @@
+
+ mySourceChar= (unsigned char) *mySource++;
+
+- switch(mySourceChar){
++ if(args->converter->mode == UCNV_TILDE) {
++ /* second byte after ~ */
++ args->converter->mode=0;
++ switch(mySourceChar) {
+ case 0x0A:
+- if(args->converter->mode ==UCNV_TILDE){
+- args->converter->mode=0;
+-
+- }
+- *(myTarget++)=(UChar)mySourceChar;
++ /* no output for ~\n (line-continuation marker) */
+ continue;
+-
+ case UCNV_TILDE:
+- if(args->converter->mode ==UCNV_TILDE){
+- *(myTarget++)=(UChar)mySourceChar;
+- args->converter->mode=0;
+- continue;
+-
++ if(args->offsets) {
++ args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2);
+ }
+- else if(args->converter->toUnicodeStatus !=0){
+- args->converter->mode=0;
+- break;
+- }
+- else{
+- args->converter->mode = UCNV_TILDE;
+- continue;
+- }
+-
+-
++ *(myTarget++)=(UChar)mySourceChar;
++ continue;
+ case UCNV_OPEN_BRACE:
+- if(args->converter->mode == UCNV_TILDE){
+- args->converter->mode=0;
+- myData->isStateDBCS = TRUE;
+- continue;
+- }
+- else{
+- break;
+- }
+-
+-
++ myData->isStateDBCS = TRUE;
++ continue;
+ case UCNV_CLOSE_BRACE:
+- if(args->converter->mode == UCNV_TILDE){
+- args->converter->mode=0;
+- myData->isStateDBCS = FALSE;
+- continue;
+- }
+- else{
+- break;
+- }
+-
++ myData->isStateDBCS = FALSE;
++ continue;
+ default:
+ /* if the first byte is equal to TILDE and the trail byte
+ * is not a valid byte then it is an error condition
+ */
+- if(args->converter->mode == UCNV_TILDE){
+- args->converter->mode=0;
+- mySourceChar= (UChar)(((UCNV_TILDE+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));
+- goto SAVE_STATE;
+- }
+-
++ mySourceChar = 0x7e00 | mySourceChar;
++ targetUniChar = 0xffff;
+ break;
+-
+- }
+-
+- if(myData->isStateDBCS){
++ }
++ } else if(myData->isStateDBCS) {
+ if(args->converter->toUnicodeStatus == 0x00){
+- args->converter->toUnicodeStatus = (UChar) mySourceChar;
++ /* lead byte */
++ if(mySourceChar == UCNV_TILDE) {
++ args->converter->mode = UCNV_TILDE;
++ } else {
++ /* add another bit to distinguish a 0 byte from not having seen a lead byte */
++ args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100);
++ }
+ continue;
+ }
+ else{
+- tempBuf[0] = (char) (args->converter->toUnicodeStatus+0x80) ;
+- tempBuf[1] = (char) (mySourceChar+0x80);
+- mySourceChar= (UChar)(((args->converter->toUnicodeStatus+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));
++ /* trail byte */
++ uint32_t leadByte = args->converter->toUnicodeStatus & 0xff;
++ if( (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21) &&
++ (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21)
++ ) {
++ tempBuf[0] = (char) (leadByte+0x80) ;
++ tempBuf[1] = (char) (mySourceChar+0x80);
++ targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
++ tempBuf, 2, args->converter->useFallback);
++ } else {
++ targetUniChar = 0xffff;
++ }
++ /* add another bit so that the code below writes 2 bytes in case of error */
++ mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
+ args->converter->toUnicodeStatus =0x00;
+- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
+- tempBuf, 2, args->converter->useFallback);
+ }
+ }
+ else{
+- if(args->converter->fromUnicodeStatus == 0x00){
+- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
+- mySource - 1, 1, args->converter->useFallback);
+- }
+- else{
+- goto SAVE_STATE;
++ if(mySourceChar == UCNV_TILDE) {
++ args->converter->mode = UCNV_TILDE;
++ continue;
++ } else if(mySourceChar <= 0x7f) {
++ targetUniChar = (UChar)mySourceChar; /* ASCII */
++ } else {
++ targetUniChar = 0xffff;
+ }
+-
+ }
+ if(targetUniChar < 0xfffe){
+ if(args->offsets) {
+@@ -248,26 +229,17 @@
+
+ *(myTarget++)=(UChar)targetUniChar;
+ }
+- else if(targetUniChar>=0xfffe){
+-SAVE_STATE:
++ else /* targetUniChar>=0xfffe */ {
+ if(targetUniChar == 0xfffe){
+ *err = U_INVALID_CHAR_FOUND;
+ }
+ else{
+ *err = U_ILLEGAL_CHAR_FOUND;
+ }
+- if(myData->isStateDBCS){
+- /* this should never occur since isStateDBCS is set to true
+- * only after tempBuf[0] and tempBuf[1]
+- * are set to the input .. just to please BEAM
+- */
+- if(tempBuf[0]==0 || tempBuf[1]==0){
+- *err = U_INTERNAL_PROGRAM_ERROR;
+- }else{
+- args->converter->toUBytes[0] = (uint8_t)(tempBuf[0]-0x80);
+- args->converter->toUBytes[1] = (uint8_t)(tempBuf[1]-0x80);
+- args->converter->toULength=2;
+- }
++ if(mySourceChar > 0xff){
++ args->converter->toUBytes[0] = (uint8_t)(mySourceChar >> 8);
++ args->converter->toUBytes[1] = (uint8_t)mySourceChar;
++ args->converter->toULength=2;
+ }
+ else{
+ args->converter->toUBytes[0] = (uint8_t)mySourceChar;
+@@ -328,16 +300,21 @@
+ escSeq = TILDE_ESCAPE;
+ CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
+ continue;
+- }
+- else{
++ } else if(mySourceChar <= 0x7f) {
++ length = 1;
++ targetUniChar = mySourceChar;
++ } else {
+ length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->sharedData,
+ mySourceChar,&targetUniChar,args->converter->useFallback);
+-
+- }
+- /* only DBCS or SBCS characters are expected*/
+- /* DB haracters with high bit set to 1 are expected */
+- if(length > 2 || length==0 ||(((targetUniChar & 0x8080) != 0x8080)&& length==2)){
+- targetUniChar= missingCharMarker;
++ /* we can only use lead bytes 21..7D and trail bytes 21..7E */
++ if( length == 2 &&
++ (uint16_t)(targetUniChar - 0xa1a1) <= (0xfdfe - 0xa1a1) &&
++ (uint8_t)(targetUniChar - 0xa1) <= (0xfe - 0xa1)
++ ) {
++ targetUniChar -= 0x8080;
++ } else {
++ targetUniChar = missingCharMarker;
++ }
+ }
+ if (targetUniChar != missingCharMarker){
+ myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool)(targetUniChar>0x00FF);
+@@ -360,22 +337,22 @@
+
+ if(isTargetUCharDBCS){
+ if( myTargetIndex <targetLength){
+- myTarget[myTargetIndex++] =(char) ((targetUniChar >> 8) -0x80);
++ myTarget[myTargetIndex++] =(char) (targetUniChar >> 8);
+ if(offsets){
+ *(offsets++) = mySourceIndex-1;
+ }
+ if(myTargetIndex < targetLength){
+- myTarget[myTargetIndex++] =(char) ((targetUniChar & 0x00FF) -0x80);
++ myTarget[myTargetIndex++] =(char) targetUniChar;
+ if(offsets){
+ *(offsets++) = mySourceIndex-1;
+ }
+ }else{
+- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
++ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
+ *err = U_BUFFER_OVERFLOW_ERROR;
+ }
+ }else{
+- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) ((targetUniChar >> 8) -0x80);
+- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
++ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) (targetUniChar >> 8);
++ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
+ *err = U_BUFFER_OVERFLOW_ERROR;
+ }
+
+@@ -524,15 +501,14 @@
+ const USetAdder *sa,
+ UConverterUnicodeSet which,
+ UErrorCode *pErrorCode) {
+- /* the tilde '~' is hardcoded in the converter */
+- sa->add(sa->set, 0x7e);
++ /* HZ converts all of ASCII */
++ sa->addRange(sa->set, 0, 0x7f);
+
+ /* add all of the code points that the sub-converter handles */
+- /* ucnv_MBCSGetFilteredUnicodeSetForUnicode(((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, sa, which, UCNV_SET_FILTER_GR94DBCS, pErrorCode); */
+- ((UConverterDataHZ*)cnv->extraInfo)->
+- gbConverter->sharedData->impl->
+- getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter,
+- sa, which, pErrorCode);
++ ucnv_MBCSGetFilteredUnicodeSetForUnicode(
++ ((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData,
++ sa, which, UCNV_SET_FILTER_HZ,
++ pErrorCode);
+ }
+
+ static const UConverterImpl _HZImpl={
+diff -ru icu.6001/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c
+--- icu.6001/source/common/ucnvmbcs.c 2009-06-02 15:29:01.000000000 +0100
++++ icu/source/common/ucnvmbcs.c 2009-06-02 15:35:01.000000000 +0100
+@@ -612,6 +612,19 @@
+ stage3+=2; /* +=st3Multiplier */
+ } while((++c&0xf)!=0);
+ break;
++ case UCNV_SET_FILTER_HZ:
++ /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */
++ do {
++ if( ((st3&1)!=0 || useFallback) &&
++ (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
++ (uint8_t)(value - 0xa1)<=(0xfe - 0xa1)
++ ) {
++ sa->add(sa->set, c);
++ }
++ st3>>=1;
++ stage3+=2; /* +=st3Multiplier */
++ } while((++c&0xf)!=0);
++ break;
+ default:
+ *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
+ return;
+diff -ru icu.6001/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h
+--- icu.6001/source/common/ucnvmbcs.h 2009-06-02 15:29:01.000000000 +0100
++++ icu/source/common/ucnvmbcs.h 2009-06-02 15:29:15.000000000 +0100
+@@ -400,6 +400,7 @@
+ UCNV_SET_FILTER_2022_CN,
+ UCNV_SET_FILTER_SJIS,
+ UCNV_SET_FILTER_GR94DBCS,
++ UCNV_SET_FILTER_HZ,
+ UCNV_SET_FILTER_COUNT
+ } UConverterSetFilter;
+
+diff -ru icu.6001/source/test/cintltst/ncnvtst.c icu/source/test/cintltst/ncnvtst.c
+--- icu.6001/source/test/cintltst/ncnvtst.c 2009-06-02 15:28:46.000000000 +0100
++++ icu/source/test/cintltst/ncnvtst.c 2009-06-02 15:29:15.000000000 +0100
+@@ -1928,7 +1928,7 @@
+ #if !UCONFIG_NO_LEGACY_CONVERSION
+ { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff },
+ { "windows-1251", 0, 0x7f, 0x410, 0x44f, 0x3000, 0xd7ff },
+- { "HZ", 0x410, 0x44f, 0x4e00, 0x4eff, 0xac00, 0xd7ff },
++ /* HZ test case fixed and moved to intltest's conversion.txt, ticket #6002 */
+ { "shift-jis", 0x3041, 0x3093, 0x30a1, 0x30f3, 0x900, 0x1cff }
+ #else
+ { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff }
+diff -ru icu.6001/source/test/intltest/convtest.cpp icu/source/test/intltest/convtest.cpp
+--- icu.6001/source/test/intltest/convtest.cpp 2009-06-02 15:28:46.000000000 +0100
++++ icu/source/test/intltest/convtest.cpp 2009-06-02 15:29:15.000000000 +0100
+@@ -527,7 +527,7 @@
+ "Shift-JIS",
+ "ibm-1390", // EBCDIC_STATEFUL table
+ "ibm-16684", // DBCS-only extension table based on EBCDIC_STATEFUL table
+- // "HZ", TODO(markus): known bug, the set incorrectly contains [\u02CA\u02CB\u02D9\u2010\u2013\u2015...]
++ "HZ",
+ "ISO-2022-JP",
+ "JIS7",
+ "ISO-2022-CN",
+diff -ru icu.6001/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt
+--- icu.6001/source/test/testdata/conversion.txt 2009-06-02 15:28:46.000000000 +0100
++++ icu/source/test/testdata/conversion.txt 2009-06-02 15:29:15.000000000 +0100
+@@ -48,6 +48,14 @@
+ toUnicode {
+ Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
+ Cases {
++ // test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e
++ {
++ "HZ",
++ :bin{ 7e7b21212120217e217f772100007e217e7d207e7e807e0a2b },
++ "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd ~\ufffd+",
++ :intvector{ 2,4,6,8,10,12,14,18,19,21,24 },
++ :int{1}, :int{1}, "", "?", :bin{""}
++ }
+ // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
+ // using the Shift-JIS table for JIS X 0208 (ticket #5797)
+ {
+@@ -1244,6 +1252,14 @@
+ :int{0}
+ }
+
++ // HZ
++ {
++ "HZ",
++ "[\u0410-\u044f\u4e00\u4e01\u4e03]",
++ "[\u4e02\u4e04-\u4e06\uac00-\ud7ff]",
++ :int{0}
++ }
++
+ // DBCS-only
+ {
+ "ibm-971",
diff --git a/icu.icu6175.emptysegments.patch b/icu.icu6175.emptysegments.patch
new file mode 100644
index 0000000..bb40bd5
--- /dev/null
+++ b/icu.icu6175.emptysegments.patch
@@ -0,0 +1,535 @@
+diff -ru icu.6002/source/common/ucnv2022.c icu/source/common/ucnv2022.c
+--- icu.6002/source/common/ucnv2022.c 2009-06-02 15:38:08.000000000 +0100
++++ icu/source/common/ucnv2022.c 2009-06-02 15:40:20.000000000 +0100
+@@ -201,6 +201,7 @@
+ #ifdef U_ENABLE_GENERIC_ISO_2022
+ UBool isFirstBuffer;
+ #endif
++ UBool isEmptySegment;
+ char name[30];
+ char locale[3];
+ }UConverterDataISO2022;
+@@ -609,6 +610,7 @@
+ if(choice<=UCNV_RESET_TO_UNICODE) {
+ uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
+ myConverterData->key = 0;
++ myConverterData->isEmptySegment = FALSE;
+ }
+ if(choice!=UCNV_RESET_TO_UNICODE) {
+ uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
+@@ -814,6 +816,7 @@
+ if(chosenConverterName == NULL) {
+ /* SS2 or SS3 */
+ *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
++ _this->toUCallbackReason = UCNV_UNASSIGNED;
+ return;
+ }
+
+@@ -935,6 +938,8 @@
+ }
+ if(U_SUCCESS(*err)) {
+ _this->toULength = 0;
++ } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
++ _this->toUCallbackReason = UCNV_UNASSIGNED;
+ }
+ }
+
+@@ -1986,6 +1991,7 @@
+ continue;
+ } else {
+ /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
++ myData->isEmptySegment = FALSE; /* reset this, we have a different error */
+ break;
+ }
+
+@@ -1997,21 +2003,39 @@
+ continue;
+ } else {
+ /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
++ myData->isEmptySegment = FALSE; /* reset this, we have a different error */
+ break;
+ }
+
+ case ESC_2022:
+ mySource--;
+ escape:
+- changeState_2022(args->converter,&(mySource),
+- mySourceLimit, ISO_2022_JP,err);
++ {
++ const char * mySourceBefore = mySource;
++ int8_t toULengthBefore = args->converter->toULength;
++
++ changeState_2022(args->converter,&(mySource),
++ mySourceLimit, ISO_2022_JP,err);
++
++ /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
++ if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
++ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
++ args->converter->toUCallbackReason = UCNV_IRREGULAR;
++ args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
++ }
++ }
+
+ /* invalid or illegal escape sequence */
+ if(U_FAILURE(*err)){
+ args->target = myTarget;
+ args->source = mySource;
++ myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
+ return;
+ }
++ /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
++ if(myData->key==0) {
++ myData->isEmptySegment = TRUE;
++ }
+ continue;
+
+ /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
+@@ -2028,6 +2052,7 @@
+ /* falls through */
+ default:
+ /* convert one or two bytes */
++ myData->isEmptySegment = FALSE;
+ cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
+ if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
+ !IS_JP_DBCS(cs)
+@@ -2524,15 +2549,27 @@
+
+ if(mySourceChar==UCNV_SI){
+ myData->toU2022State.g = 0;
++ if (myData->isEmptySegment) {
++ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
++ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
++ args->converter->toUCallbackReason = UCNV_IRREGULAR;
++ args->converter->toUBytes[0] = mySourceChar;
++ args->converter->toULength = 1;
++ args->target = myTarget;
++ args->source = mySource;
++ return;
++ }
+ /*consume the source */
+ continue;
+ }else if(mySourceChar==UCNV_SO){
+ myData->toU2022State.g = 1;
++ myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
+ /*consume the source */
+ continue;
+ }else if(mySourceChar==ESC_2022){
+ mySource--;
+ escape:
++ myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
+ changeState_2022(args->converter,&(mySource),
+ mySourceLimit, ISO_2022_KR, err);
+ if(U_FAILURE(*err)){
+@@ -2543,6 +2580,7 @@
+ continue;
+ }
+
++ myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
+ if(myData->toU2022State.g == 1) {
+ if(mySource < mySourceLimit) {
+ char trailByte;
+@@ -3075,27 +3113,52 @@
+ switch(mySourceChar){
+ case UCNV_SI:
+ pToU2022State->g=0;
++ if (myData->isEmptySegment) {
++ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
++ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
++ args->converter->toUCallbackReason = UCNV_IRREGULAR;
++ args->converter->toUBytes[0] = mySourceChar;
++ args->converter->toULength = 1;
++ args->target = myTarget;
++ args->source = mySource;
++ return;
++ }
+ continue;
+
+ case UCNV_SO:
+ if(pToU2022State->cs[1] != 0) {
+ pToU2022State->g=1;
++ myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
+ continue;
+ } else {
+ /* illegal to have SO before a matching designator */
++ myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errors */
+ break;
+ }
+
+ case ESC_2022:
+ mySource--;
+ escape:
+- changeState_2022(args->converter,&(mySource),
+- mySourceLimit, ISO_2022_CN,err);
++ {
++ const char * mySourceBefore = mySource;
++ int8_t toULengthBefore = args->converter->toULength;
++
++ changeState_2022(args->converter,&(mySource),
++ mySourceLimit, ISO_2022_CN,err);
++
++ /* After SO there must be at least one character before a designator (designator error handled separately) */
++ if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
++ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
++ args->converter->toUCallbackReason = UCNV_IRREGULAR;
++ args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
++ }
++ }
+
+ /* invalid or illegal escape sequence */
+ if(U_FAILURE(*err)){
+ args->target = myTarget;
+ args->source = mySource;
++ myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
+ return;
+ }
+ continue;
+@@ -3109,6 +3172,7 @@
+ /* falls through */
+ default:
+ /* convert one or two bytes */
++ myData->isEmptySegment = FALSE;
+ if(pToU2022State->g != 0) {
+ if(mySource < mySourceLimit) {
+ UConverterSharedData *cnv;
+diff -ru icu.6002/source/common/ucnv_bld.c icu/source/common/ucnv_bld.c
+--- icu.6002/source/common/ucnv_bld.c 2009-06-02 15:38:05.000000000 +0100
++++ icu/source/common/ucnv_bld.c 2009-06-02 15:38:31.000000000 +0100
+@@ -914,6 +914,7 @@
+ myUConverter->subCharLen = mySharedConverterData->staticData->subCharLen;
+ myUConverter->subChars = (uint8_t *)myUConverter->subUChars;
+ uprv_memcpy(myUConverter->subChars, mySharedConverterData->staticData->subChar, myUConverter->subCharLen);
++ myUConverter->toUCallbackReason = UCNV_ILLEGAL; /* default reason to invoke (*fromCharErrorBehaviour) */
+
+ if(mySharedConverterData->impl->open != NULL) {
+ mySharedConverterData->impl->open(myUConverter, realName, locale, options, err);
+diff -ru icu.6002/source/common/ucnv_bld.h icu/source/common/ucnv_bld.h
+--- icu.6002/source/common/ucnv_bld.h 2009-06-02 15:38:08.000000000 +0100
++++ icu/source/common/ucnv_bld.h 2009-06-02 15:38:31.000000000 +0100
+@@ -226,6 +226,9 @@
+ char preToU[UCNV_EXT_MAX_BYTES];
+ int8_t preFromULength, preToULength; /* negative: replay */
+ int8_t preToUFirstLength; /* length of first character */
++
++ /* new fields for ICU 4.0 */
++ UConverterCallbackReason toUCallbackReason; /* (*fromCharErrorBehaviour) reason, set when error is detected */
+ };
+
+ U_CDECL_END /* end of UConverter */
+diff -ru icu.6002/source/common/ucnv.c icu/source/common/ucnv.c
+--- icu.6002/source/common/ucnv.c 2009-06-02 15:38:05.000000000 +0100
++++ icu/source/common/ucnv.c 2009-06-02 15:38:31.000000000 +0100
+@@ -1473,11 +1473,14 @@
+ cnv->toULength=0;
+
+ /* call the callback function */
++ if(cnv->toUCallbackReason==UCNV_ILLEGAL && *err==U_INVALID_CHAR_FOUND) {
++ cnv->toUCallbackReason = UCNV_UNASSIGNED;
++ }
+ cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs,
+ cnv->invalidCharBuffer, errorInputLength,
+- (*err==U_INVALID_CHAR_FOUND || *err==U_UNSUPPORTED_ESCAPE_SEQUENCE) ?
+- UCNV_UNASSIGNED : UCNV_ILLEGAL,
++ cnv->toUCallbackReason,
+ err);
++ cnv->toUCallbackReason = UCNV_ILLEGAL; /* reset to default value */
+
+ /*
+ * loop back to the offset handling
+diff -ru icu.6002/source/common/ucnvhz.c icu/source/common/ucnvhz.c
+--- icu.6002/source/common/ucnvhz.c 2009-06-02 15:38:08.000000000 +0100
++++ icu/source/common/ucnvhz.c 2009-06-02 15:38:31.000000000 +0100
+@@ -59,6 +59,7 @@
+ UBool isEscapeAppended;
+ UBool isStateDBCS;
+ UBool isTargetUCharDBCS;
++ UBool isEmptySegment;
+ }UConverterDataHZ;
+
+
+@@ -98,6 +99,7 @@
+ cnv->mode=0;
+ if(cnv->extraInfo != NULL){
+ ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE;
++ ((UConverterDataHZ*)cnv->extraInfo)->isEmptySegment = FALSE;
+ }
+ }
+ if(choice!=UCNV_RESET_TO_UNICODE) {
+@@ -130,6 +132,10 @@
+ * from-GB code '~}' ($7E7D) is outside the defined GB range.)
+ *
+ * Source: RFC 1842
++*
++* Note that the formal syntax in RFC 1842 is invalid. I assume that the
++* intended definition of single-byte-segment is as follows (pedberg):
++* single-byte-segment = single-byte-seq 1*single-byte-char
+ */
+
+
+@@ -168,12 +174,23 @@
+ args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2);
+ }
+ *(myTarget++)=(UChar)mySourceChar;
++ myData->isEmptySegment = FALSE;
+ continue;
+ case UCNV_OPEN_BRACE:
+- myData->isStateDBCS = TRUE;
+- continue;
+ case UCNV_CLOSE_BRACE:
+- myData->isStateDBCS = FALSE;
++ myData->isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE);
++ if (myData->isEmptySegment) {
++ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
++ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
++ args->converter->toUCallbackReason = UCNV_IRREGULAR;
++ args->converter->toUBytes[0] = UCNV_TILDE;
++ args->converter->toUBytes[1] = mySourceChar;
++ args->converter->toULength = 2;
++ args->target = myTarget;
++ args->source = mySource;
++ return;
++ }
++ myData->isEmptySegment = TRUE;
+ continue;
+ default:
+ /* if the first byte is equal to TILDE and the trail byte
+@@ -181,6 +198,7 @@
+ */
+ mySourceChar = 0x7e00 | mySourceChar;
+ targetUniChar = 0xffff;
++ myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
+ break;
+ }
+ } else if(myData->isStateDBCS) {
+@@ -191,6 +209,7 @@
+ } else {
+ /* add another bit to distinguish a 0 byte from not having seen a lead byte */
+ args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100);
++ myData->isEmptySegment = FALSE; /* the segment has something, either valid or will produce a different error, so reset this */
+ }
+ continue;
+ }
+@@ -218,8 +237,10 @@
+ continue;
+ } else if(mySourceChar <= 0x7f) {
+ targetUniChar = (UChar)mySourceChar; /* ASCII */
++ myData->isEmptySegment = FALSE; /* the segment has something valid */
+ } else {
+ targetUniChar = 0xffff;
++ myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
+ }
+ }
+ if(targetUniChar < 0xfffe){
+diff -ru icu.6002/source/test/cintltst/nucnvtst.c icu/source/test/cintltst/nucnvtst.c
+--- icu.6002/source/test/cintltst/nucnvtst.c 2009-06-02 15:37:53.000000000 +0100
++++ icu/source/test/cintltst/nucnvtst.c 2009-06-02 15:40:52.000000000 +0100
+@@ -81,6 +81,7 @@
+ static void TestJitterbug2411(void);
+ #endif
+
++static void TestJitterbug6175(void);
+ static void TestRoundTrippingAllUTF(void);
+ static void TestConv(const uint16_t in[],
+ int len,
+@@ -294,6 +295,7 @@
+ #if !UCONFIG_NO_LEGACY_CONVERSION
+ addTest(root, &TestJitterbug2346, "tsconv/nucnvtst/TestJitterbug2346");
+ addTest(root, &TestJitterbug2411, "tsconv/nucnvtst/TestJitterbug2411");
++ addTest(root, &TestJitterbug6175, "tsconv/nucnvtst/TestJitterbug6175");
+ #endif
+
+ }
+@@ -4454,6 +4456,70 @@
+ free(offsets);
+ }
+
++/* Tests for empty segments in ISO-2022-JP/KR/CN, HZ, check that UConverterCallbackReason is UCNV_IRREGULAR */
++typedef struct {
++ const char * converterName;
++ const char * inputText;
++ int inputTextLength;
++} EmptySegmentTest;
++
++/* Callback for TestJitterbug6175, should only get called for empty segment errors */
++static void UCNV_TO_U_CALLBACK_EMPTYSEGMENT( const void *context, UConverterToUnicodeArgs *toArgs, const char* codeUnits,
++ int32_t length, UConverterCallbackReason reason, UErrorCode * err ) {
++ if (reason > UCNV_IRREGULAR) {
++ return;
++ }
++ if (reason != UCNV_IRREGULAR) {
++ log_err("toUnicode callback invoked for empty segment but reason is not UCNV_IRREGULAR\n");
++ }
++ /* Standard stuff below from UCNV_TO_U_CALLBACK_SUBSTITUTE */
++ *err = U_ZERO_ERROR;
++ ucnv_cbToUWriteSub(toArgs,0,err);
++}
++
++enum { kEmptySegmentToUCharsMax = 64 };
++static void TestJitterbug6175(void) {
++ static const char iso2022jp_a[] = { 0x61, 0x62, 0x1B,0x24,0x42, 0x1B,0x28,0x42, 0x63, 0x64, 0x0D, 0x0A };
++ static const char iso2022kr_a[] = { 0x1B,0x24,0x29,0x43, 0x61, 0x0E, 0x0F, 0x62, 0x0D, 0x0A };
++ static const char iso2022cn_a[] = { 0x61, 0x1B,0x24,0x29,0x41, 0x62, 0x0E, 0x0F, 0x1B,0x24,0x2A,0x48, 0x1B,0x4E, 0x6A,0x65, 0x63, 0x0D, 0x0A };
++ static const char iso2022cn_b[] = { 0x61, 0x1B,0x24,0x29,0x41, 0x62, 0x0E, 0x1B,0x24,0x29,0x47, 0x68,0x64, 0x0F, 0x63, 0x0D, 0x0A };
++ static const char hzGB2312_a[] = { 0x61, 0x62, 0x7E,0x7B, 0x7E,0x7D, 0x63, 0x64 };
++ static const EmptySegmentTest emptySegmentTests[] = {
++ /* converterName inputText inputTextLength */
++ { "ISO-2022-JP", iso2022jp_a, sizeof(iso2022jp_a) },
++ { "ISO-2022-KR", iso2022kr_a, sizeof(iso2022kr_a) },
++ { "ISO-2022-CN", iso2022cn_a, sizeof(iso2022cn_a) },
++ { "ISO-2022-CN", iso2022cn_b, sizeof(iso2022cn_b) },
++ { "HZ-GB-2312", hzGB2312_a, sizeof(hzGB2312_a) },
++ /* terminator: */
++ { NULL, NULL, 0, }
++ };
++ const EmptySegmentTest * testPtr;
++ for (testPtr = emptySegmentTests; testPtr->converterName != NULL; ++testPtr) {
++ UErrorCode err = U_ZERO_ERROR;
++ UConverter * cnv = ucnv_open(testPtr->converterName, &err);
++ if (U_FAILURE(err)) {
++ log_data_err("Unable to open %s converter: %s\n", testPtr->converterName, u_errorName(err));
++ return;
++ }
++ ucnv_setToUCallBack(cnv, UCNV_TO_U_CALLBACK_EMPTYSEGMENT, NULL, NULL, NULL, &err);
++ if (U_FAILURE(err)) {
++ log_data_err("Unable to setToUCallBack for %s converter: %s\n", testPtr->converterName, u_errorName(err));
++ ucnv_close(cnv);
++ return;
++ }
++ {
++ UChar toUChars[kEmptySegmentToUCharsMax];
++ UChar * toUCharsPtr = toUChars;
++ const UChar * toUCharsLimit = toUCharsPtr + kEmptySegmentToUCharsMax;
++ const char * inCharsPtr = testPtr->inputText;
++ const char * inCharsLimit = inCharsPtr + testPtr->inputTextLength;
++ ucnv_toUnicode(cnv, &toUCharsPtr, toUCharsLimit, &inCharsPtr, inCharsLimit, NULL, TRUE, &err);
++ }
++ ucnv_close(cnv);
++ }
++}
++
+ static void
+ TestEBCDIC_STATEFUL() {
+ /* test input */
+diff -ru icu.6002/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt
+--- icu.6002/source/test/testdata/conversion.txt 2009-06-02 15:37:54.000000000 +0100
++++ icu/source/test/testdata/conversion.txt 2009-06-02 15:40:52.000000000 +0100
+@@ -199,6 +199,21 @@
+ :intvector{ 0, 5, 7, 9, 9, 9, 9, 9, 9, 9, 9, 12 },
+ :int{1}, :int{1}, "", "&", :bin{""}
+ }
++ // empty segment (using substitution and stop)
++ {
++ "ISO-2022-KR",
++ :bin{ 1b242943610e0f620d0a },
++ "a\uFFFDb\u000D\u000A",
++ :intvector{ 4, 6, 7, 8, 9 },
++ :int{1}, :int{1}, "", "?", :bin{""}
++ }
++ {
++ "ISO-2022-KR",
++ :bin{ 1b242943610e0f620d0a },
++ "a",
++ :intvector{ 4 },
++ :int{1}, :int{1}, "illesc", ".", :bin{"0f"}
++ }
+
+ // ISO-2022-JP
+
+@@ -249,6 +264,21 @@
+ :bin{ 41c15c1b284a5cc242 }, "A\uff81\\\xa5\uff82B", :intvector{ 0, 1, 2, 6, 7, 8 },
+ :int{1}, :int{1}, "", ".", :bin{""}
+ }
++ // empty segment (using substitution and stop)
++ {
++ "ISO-2022-JP",
++ :bin{ 61621b24421b284263640d0a },
++ "ab\uFFFDcd\u000D\u000A",
++ :intvector{ 0, 1, 5, 8, 9, 10, 11 },
++ :int{1}, :int{1}, "", "?", :bin{""}
++ }
++ {
++ "ISO-2022-JP",
++ :bin{ 61621b24421b284263640d0a },
++ "ab",
++ :intvector{ 0, 1 },
++ :int{1}, :int{1}, "illesc", ".", :bin{"1b2842"}
++ }
+
+ // ISO-2022-CN
+
+@@ -319,6 +349,36 @@
+ :bin{ 411b242b491b4f2121 }, "\x41", :intvector{ 0 },
+ :int{1}, :int{1}, "unsuppesc", ".", :bin{ 1b242b49 }
+ }
++ // empty segment 1 (using substitution and stop)
++ {
++ "ISO-2022-CN",
++ :bin{ 611b242941620e0f1b242a481b4e6a65630d0a },
++ "ab\uFFFD\u994Cc\u000D\u000A",
++ :intvector{ 0, 5, 7, 14, 16, 17, 18 },
++ :int{1}, :int{1}, "", "?", :bin{""}
++ }
++ {
++ "ISO-2022-CN",
++ :bin{ 611b242941620e0f1b242a481b4e6a65630d0a },
++ "ab",
++ :intvector{ 0, 5 },
++ :int{1}, :int{1}, "illesc", ".", :bin{"0f"}
++ }
++ // empty segment 2 (using substitution and stop)
++ {
++ "ISO-2022-CN",
++ :bin{ 611b242941620e1b24294768640f630d0a },
++ "ab\uFFFD\u5F70c\u000D\u000A",
++ :intvector{ 0, 5, 7, 11, 14, 15, 16 },
++ :int{1}, :int{1}, "", "?", :bin{""}
++ }
++ {
++ "ISO-2022-CN",
++ :bin{ 611b242941620e1b24294768640f630d0a },
++ "ab",
++ :intvector{ 0, 5 },
++ :int{1}, :int{1}, "illesc", ".", :bin{"1b242947"}
++ }
+
+ // ISO-2022 SBCS
+ // [U_ENABLE_GENERIC_ISO_2022]
+@@ -333,6 +393,39 @@
+ // :int{1}, :int{1}, "", ".", :bin{""}
+ //}
+
++ // HZ-GB-2312
++
++ // empty segment 1 (using substitution and stop)
++ {
++ "HZ-GB-2312",
++ :bin{ 61627e7b7e7d6364 },
++ "ab\uFFFDcd",
++ :intvector{ 0, 1, 4, 6, 7 },
++ :int{1}, :int{1}, "", "?", :bin{""}
++ }
++ {
++ "HZ-GB-2312",
++ :bin{ 61627e7b7e7d63640d0a },
++ "ab",
++ :intvector{ 0, 1 },
++ :int{1}, :int{1}, "illesc", ".", :bin{"7e7d"}
++ }
++ // empty segment 2 & legal redundant switches (using substitution and stop)
++ {
++ "HZ-GB-2312",
++ :bin{ 61627e7b323b3f557e7b7e7b523b7e7d63647e7d65667e7d7e7d },
++ "ab\u4E0D\u7A7A\uFFFD\u4E00cdef\uFFFD",
++ :intvector{ 0, 1, 4, 6, 10, 12, 16, 17, 20, 21, 24 },
++ :int{1}, :int{1}, "", "?", :bin{""}
++ }
++ {
++ "HZ-GB-2312",
++ :bin{ 61627e7b323b3f557e7b7e7b523b7e7d63647e7d65667e7d7e7d },
++ "ab\u4E0D\u7A7A",
++ :intvector{ 0, 1, 4, 6 },
++ :int{1}, :int{1}, "illesc", ".", :bin{"7e7b"}
++ }
++
+ // DBCS-only extensions
+ {
+ "ibm-970",
diff --git a/icu.icuXXXX.malayalam.bysyllable.patch b/icu.icuXXXX.malayalam.bysyllable.patch
new file mode 100644
index 0000000..d0cd1b1
--- /dev/null
+++ b/icu.icuXXXX.malayalam.bysyllable.patch
@@ -0,0 +1,250 @@
+diff -ruN icu.orig/source/layout/IndicReordering.h icu/source/layout/IndicReordering.h
+--- icu.orig/source/layout/IndicReordering.h 2007-04-27 10:28:22.000000000 +0100
++++ icu/source/layout/IndicReordering.h 2007-04-27 10:39:22.000000000 +0100
+@@ -142,6 +142,7 @@
+ // do not instantiate
+ IndicReordering();
+
++public:
+ static le_int32 findSyllable(const IndicClassTable *classTable, const LEUnicode *chars, le_int32 prev, le_int32 charCount);
+
+ };
+diff -ruN icu.orig/source/layout/LayoutEngine.cpp icu/source/layout/LayoutEngine.cpp
+--- icu.orig/source/layout/LayoutEngine.cpp 2007-04-27 10:28:22.000000000 +0100
++++ icu/source/layout/LayoutEngine.cpp 2007-04-27 10:39:22.000000000 +0100
+@@ -14,6 +14,7 @@
+ #include "CanonShaping.h"
+ #include "HanLayoutEngine.h"
+ #include "HangulLayoutEngine.h"
++#include "MalayalamLayoutEngine.h"
+ #include "IndicLayoutEngine.h"
+ #include "KhmerLayoutEngine.h"
+ #include "ThaiLayoutEngine.h"
+@@ -451,11 +452,13 @@
+
+ if (gsubTable != NULL && gsubTable->coversScript(scriptTag = OpenTypeLayoutEngine::getScriptTag(scriptCode))) {
+ switch (scriptCode) {
++ case mlymScriptCode:
++ result = new MalayalamOpenTypeLayoutEngine(fontInstance, scriptCode, languageCode, typoFlags, gsubTable);
++ break;
+ case bengScriptCode:
+ case devaScriptCode:
+ case gujrScriptCode:
+ case kndaScriptCode:
+- case mlymScriptCode:
+ case oryaScriptCode:
+ case guruScriptCode:
+ case tamlScriptCode:
+@@ -512,11 +515,13 @@
+ result = new GXLayoutEngine(fontInstance, scriptCode, languageCode, morphTable);
+ } else {
+ switch (scriptCode) {
++ case mlymScriptCode:
++ result = new MalayalamOpenTypeLayoutEngine(fontInstance, scriptCode, languageCode, typoFlags);
++ break;
+ case bengScriptCode:
+ case devaScriptCode:
+ case gujrScriptCode:
+ case kndaScriptCode:
+- case mlymScriptCode:
+ case oryaScriptCode:
+ case guruScriptCode:
+ case tamlScriptCode:
+diff -ruN icu.orig/source/layout/LEGlyphStorage.h icu/source/layout/LEGlyphStorage.h
+--- icu.orig/source/layout/LEGlyphStorage.h 2007-04-27 10:28:22.000000000 +0100
++++ icu/source/layout/LEGlyphStorage.h 2007-04-27 10:43:54.000000000 +0100
+@@ -413,6 +413,8 @@
+ */
+ void adoptGlyphArray(LEGlyphStorage &from);
+
++ void appendGlyphStorage(LEGlyphStorage &from);
++
+ /**
+ * Delete the char indices array and replace it with the one
+ * in <code>from</code>. Set the char indices array pointer
+diff -ruN icu.orig/source/layout/Makefile.in icu/source/layout/Makefile.in
+--- icu.orig/source/layout/Makefile.in 2007-04-27 10:28:22.000000000 +0100
++++ icu/source/layout/Makefile.in 2007-04-27 10:39:22.000000000 +0100
+@@ -66,6 +66,7 @@
+ ArabicLayoutEngine.o \
+ GXLayoutEngine.o \
+ HanLayoutEngine.o \
++MalayalamLayoutEngine.o \
+ IndicLayoutEngine.o \
+ LayoutEngine.o \
+ ContextualGlyphSubstProc.o \
+diff -ruN icu.orig/source/layout/MalayalamLayoutEngine.cpp icu/source/layout/MalayalamLayoutEngine.cpp
+--- icu.orig/source/layout/MalayalamLayoutEngine.cpp 1970-01-01 01:00:00.000000000 +0100
++++ icu/source/layout/MalayalamLayoutEngine.cpp 2007-04-27 10:44:26.000000000 +0100
+@@ -0,0 +1,126 @@
++
++/*
++ *
++ * (C) Copyright IBM Corp. 1998-2005 - All Rights Reserved
++ *
++ */
++
++#include "LETypes.h"
++#include "LayoutEngine.h"
++#include "OpenTypeLayoutEngine.h"
++#include "MalayalamLayoutEngine.h"
++#include "ScriptAndLanguageTags.h"
++
++#include "GlyphSubstitutionTables.h"
++#include "GlyphDefinitionTables.h"
++#include "GlyphPositioningTables.h"
++
++#include "GDEFMarkFilter.h"
++#include "LEGlyphStorage.h"
++
++#include "IndicReordering.h"
++
++#include <stdio.h>
++
++U_NAMESPACE_BEGIN
++
++UOBJECT_DEFINE_RTTI_IMPLEMENTATION(MalayalamOpenTypeLayoutEngine)
++
++void LEGlyphStorage::appendGlyphStorage(LEGlyphStorage &from)
++{
++ if (fInsertionList) applyInsertions();
++ if (from.fInsertionList) from.applyInsertions();
++ if ((!fInsertionList) && (from.fInsertionList))
++ {
++ fInsertionList = from.fInsertionList;
++ from.fInsertionList = NULL;
++ }
++
++ if (!from.fGlyphCount)
++ return;
++
++ le_int32 newGlyphCount = fGlyphCount + from.fGlyphCount;
++
++ fGlyphs = (LEGlyphID*)LE_GROW_ARRAY(fGlyphs, newGlyphCount);
++ LE_ARRAY_COPY(fGlyphs+fGlyphCount, from.fGlyphs, from.fGlyphCount);
++
++ le_int32 nLargestIndex = 0;
++ if (fGlyphCount)
++ {
++ for (le_int32 i = 0; i < fGlyphCount; ++i)
++ {
++ if (fCharIndices[i] > nLargestIndex)
++ nLargestIndex = fCharIndices[i];
++ }
++ nLargestIndex+=1;
++ }
++ fCharIndices = (le_int32 *)LE_GROW_ARRAY(fCharIndices, newGlyphCount);
++ for (le_int32 i = 0; i < from.fGlyphCount; ++i)
++ fCharIndices[fGlyphCount+i] = from.fCharIndices[i] + nLargestIndex;
++
++ fAuxData = (le_uint32 *)LE_GROW_ARRAY(fAuxData, newGlyphCount);
++ LE_ARRAY_COPY(fAuxData+fGlyphCount, from.fAuxData, from.fGlyphCount);
++
++ fGlyphCount = newGlyphCount;
++}
++
++le_int32 MalayalamOpenTypeLayoutEngine::glyphPostProcessing(LEGlyphStorage &tempGlyphStorage, LEGlyphStorage &glyphStorage, LEErrorCode &success)
++{
++ if (LE_FAILURE(success)) {
++ return 0;
++ }
++
++ glyphStorage.appendGlyphStorage(tempGlyphStorage);
++
++ return glyphStorage.getGlyphCount();
++}
++
++
++le_int32 MalayalamOpenTypeLayoutEngine::computeGlyphs(const LEUnicode chars[], le_int32 offset, le_int32 count, le_int32 max, le_bool rightToLeft, LEGlyphStorage &glyphStorage, LEErrorCode &success)
++{
++ if (LE_FAILURE(success)) {
++ return 0;
++ }
++
++ if (chars == NULL || offset < 0 || count < 0 || max < 0 || offset >= max || offset + count > max) {
++ success = LE_ILLEGAL_ARGUMENT_ERROR;
++ return 0;
++ }
++
++ le_int32 outGlyphCount=0;
++
++ const IndicClassTable *classTable = IndicClassTable::getScriptClassTable(fScriptCode);
++ le_int32 prev = 0;
++ while (prev < count)
++ {
++ le_int32 outCharCount=0, fakeGlyphCount=0;
++ LEUnicode *outChars = NULL;
++ LEGlyphStorage fakeGlyphStorage;
++
++ le_int32 syllable = IndicReordering::findSyllable(classTable, chars+offset, prev, count);
++ outCharCount = characterProcessing(chars+prev, offset, syllable-prev, max, rightToLeft, outChars, fakeGlyphStorage, success);
++
++ if (LE_FAILURE(success)) {
++ return 0;
++ }
++
++ if (outChars != NULL) {
++ fakeGlyphCount = glyphProcessing(outChars, 0, outCharCount, outCharCount, rightToLeft, fakeGlyphStorage, success);
++ LE_DELETE_ARRAY(outChars); // FIXME: a subclass may have allocated this, in which case this delete might not work...
++ } else {
++ fakeGlyphCount = glyphProcessing(chars+prev, offset, syllable-prev, max, rightToLeft, fakeGlyphStorage, success);
++ }
++
++ if (LE_FAILURE(success)) {
++ return 0;
++ }
++
++ outGlyphCount = glyphPostProcessing(fakeGlyphStorage, glyphStorage, success);
++
++ prev = syllable;
++ }
++
++ return outGlyphCount;
++}
++
++U_NAMESPACE_END
+diff -ruN icu.orig/source/layout/MalayalamLayoutEngine.h icu/source/layout/MalayalamLayoutEngine.h
+--- icu.orig/source/layout/MalayalamLayoutEngine.h 1970-01-01 01:00:00.000000000 +0100
++++ icu/source/layout/MalayalamLayoutEngine.h 2007-04-27 10:39:52.000000000 +0100
+@@ -0,0 +1,41 @@
++
++/*
++ *
++ * (C) Copyright IBM Corp. 1998-2005 - All Rights Reserved
++ *
++ */
++
++#ifndef __MALAYALAMLAYOUTENGINE_H
++#define __MALAYALAMLAYOUTENGINE_H
++
++#include "IndicLayoutEngine.h"
++
++U_NAMESPACE_BEGIN
++
++class MalayalamOpenTypeLayoutEngine : public IndicOpenTypeLayoutEngine
++{
++public:
++ MalayalamOpenTypeLayoutEngine(const LEFontInstance *fontInstance, le_int32 scriptCode, le_int32 languageCode,
++ le_int32 typoFlags, const GlyphSubstitutionTableHeader *gsubTable) :
++ IndicOpenTypeLayoutEngine(fontInstance, scriptCode, languageCode, typoFlags, gsubTable)
++
++ {}
++
++ MalayalamOpenTypeLayoutEngine(const LEFontInstance *fontInstance, le_int32 scriptCode, le_int32 languageCode,
++ le_int32 typoFlags) :
++ IndicOpenTypeLayoutEngine(fontInstance, scriptCode, languageCode, typoFlags)
++
++ {}
++
++ virtual UClassID getDynamicClassID() const;
++ static UClassID getStaticClassID();
++
++protected:
++ virtual le_int32 glyphPostProcessing(LEGlyphStorage &tempGlyphStorage, LEGlyphStorage &glyphStorage, LEErrorCode &success);
++
++ virtual le_int32 computeGlyphs(const LEUnicode chars[], le_int32 offset, le_int32 count, le_int32 max, le_bool rightToLeft, LEGlyphStorage &glyphStorage, LEErrorCode &success);
++};
++
++U_NAMESPACE_END
++#endif
++
diff --git a/icu.icuXXXX.rollbackabi.patch b/icu.icuXXXX.rollbackabi.patch
new file mode 100644
index 0000000..038d4b6
--- /dev/null
+++ b/icu.icuXXXX.rollbackabi.patch
@@ -0,0 +1,131 @@
+diff -ru icu.5691/source/common/ucnv2022.c icu/source/common/ucnv2022.c
+--- icu.5691/source/common/ucnv2022.c 2009-06-02 16:07:36.000000000 +0100
++++ icu/source/common/ucnv2022.c 2009-06-02 16:21:56.000000000 +0100
+@@ -3566,7 +3566,7 @@
+ /* include ASCII for JP */
+ sa->addRange(sa->set, 0, 0x7f);
+ }
+- if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
++ if(cnvData->version==3 || cnvData->version==4) {
+ /*
+ * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
+ * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
+diff -ru icu.5691/source/common/ucnv_ext.c icu/source/common/ucnv_ext.c
+--- icu.5691/source/common/ucnv_ext.c 2009-06-02 16:07:36.000000000 +0100
++++ icu/source/common/ucnv_ext.c 2009-06-02 16:23:12.000000000 +0100
+@@ -1031,7 +1031,7 @@
+
+ stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH];
+
+- useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
++ useFallback=(UBool)(FALSE);
+
+ /* enumerate the from-Unicode trie table */
+ c=0; /* keep track of the current code point while enumerating */
+diff -ru icu.5691/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c
+--- icu.5691/source/common/ucnvmbcs.c 2009-06-02 16:07:36.000000000 +0100
++++ icu/source/common/ucnvmbcs.c 2009-06-02 16:23:50.000000000 +0100
+@@ -340,7 +340,7 @@
+
+ /* Miscellaneous ------------------------------------------------------------ */
+
+-#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
++/* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
+
+ /* similar to ucnv_MBCSGetNextUChar() but recursive */
+ static void
+@@ -434,8 +434,6 @@
+ pErrorCode);
+ }
+
+-#endif
+-
+ U_CFUNC void
+ ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
+ const USetAdder *sa,
+@@ -511,7 +509,7 @@
+
+ bytes=mbcsTable->fromUnicodeBytes;
+
+- useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
++ useFallback=(UBool)(FALSE);
+
+ switch(mbcsTable->outputType) {
+ case MBCS_OUTPUT_3:
+diff -ru icu.5691/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h
+--- icu.5691/source/common/ucnvmbcs.h 2009-06-02 16:07:36.000000000 +0100
++++ icu/source/common/ucnvmbcs.h 2009-06-02 16:23:50.000000000 +0100
+@@ -363,7 +363,8 @@
+ ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
+ UErrorCode *pErrorCode);
+
+-#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
++/* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
++
+ /*
+ * Internal function returning a UnicodeSet for toUnicode() conversion.
+ * Currently only used for ISO-2022-CN, and only handles roundtrip mappings.
+@@ -378,7 +379,6 @@
+ UConverterUnicodeSet which,
+ uint8_t state, int32_t lowByte, int32_t highByte,
+ UErrorCode *pErrorCode);
+-#endif
+
+ /*
+ * Internal function returning a UnicodeSet for toUnicode() conversion.
+diff -ru icu.5691/source/common/unicode/ucnv.h icu/source/common/unicode/ucnv.h
+--- icu.5691/source/common/unicode/ucnv.h 2009-06-02 16:07:32.000000000 +0100
++++ icu/source/common/unicode/ucnv.h 2009-06-02 16:20:18.000000000 +0100
+@@ -870,8 +870,6 @@
+ typedef enum UConverterUnicodeSet {
+ /** Select the set of roundtrippable Unicode code points. @stable ICU 2.6 */
+ UCNV_ROUNDTRIP_SET,
+- /** Select the set of Unicode code points with roundtrip or fallback mappings. @draft ICU 4.0 */
+- UCNV_ROUNDTRIP_AND_FALLBACK_SET,
+ /** Number of UConverterUnicodeSet selectors. @stable ICU 2.6 */
+ UCNV_SET_COUNT
+ } UConverterUnicodeSet;
+@@ -880,16 +878,11 @@
+ /**
+ * Returns the set of Unicode code points that can be converted by an ICU converter.
+ *
+- * Returns one of several kinds of set:
+- *
+- * 1. UCNV_ROUNDTRIP_SET
+- *
++ * The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET):
+ * The set of all Unicode code points that can be roundtrip-converted
+- * (converted without any data loss) with the converter (ucnv_fromUnicode()).
++ * (converted without any data loss) with the converter.
+ * This set will not include code points that have fallback mappings
+ * or are only the result of reverse fallback mappings.
+- * This set will also not include PUA code points with fallbacks, although
+- * ucnv_fromUnicode() will always uses those mappings despite ucnv_setFallback().
+ * See UTR #22 "Character Mapping Markup Language"
+ * at http://www.unicode.org/reports/tr22/
+ *
+@@ -900,12 +893,6 @@
+ * by comparing its roundtrip set with the set of ExemplarCharacters from
+ * ICU's locale data or other sources
+ *
+- * 2. UCNV_ROUNDTRIP_AND_FALLBACK_SET
+- *
+- * The set of all Unicode code points that can be converted with the converter (ucnv_fromUnicode())
+- * when fallbacks are turned on (see ucnv_setFallback()).
+- * This set includes all code points with roundtrips and fallbacks (but not reverse fallbacks).
+- *
+ * In the future, there may be more UConverterUnicodeSet choices to select
+ * sets with different properties.
+ *
+diff -ru icu.5691/source/test/intltest/convtest.cpp icu/source/test/intltest/convtest.cpp
+--- icu.5691/source/test/intltest/convtest.cpp 2009-06-02 16:07:21.000000000 +0100
++++ icu/source/test/intltest/convtest.cpp 2009-06-02 16:24:08.000000000 +0100
+@@ -552,7 +552,7 @@
+ }
+ UConverterUnicodeSet which;
+ for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUnicodeSet)((int)which+1)) {
+- if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
++ if(FALSE) {
+ ucnv_setFallback(cnv, TRUE);
+ }
+ expected.add(0, cpLimit-1);
diff --git a/icu.icuXXXX.virama.prevnext.patch b/icu.icuXXXX.virama.prevnext.patch
new file mode 100644
index 0000000..49393c2
--- /dev/null
+++ b/icu.icuXXXX.virama.prevnext.patch
@@ -0,0 +1,98 @@
+diff -ur icu.orig/source/common/rbbi.cpp icu/source/common/rbbi.cpp
+--- icu.orig/source/common/rbbi.cpp 2006-10-05 11:54:13.000000000 +0100
++++ icu/source/common/rbbi.cpp 2006-10-05 11:57:31.000000000 +0100
+@@ -879,6 +879,22 @@
+ RBBI_END // state machine processing is after end of user text.
+ };
+
++#define VIRAMA_SCRIPT(wc) ((wc) >= 0x0901 && (wc) <= 0x17FF)
++#define VIRAMA(wc) ((wc) == 0x094D || \
++ (wc) == 0x09CD || \
++ (wc) == 0x0A4D || \
++ (wc) == 0x0ACD || \
++ (wc) == 0x0B4D || \
++ (wc) == 0x0BCD || \
++ (wc) == 0x0C4D || \
++ (wc) == 0x0CCD || \
++ (wc) == 0x0D4D || \
++ (wc) == 0x0DCA || \
++ (wc) == 0x0E3A || \
++ (wc) == 0x0F84 || \
++ (wc) == 0x1039 || \
++ (wc) == 0x17D2 || \
++ (wc) == 0x200D)
+
+ //-----------------------------------------------------------------------------------
+ //
+@@ -896,6 +911,7 @@
+ RBBIRunMode mode;
+
+ RBBIStateTableRow *row;
++ UChar32 prevchar;
+ UChar32 c;
+ int32_t lookaheadStatus = 0;
+ int32_t lookaheadTagIdx = 0;
+@@ -919,6 +935,7 @@
+ // if we're already at the end of the text, return DONE.
+ initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
+ result = initialPosition;
++ prevchar = 0;
+ c = UTEXT_NEXT32(fText);
+ if (fData == NULL || c==U_SENTINEL) {
+ return BreakIterator::DONE;
+@@ -1001,6 +1018,11 @@
+
+ // State Transition - move machine to its next state
+ //
++ if (VIRAMA_SCRIPT(c) && VIRAMA(prevchar))
++ {
++ state = START_STATE;
++ row = (RBBIStateTableRow *) (tableData + tableRowLen * state);
++ }
+ state = row->fNextState[category];
+ row = (RBBIStateTableRow *)
+ // (statetable->fTableData + (statetable->fRowLen * state));
+@@ -1059,6 +1081,7 @@
+ // the input position. The next iteration will be processing the
+ // first real input character.
+ if (mode == RBBI_RUN) {
++ prevchar = c;
+ c = UTEXT_NEXT32(fText);
+ } else {
+ if (mode == RBBI_START) {
+@@ -1107,6 +1130,7 @@
+ int16_t category = 0;
+ RBBIRunMode mode;
+ RBBIStateTableRow *row;
++ UChar32 prevchar;
+ UChar32 c;
+ int32_t lookaheadStatus = 0;
+ int32_t result = 0;
+@@ -1135,6 +1159,7 @@
+ // Set up the starting char.
+ initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
+ result = initialPosition;
++ prevchar = 0;
+ c = UTEXT_PREVIOUS32(fText);
+
+ // Set the initial state for the state machine
+@@ -1218,6 +1243,11 @@
+
+ // State Transition - move machine to its next state
+ //
++ if (VIRAMA_SCRIPT(prevchar) && VIRAMA(c))
++ {
++ state = START_STATE;
++ row = (RBBIStateTableRow *) (statetable->fTableData + (statetable->fRowLen * state));
++ }
+ state = row->fNextState[category];
+ row = (RBBIStateTableRow *)
+ (statetable->fTableData + (statetable->fRowLen * state));
+@@ -1269,6 +1299,7 @@
+ // the input position. The next iteration will be processing the
+ // first real input character.
+ if (mode == RBBI_RUN) {
++ prevchar = c;
+ c = UTEXT_PREVIOUS32(fText);
+ } else {
+ if (mode == RBBI_START) {
diff --git a/icu.rh429023.regexp.patch b/icu.rh429023.regexp.patch
new file mode 100644
index 0000000..ef8eded
--- /dev/null
+++ b/icu.rh429023.regexp.patch
@@ -0,0 +1,307 @@
+diff -ru icu.orig/source/common/uvectr32.cpp icu/source/common/uvectr32.cpp
+--- icu.orig/source/common/uvectr32.cpp 2003-08-27 02:01:30.000000000 +0100
++++ icu/source/common/uvectr32.cpp 2008-01-22 08:37:06.000000000 +0000
+@@ -1,6 +1,6 @@
+ /*
+ ******************************************************************************
+-* Copyright (C) 1999-2003, International Business Machines Corporation and *
++* Copyright (C) 1999-2008, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ ******************************************************************************
+ * Date Name Description
+@@ -26,6 +26,7 @@
+ UVector32::UVector32(UErrorCode &status) :
+ count(0),
+ capacity(0),
++ maxCapacity(0),
+ elements(NULL)
+ {
+ _init(DEFUALT_CAPACITY, status);
+@@ -34,6 +35,7 @@
+ UVector32::UVector32(int32_t initialCapacity, UErrorCode &status) :
+ count(0),
+ capacity(0),
++ maxCapacity(0),
+ elements(0)
+ {
+ _init(initialCapacity, status);
+@@ -46,6 +48,9 @@
+ if (initialCapacity < 1) {
+ initialCapacity = DEFUALT_CAPACITY;
+ }
++ if (maxCapacity>0 && maxCapacity<initialCapacity) {
++ initialCapacity = maxCapacity;
++ }
+ elements = (int32_t *)uprv_malloc(sizeof(int32_t)*initialCapacity);
+ if (elements == 0) {
+ status = U_MEMORY_ALLOCATION_ERROR;
+@@ -189,21 +194,35 @@
+ UBool UVector32::expandCapacity(int32_t minimumCapacity, UErrorCode &status) {
+ if (capacity >= minimumCapacity) {
+ return TRUE;
+- } else {
+- int32_t newCap = capacity * 2;
+- if (newCap < minimumCapacity) {
+- newCap = minimumCapacity;
+- }
+- int32_t* newElems = (int32_t *)uprv_malloc(sizeof(int32_t)*newCap);
+- if (newElems == 0) {
+- status = U_MEMORY_ALLOCATION_ERROR;
+- return FALSE;
+- }
+- uprv_memcpy(newElems, elements, sizeof(elements[0]) * count);
+- uprv_free(elements);
+- elements = newElems;
+- capacity = newCap;
+- return TRUE;
++ }
++ if (maxCapacity>0 && minimumCapacity>maxCapacity) {
++ status = U_BUFFER_OVERFLOW_ERROR;
++ return FALSE;
++ }
++ int32_t newCap = capacity * 2;
++ if (newCap < minimumCapacity) {
++ newCap = minimumCapacity;
++ }
++ if (maxCapacity > 0 && newCap > maxCapacity) {
++ newCap = maxCapacity;
++ }
++ int32_t* newElems = (int32_t *)uprv_malloc(sizeof(int32_t)*newCap);
++ if (newElems == 0) {
++ status = U_MEMORY_ALLOCATION_ERROR;
++ return FALSE;
++ }
++ uprv_memcpy(newElems, elements, sizeof(elements[0]) * count);
++ uprv_free(elements);
++ elements = newElems;
++ capacity = newCap;
++ return TRUE;
++}
++
++void UVector32::setMaxCapacity(int32_t limit) {
++ U_ASSERT(limit >= 0);
++ maxCapacity = limit;
++ if (maxCapacity < 0) {
++ maxCapacity = 0;
+ }
+ }
+
+diff -ru icu.orig/source/common/uvectr32.h icu/source/common/uvectr32.h
+--- icu.orig/source/common/uvectr32.h 2006-01-18 03:52:04.000000000 +0000
++++ icu/source/common/uvectr32.h 2008-01-22 08:37:07.000000000 +0000
+@@ -1,6 +1,6 @@
+ /*
+ **********************************************************************
+-* Copyright (C) 1999-2006, International Business Machines
++* Copyright (C) 1999-2008, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ **********************************************************************
+ */
+@@ -61,6 +61,8 @@
+ int32_t count;
+
+ int32_t capacity;
++
++ int32_t maxCapacity; // Limit beyond which capacity is not permitted to grow.
+
+ int32_t* elements;
+
+@@ -162,6 +164,14 @@
+ int32_t *getBuffer() const;
+
+ /**
++ * Set the maximum allowed buffer capacity for this vector/stack.
++ * Default with no limit set is unlimited, go until malloc() fails.
++ * A limit of zero means unlimited capacity.
++ * Units are vector elements (32 bits each), not bytes.
++ */
++ void setMaxCapacity(int32_t limit);
++
++ /**
+ * ICU "poor man's RTTI", returns a UClassID for this class.
+ */
+ static UClassID U_EXPORT2 getStaticClassID();
+@@ -221,7 +231,9 @@
+ }
+
+ inline int32_t *UVector32::reserveBlock(int32_t size, UErrorCode &status) {
+- ensureCapacity(count+size, status);
++ if (ensureCapacity(count+size, status) == FALSE) {
++ return NULL;
++ }
+ int32_t *rp = elements+count;
+ count += size;
+ return rp;
+diff -ru icu.orig/source/i18n/regexcmp.cpp icu/source/i18n/regexcmp.cpp
+--- icu.orig/source/i18n/regexcmp.cpp 2006-02-02 04:37:14.000000000 +0000
++++ icu/source/i18n/regexcmp.cpp 2008-01-22 08:37:06.000000000 +0000
+@@ -1187,14 +1187,17 @@
+ // Because capture groups can be forward-referenced by back-references,
+ // we fill the operand with the capture group number. At the end
+ // of compilation, it will be changed to the variable's location.
+- U_ASSERT(groupNum > 0);
+- int32_t op;
+- if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
+- op = URX_BUILD(URX_BACKREF_I, groupNum);
++ if (groupNum < 1) {
++ error(U_REGEX_INVALID_BACK_REF);
+ } else {
+- op = URX_BUILD(URX_BACKREF, groupNum);
++ int32_t op;
++ if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
++ op = URX_BUILD(URX_BACKREF_I, groupNum);
++ } else {
++ op = URX_BUILD(URX_BACKREF, groupNum);
++ }
++ fRXPat->fCompiledPat->addElement(op, *fStatus);
+ }
+- fRXPat->fCompiledPat->addElement(op, *fStatus);
+ }
+ break;
+
+diff -ru icu.orig/source/i18n/rematch.cpp icu/source/i18n/rematch.cpp
+--- icu.orig/source/i18n/rematch.cpp 2005-08-25 19:02:20.000000000 +0100
++++ icu/source/i18n/rematch.cpp 2008-01-22 08:37:44.000000000 +0000
+@@ -30,6 +30,15 @@
+
+ U_NAMESPACE_BEGIN
+
++// Limit the size of the back track stack, to avoid system failures caused
++// by heap exhaustion. Units are in 32 bit words, not bytes.
++// This value puts ICU's limits higher than most other regexp implementations,
++// which use recursion rather than the heap, and take more storage per
++// backtrack point.
++// This constant is _temporary_. Proper API to control the value will be added.
++//
++static const int32_t BACKTRACK_STACK_CAPACITY = 8000000;
++
+ //-----------------------------------------------------------------------------
+ //
+ // Constructor and Destructor
+@@ -53,6 +62,8 @@
+ }
+ if (fStack == NULL || fData == NULL) {
+ fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
++ } else {
++ fStack->setMaxCapacity(BACKTRACK_STACK_CAPACITY);
+ }
+
+ reset(*RegexStaticSets::gStaticSets->fEmptyString);
+@@ -78,6 +89,8 @@
+ }
+ if (fStack == NULL || fData == NULL) {
+ status = U_MEMORY_ALLOCATION_ERROR;
++ } else {
++ fStack->setMaxCapacity(BACKTRACK_STACK_CAPACITY);
+ }
+ reset(input);
+ }
+@@ -102,6 +115,8 @@
+ }
+ if (fStack == NULL || fData == NULL) {
+ status = U_MEMORY_ALLOCATION_ERROR;
++ } else {
++ fStack->setMaxCapacity(BACKTRACK_STACK_CAPACITY);
+ }
+ reset(*RegexStaticSets::gStaticSets->fEmptyString);
+ }
+@@ -1015,6 +1030,14 @@
+ inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatIdx, int32_t frameSize, UErrorCode &status) {
+ // push storage for a new frame.
+ int32_t *newFP = fStack->reserveBlock(frameSize, status);
++ if (newFP == NULL) {
++ // Heap allocation error on attempted stack expansion.
++ // We need to return a writable stack frame, so just return the
++ // previous frame. The match operation will stop quickly
++ // because of the error status, after which the frame will never
++ // be looked at again.
++ return fp;
++ }
+ fp = (REStackFrame *)(newFP - frameSize); // in case of realloc of stack.
+
+ // New stack frame = copy of old top frame.
+@@ -1030,8 +1053,8 @@
+ fp->fPatIdx = savePatIdx;
+ return (REStackFrame *)newFP;
+ }
+-
+-
++
++
+ //--------------------------------------------------------------------------------
+ //
+ // MatchAt This is the actual matching engine.
+@@ -2262,6 +2285,7 @@
+ }
+
+ if (U_FAILURE(status)) {
++ isMatch = FALSE;
+ break;
+ }
+ }
+diff -ru icu.orig/source/test/intltest/regextst.cpp icu/source/test/intltest/regextst.cpp
+--- icu.orig/source/test/intltest/regextst.cpp 2005-07-05 19:39:00.000000000 +0100
++++ icu/source/test/intltest/regextst.cpp 2008-01-22 08:38:21.000000000 +0000
+@@ -66,6 +66,10 @@
+ case 6: name = "PerlTests";
+ if (exec) PerlTests();
+ break;
++ case 7: name = "Bug 6149";
++ if (exec) Bug6149();
++ break;
++
+
+
+ default: name = "";
+@@ -1637,6 +1641,13 @@
+ // UnicodeSet containing a string
+ REGEX_ERR("abc[{def}]xyz", 1, 10, U_REGEX_SET_CONTAINS_STRING);
+
++
++ // Invalid Back Reference \0
++ // For ICU 3.8 and earlier
++ // For ICU versions newer than 3.8, \0 introduces an octal escape.
++ //
++ REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_INVALID_BACK_REF);
++
+ }
+
+
+@@ -2119,6 +2130,26 @@
+ }
+
+
++//--------------------------------------------------------------
++//
++// Bug6149 Verify limits to heap expansion for backtrack stack.
++// Use this pattern,
++// "(a?){1,}"
++// The zero-length match will repeat forever.
++// (That this goes into a loop is another bug)
++//
++//---------------------------------------------------------------
++void RegexTest::Bug6149() {
++ UnicodeString pattern("(a?){1,}");
++ UnicodeString s("xyz");
++ uint32_t flags = 0;
++ UErrorCode status = U_ZERO_ERROR;
++
++ RegexMatcher matcher(pattern, s, flags, status);
++ UBool result = false;
++ REGEX_ASSERT_FAIL(result=matcher.matches(status), U_BUFFER_OVERFLOW_ERROR);
++ REGEX_ASSERT(result == FALSE);
++ }
+
+ #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
+
+diff -ru icu.orig/source/test/intltest/regextst.h icu/source/test/intltest/regextst.h
+--- icu.orig/source/test/intltest/regextst.h 2003-12-03 06:58:28.000000000 +0000
++++ icu/source/test/intltest/regextst.h 2008-01-22 08:37:06.000000000 +0000
+@@ -30,6 +30,7 @@
+ virtual void Extended();
+ virtual void Errors();
+ virtual void PerlTests();
++ virtual void Bug6149();
+
+ // The following functions are internal to the regexp tests.
+ virtual UBool doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int line);