diff options
26 files changed, 5812 insertions, 0 deletions
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..1e65467 --- /dev/null +++ b/Makefile @@ -0,0 +1,4 @@ +SRCDIR := $(shell pwd) +NAME := $(shell basename $(SRCDIR)) +include ../common/Makefile + diff --git a/canonicalize.patch b/canonicalize.patch new file mode 100644 index 0000000..3ff9c33 --- /dev/null +++ b/canonicalize.patch @@ -0,0 +1,11 @@ +--- source/common/uloc.c	2011-12-12 04:50:00.601092000 -0500 ++++ source/common/uloc.c	2011-12-12 04:56:18.503570000 -0500 +@@ -1712,7 +1712,7 @@ +         /* Check for EURO variants. */ +         sawEuro = _deleteVariant(variant, variantSize, "EURO", 4); +         len -= sawEuro; +-        if (sawEuro > 0 && name[len-1] == '_') { /* delete trailing '_' */ ++        if (sawEuro > 0 && len > 0 && name[len-1] == '_') { /* delete trailing '_' */ +             --len; +         } +  diff --git a/compat-icu36.spec b/compat-icu36.spec new file mode 100644 index 0000000..d8a820e --- /dev/null +++ b/compat-icu36.spec @@ -0,0 +1,189 @@ +Name:           compat-icu36 +Version:        3.6 +Release:        5.16.1 +Summary:        International Components for Unicode + +Group:          System Environment/Libraries +License:        X License +URL:            http://www.ibm.com/software/globalization/icu/ +Source0:        ftp://ftp.software.ibm.com/software/globalization/icu/icu4c-3_6-src.tgz +BuildRoot:      %{_tmppath}/%{name}-%{version}-root + +BuildRequires:  doxygen, autoconf +Patch1:  icu-3.4-multiarchdevel.patch +Patch2:  icu-config +Patch3:  icu.icu5365.dependantvowels.patch +Patch4:  icu.icu5418.malayam.patch +Patch5:  icu.icu5431.malayam.patch +Patch6:  icu.icu5433.oriya.patch +Patch7:  icu.icuXXXX.virama.prevnext.patch +Patch8:  icu.icu5465.telegu.patch +Patch9:  icu.icu5488.assamese.patch +Patch10: icu.icu5500.devicetablecrash.patch +Patch11: icu.icu5501.sinhala.biggerexpand.patch +Patch12: icu.icu5557.safety.patch +Patch13: icu.icu5594.gujarati.patch +Patch14: icu.icu5506.multiplevowels.patch +Patch15: 
icu.icuXXXX.malayalam.bysyllable.patch +Patch16: icu.rh429023.regexp.patch +Patch17: icu.icu5483.backport.patch +Patch18: icu.icu5797.backport.patch +Patch19: icu.icu6001.backport.patch +Patch20: icu.icu6002.backport.patch +Patch21: icu.icu6175.emptysegments.patch +Patch22: icu.icu5691.backport.patch +Patch23: icu.icuXXXX.rollbackabi.patch +Patch24: canonicalize.patch +Conflicts: icu + +%description +The International Components for Unicode (ICU) libraries provide +robust and full-featured Unicode services on a wide variety of +platforms. ICU supports the most current version of the Unicode +standard, and provides support for supplementary Unicode +characters (needed for GB 18030 repertoire support). +As computing environments become more heterogeneous, software +portability becomes more important. ICU lets you produce the same +results across all the various platforms you support, without +sacrificing performance. It offers great flexibility to extend and +customize the supplied services. + + +%package     -n compat-libicu36 +Summary:        International Components for Unicode - libraries +Group:          System Environment/Libraries + +%description -n compat-libicu36 +%{summary}. + +This package provides the ICU libraries for packages built +against version %{version}. + +%package     -n compat-libicu36-devel +Summary:        Development files for International Components for Unicode +Group:          Development/Libraries +Requires:       compat-libicu36 = %{version}-%{release} +Requires:       pkgconfig +Conflicts:      libicu-devel + +%description -n compat-libicu36-devel +%{summary}. + +%package     -n compat-libicu36-doc +Summary:        Documentation for International Components for Unicode +Group:          Documentation + +%description -n compat-libicu36-doc +%{summary}. 
+ + +%prep +%setup -q -n icu +%patch1  -p1 -b .multiarchdevel +%patch3  -p1 -b .dependantvowels +%patch4  -p1 -b .icu5418.malayam.patch +%patch5  -p1 -b .icu5431.malayam.patch +%patch6  -p1 -b .icu5433.oriya.patch +%patch7  -p1 -b .icuXXXX.virama.prevnext.patch +%patch8  -p1 -b .icu5465.telegu.patch +%patch9  -p1 -b .icu5488.assamese.patch +%patch10 -p1 -b .icu5500.devicetablecrash.patch +%patch11 -p1 -b .icu5501.sinhala.biggerexpand.patch +%patch12 -p1 -b .icu5557.safety.patch +%patch13 -p1 -b .icu5594.gujarati.patch +%patch14 -p1 -b .icu5506.multiplevowels.patch +%patch15 -p1 -b .icuXXXX.malayalam.bysyllable.patch +%patch16 -p1 -b .rh429023.regexp.patch +%patch17 -p1 -b .icu5483.backport.patch +%patch18 -p1 -b .icu5797.backport.patch +%patch19 -p1 -b .icu6001.backport.patch +%patch20 -p1 -b .icu6002.backport.patch +%patch21 -p1 -b .icu6175.emptysegments.patch +%patch22 -p1 -b .icu5691.backport.patch +%patch23 -p1 -b .icuXXXX.rollbackabi.patch +%patch24 -p0 -b .canonicalize.patch + +%build +cd source +export CFLAGS="$RPM_OPT_FLAGS -fno-strict-aliasing" +export CXXFLAGS="$RPM_OPT_FLAGS -fno-strict-aliasing" +autoconf +%configure --with-data-packaging=library --disable-samples +#rhbz#654590 +sed -i -- "s/-nodefaultlibs -nostdlib//" config/mh-linux +make # %{?_smp_mflags} # -j(X>1) may "break" man pages as of 3.2, b.f.u #2357 +make doc + +%install +rm -rf $RPM_BUILD_ROOT source/__docs +make -C source install DESTDIR=$RPM_BUILD_ROOT +make -C source install-doc docdir=__docs +chmod +x $RPM_BUILD_ROOT%{_libdir}/*.so.* +cp %{PATCH2} $RPM_BUILD_ROOT%{_bindir}/icu-config +chmod a+x $RPM_BUILD_ROOT%{_bindir}/icu-config +sed -i s/\\\$\(THREADSCXXFLAGS\)// $RPM_BUILD_ROOT/%{_libdir}/pkgconfig/icu.pc +sed -i s/\\\$\(THREADSCPPFLAGS\)/-D_REENTRANT/ $RPM_BUILD_ROOT/%{_libdir}/pkgconfig/icu.pc + +%check +make -C source check + + +%clean +rm -rf $RPM_BUILD_ROOT + + +%post -n compat-libicu36 -p /sbin/ldconfig + +%postun -n compat-libicu36 -p /sbin/ldconfig + + +%files 
+%defattr(-,root,root,-) +%doc license.html readme.html +%{_bindir}/derb +%{_bindir}/genbrk +%{_bindir}/gencnval +%{_bindir}/genctd +%{_bindir}/genrb +%{_bindir}/makeconv +%{_bindir}/pkgdata +%{_bindir}/uconv +%{_sbindir}/* +%{_mandir}/man1/derb.1* +%{_mandir}/man1/gencnval.1* +%{_mandir}/man1/genrb.1* +%{_mandir}/man1/genbrk.1* +%{_mandir}/man1/genctd.1* +%{_mandir}/man1/makeconv.1* +%{_mandir}/man1/pkgdata.1* +%{_mandir}/man1/uconv.1* +%{_mandir}/man8/*.8* + +%files -n compat-libicu36 +%defattr(-,root,root,-) +%{_libdir}/*.so.* + +%files -n compat-libicu36-devel +%defattr(-,root,root,-) +%{_bindir}/icu-config +%{_mandir}/man1/icu-config.1* +%{_includedir}/layout +%{_includedir}/unicode +%{_libdir}/*.so +%{_libdir}/icu +%{_libdir}/pkgconfig/icu.pc +%dir %{_datadir}/icu +%dir %{_datadir}/icu/3.6 +%{_datadir}/icu/3.6/mkinstalldirs +%{_datadir}/icu/3.6/config +%doc %{_datadir}/icu/3.6/license.html + +%files -n compat-libicu36-doc +%defattr(-,root,root,-) +%doc source/__docs/icu/html/* + + +%changelog +* Wed Mar 20 2013 Remi Collet <RPMS@famillecollet.com> - 3.6-5.16.1 +- new package from RHEL-5 spec of icu. 
+ diff --git a/icu-3.4-multiarchdevel.patch b/icu-3.4-multiarchdevel.patch new file mode 100644 index 0000000..a7839aa --- /dev/null +++ b/icu-3.4-multiarchdevel.patch @@ -0,0 +1,70 @@ +--- icu/source/configure.in.orig	2006-05-02 12:10:31.000000000 +0100 ++++ icu/source/configure.in	2006-05-02 15:06:07.000000000 +0100 +@@ -1011,6 +1011,7 @@ + 		Makefile \ + 		data/icupkg.inc \ + 		config/Makefile.inc \ ++		config/icu.pc \ + 		data/Makefile \ + 		stubdata/Makefile \ + 		common/Makefile \ +--- /dev/null	2006-04-29 13:38:37.035974750 +0100 ++++ icu/source/config/icu.pc.in	2006-05-02 15:03:14.000000000 +0100 +@@ -0,0 +1,46 @@ ++prefix = @prefix@ ++bindir = @bindir@ ++exec_prefix = @exec_prefix@ ++libdir = @libdir@ ++includedir = @includedir@ ++datadir = @datadir@ ++sbindir = @sbindir@ ++mandir = @mandir@ ++sysconfdir = @sysconfdir@ ++CFLAGS = @CFLAGS@  ++CXXFLAGS = @CXXFLAGS@  ++DEFS = @DEFS@  ++UNICODE_VERSION=@UNICODE_VERSION@ ++ICUPREFIX=icu ++ICULIBSUFFIX=@ICULIBSUFFIX@ ++LIBICU=lib${ICUPREFIX} ++LIBCPPFLAGS=-D_REENTRANT ++CPPFLAGS=@CPPFLAGS@ ${LIBCPPFLAGS} -I${prefix}/include ++SHAREDLIBCPPFLAGS=-DPIC ++SHAREDLIBCXXFLAGS=-fPIC ++SHAREDLIBCFLAGS=-fPIC ++pkglibdir=${libdir}/@PACKAGE@${ICULIBSUFFIX}/@VERSION@ ++pkgdatadir=${datadir}/@PACKAGE@${ICULIBSUFFIX}/@VERSION@ ++ICUDATA_NAME = icudt@LIB_VERSION_MAJOR@@ICUDATA_CHAR@ ++ICUPKGDATA_DIR=@libdir@ ++ICUDATA_DIR=${pkgdatadir} ++SO=so ++ICULIBS_COMMON_LIB_NAME=${LIBICU}uc${ICULIBSUFFIX}.${SO} ++SHLIB_cc=cxx ${DEFS} ${CPPFLAGS} ${CXXFLAGS} @LDFLAGS@ -shared ++SHLIB_c=cc ${DEFS} ${CPPFLAGS} ${CFLAGS} @LDFLAGS@ -shared ++ICULIBS_LAYOUT = -l${ICUPREFIX}le${ICULIBSUFFIX} -l${ICUPREFIX}lx${ICULIBSUFFIX} ++ICULIBS_TOOLUTIL = -l${ICUPREFIX}tu${ICULIBSUFFIX} ++ICULIBS_OBSOLETE = -l${ICUPREFIX}obsolete${ICULIBSUFFIX} ++ICULIBS_ICUIO = -l${ICUPREFIX}io${ICULIBSUFFIX} ++ICULIBS_I18N = -l${ICUPREFIX}i18n${ICULIBSUFFIX} ++ICULIBS_COMMON = -l${ICUPREFIX}uc${ICULIBSUFFIX} ++ICULIBS_DATA = -l${ICUPREFIX}data${ICULIBSUFFIX} 
++ICULIBS_LIBSONLY = ${ICULIBS_I18N} ${ICULIBS_COMMON} ${ICULIBS_DATA} ++ICULIBS_SYSTEMLIBS = @LIBS@ ++ICULIBS_BASE = @LIBS@ -L${libdir} ++ICULIBS = ${ICULIBS_BASE} ${ICULIBS_I18N} ${ICULIBS_COMMON} ${ICULIBS_DATA} ++ ++Name: @PACKAGE@ ++Description: International Components for Unicode ++Version: @VERSION@ ++Libs: @LDFLAGS@ ${ICULIBS} @LIBS@ +--- icu/source/Makefile.in.orig	2006-05-02 12:10:31.000000000 +0100 ++++ icu/source/Makefile.in	2006-05-02 15:18:15.000000000 +0100 +@@ -125,6 +125,8 @@ + 	@$(MKINSTALLDIRS) $(DESTDIR)$(sbindir) + 	$(INSTALL_DATA) @platform_make_fragment@ $(DESTDIR)$(pkgdatadir)/config/@platform_make_fragment_name@ + 	$(INSTALL_SCRIPT) $(top_srcdir)/mkinstalldirs $(DESTDIR)$(pkgdatadir)/mkinstalldirs ++	@$(MKINSTALLDIRS) $(DESTDIR)$(libdir)/pkgconfig ++	$(INSTALL_DATA) $(top_srcdir)/config/icu.pc $(DESTDIR)$(libdir)/pkgconfig/icu.pc + 	$(INSTALL_DATA) $(top_srcdir)/../license.html $(DESTDIR)$(pkgdatadir)/license.html + 	$(INSTALL_SCRIPT) $(top_builddir)/config/icu-config $(DESTDIR)$(bindir)/icu-config + 	$(INSTALL_DATA) $(top_builddir)/config/Makefile.inc $(DESTDIR)$(pkglibdir)/Makefile.inc diff --git a/icu-config b/icu-config new file mode 100755 index 0000000..08f9ce8 --- /dev/null +++ b/icu-config @@ -0,0 +1,387 @@ +#!/bin/sh +## -*-sh-*- +#set -x +# BEGIN of icu-config-top +#****************************************************************************** +#   Copyright (C) 1999-2004, International Business Machines +#   Corporation and others.  All Rights Reserved. +#****************************************************************************** +# This script is designed to aid configuration of ICU. +# rpath links a library search path right into the binaries. + + +### END of icu-config-top + +## Zero out prefix. 
+exec_prefix=`pkg-config --variable=exec_prefix icu` +execprefix=$exec_prefix +prefix=`pkg-config --variable=prefix icu` + + +loaddefs() # (Re)load install paths, library name and version info from pkg-config's "icu" module +{ +LDLIBRARYPATH_ENVVAR="LD_LIBRARY_PATH" +bindir=`pkg-config --variable=bindir icu` +sbindir=`pkg-config --variable=sbindir icu` +libdir=`pkg-config --variable=libdir icu` +sysconfdir=`pkg-config --variable=sysconfdir icu` +mandir=`pkg-config --variable=mandir icu` +datadir=`pkg-config --variable=datadir icu` +pkglibdir=`pkg-config --variable=pkglibdir icu` +ICULIBS_COMMON_LIB_NAME=`pkg-config --variable=ICULIBS_COMMON_LIB_NAME icu` +UNICODE_VERSION=`pkg-config --variable=UNICODE_VERSION icu` +VERSION=`pkg-config --modversion icu` +SO=`pkg-config --variable=SO icu` + +## -*-sh-*- +## BEGIN of icu-config-bottom. +## Copyright (c) 2002-2004, International Business Machines Corporation and +## others. All Rights Reserved. + +ICUUC_FILE=${libdir}/${ICULIBS_COMMON_LIB_NAME} +     +#  echo ENABLE RPATH $ENABLE_RPATH and RPATHLDFLAGS=${RPATH_LDFLAGS} +if [ "x$PKGDATA_MODE" = "x" ]; then +    PKGDATA_MODE=dll +fi + +} + +## The actual code of icu-config goes here. + +ME=`basename "$0"` + +allflags() +{ +    echo "  --bindir               Print binary directory path (bin)" +    echo "  --cc                   Print C compiler used [CC]" +    echo "  --cflags               Print C compiler flags [CFLAGS]" +    echo "  --cflags-dynamic       Print additional C flags for" +    echo "                             building shared libraries." +    echo "  --cppflags             Print C Preprocessor flags [CPPFLAGS]" +    echo "  --cppflags-dynamic     Print additional C Preprocessor flags for" +    echo "                             building shared libraries." 
+    echo "  --cppflags-searchpath  Print only -I include directives  (-Iinclude)" +    echo "  --cxx                  Print C++ compiler used [CXX]" +    echo "  --cxxflags             Print C++ compiler flags [CXXFLAGS]" +    echo "  --cxxflags-dynamic     Print additional C++ flags for" +    echo "                             building shared libraries." +    echo "  --detect-prefix        Attempt to detect prefix based on PATH" +    echo "  --exec-prefix          Print prefix for executables (/bin)" +    echo "  --exists               Return with 0 status if ICU exists else fail" +    echo "  --help, -?, --usage    Print this message" +    echo "  --icudata              Print shortname of ICU data file (icudt21l)" +    echo "  --icudata-install-dir  Print path to install data to - use as --install option to pkgdata(1)" +    echo "  --icudata-mode         Print default ICU pkgdata mode (dll) - use as --mode option to pkgdata(1)." +    echo "  --icudatadir           Print path to packaged archive data. Can set as [ICU_DATA]" +    echo "  --invoke               Print commands to invoke an ICU program" +    echo "  --invoke=<prog>        Print commands to invoke an ICU program named <prog> (ex: genrb)"  +    echo "  --ldflags              Print -L search path and -l libraries to link with ICU [LDFLAGS].  This is for the data, uc (common), and i18n libraries only.  " +    echo "  --ldflags-layout       Print ICU layout engine link directive. Use in addition to --ldflags" +    echo "  --ldflags-libsonly     Same as --ldflags, but only the -l directives" +    echo "  --ldflags-searchpath   Print only -L (search path) directive" +    echo "  --ldflags-system       Print only system libs ICU links with (-lpthread, -lm)" +    echo "  --ldflags-icuio        Print ICU icuio link directive. Use in addition to --ldflags " +    echo "  --ldflags-obsolete     Print ICU obsolete link directive. Use in addition to --ldflags. (requires icuapps/obsolete to be built and installed.) 
" +    echo "  --mandir               Print manpage (man) path" +    echo "  --prefix               Print PREFIX to icu install (/usr/local)" +    echo "  --prefix=XXX           Set prefix to XXX for remainder of command" +    echo "  --sbindir              Print system binary path (sbin) " +    echo "  --shared-datadir       Print shared data (share) path. This is NOT the ICU data dir." +    echo "  --shlib-c              Print the command to compile and build C shared libraries with ICU" +    echo "  --shlib-cc             Print the command to compile and build C++ shared libraries with ICU" +    echo "  --sysconfdir           Print system config (etc) path" +    echo "  --unicode-version      Print version of Unicode data used in ICU ($UNICODE_VERSION)" +    echo "  --version              Print ICU version ($VERSION)" +    echo "  --incfile              Print path to Makefile.inc (for -O option of pkgdata)" +} + +## Print the one-line usage summary: the first 25 columns of each allflags line, wrapped in [ ] +shortusage() +{ +    echo "usage: ${ME} " `allflags | cut -c-25 | sed -e 's%.*%[ & ]%'` +} + + +usage() # Print the full help text: banner, make examples, then the complete option list from allflags +{ +    echo "${ME}: icu-config: ICU configuration helper script" +    echo +    echo "The most commonly used options will be --cflags, --cxxflags, --cppflags, and --ldflags." +    echo 'Example (in make):   CPPFLAGS=$(shell icu-config --cppflags)' +    echo '                     LDFLAGS=$(shell icu-config --ldflags)' +    echo "                     (etc).." +    echo +    echo "Usage:" +    allflags + +    echo  +    echo " [Brackets] show MAKE variable equivalents,  (parenthesis) show example output" +    echo +    echo "Copyright (c) 2002, International Business Machines Corporation and others. All Rights Reserved." +} + +## Check the sanity of current variables: ${ICUUC_FILE} must be a regular file, else report on stderr and exit 2 +sanity() +{ +    if [ ! -f "${ICUUC_FILE}" ]; +    then +	echo "### $ME: Can't find ${ICUUC_FILE} - ICU prefix is wrong."  
1>&2 +	echo "###      Try the --prefix= or --exec-prefix= options " 1>&2 +	echo "###      or --detect-prefix" 1>&2 +	echo "### $ME: Exitting." 1>&2 +	exit 2 +    fi +} + +## Main starts here. + +if [ $# -lt 1 ]; then +    shortusage +    exit 1 +fi + + +# Load our variables via loaddefs (values come from pkg-config) +# ALWAYS load twice because of dependencies +loaddefs +loaddefs +sanity + +while [ $# -gt 0 ]; +do +    arg="$1" +    var=`echo $arg | sed -e 's/^[^=]*=//'` +#    echo "### processing $arg" 1>&2 +    case "$arg" in + +        # undocumented. +	--debug) +	    set -x +	    ;; + +        --so) +            echo $SO +            ;; + +	--bindir) +	    echo $bindir +	    ;; + +	--libdir) +	    echo $libdir +	    ;; + +	--exists) +	    sanity +	    ;; + +	--sbindir) +	    echo $sbindir +	    ;; + +	--invoke=*) +	    QUOT="'" +            CMD="${var}" + +            # If it's not a locally executable command (1st choice) then  +            # search for it in the ICU directories.  +            if [ ! -x ${CMD} ]; then +                if [ -x ${bindir}/${var} ]; then +                    CMD="${bindir}/${var}" +                fi +                if [ -x ${sbindir}/${var} ]; then +                    CMD="${sbindir}/${var}" +                fi +            fi + +	    echo "env ${QUOT}${LDLIBRARYPATH_ENVVAR}=${libdir}:"'${'"${LDLIBRARYPATH_ENVVAR}"'}'${QUOT} ${CMD} +	    ;; + +	--invoke) +	    QUOT="'" +	    echo "env ${QUOT}${LDLIBRARYPATH_ENVVAR}=${libdir}:"'${'"${LDLIBRARYPATH_ENVVAR}"'}'${QUOT} +	    ;; + +	--cflags) +            pkg-config --variable=CFLAGS icu +	    ;; +	     +	--cc) +	    echo cc +	    ;; +	     +	--cxx) +	    echo c++ +	    ;; + +	--cxxflags) +            pkg-config --variable=CXXFLAGS icu +	    ;; + +	--cppflags) +	    # Don't echo the -I. - it's unneeded. +            CPPFLAGS=`pkg-config --variable=CPPFLAGS icu` +	    echo $CPPFLAGS | sed -e 's/-I. 
//' +	    ;; + +	--cppflags-searchpath) +	    echo -I${prefix}/include +	    ;; + +	--cppflags-dynamic) +            pkg-config --variable=SHAREDLIBCPPFLAGS icu +	    ;; + +	--cxxflags-dynamic) +            pkg-config --variable=SHAREDLIBCXXFLAGS icu +	    ;; + +	--cflags-dynamic) +            pkg-config --variable=SHAREDLIBCFLAGS icu +	    ;; + +	--ldflags-system) +            pkg-config --variable=ICULIBS_SYSTEMLIBS icu +	    ;; + +	--ldflags) +            pkg-config --libs icu +# $RPATH_LDFLAGS +	    ;; + +	--ldflags-libsonly) +            pkg-config --variable=ICULIBS_LIBSONLY icu +	    ;; + +	--ldflags-icuio) +            pkg-config --variable=ICULIBS_ICUIO icu +	    ;; + +	--ldflags-obsolete) +            pkg-config --variable=ICULIBS_OBSOLETE icu +	    ;; + +	--ldflags-toolutil) +            pkg-config --variable=ICULIBS_TOOLUTIL icu +	    ;; + +	--ldflags-layout) +            pkg-config --variable=ICULIBS_LAYOUT icu +	    ;; + +	--ldflags-searchpath) +	    echo -L${libdir} +	    ;; + +	--detect-prefix) +	    HERE=`echo $0 | sed -e "s/$ME//g"` +	    if [ -f $HERE/../lib/${ICULIBS_COMMON_LIB_NAME} ]; then +		prefix=$HERE/.. 
+		echo "## Using --prefix=${prefix}" 1>&2 +	    fi +	    loaddefs +	    loaddefs +	    sanity +	    ;; + +	--exec-prefix) +	    echo $exec_prefix +	    ;; + +	--prefix) +	    echo $prefix +	    ;; + +	--prefix=*) +	    prefix=$var +	    loaddefs +	    loaddefs +	    sanity +	    ;; + +	--sysconfdir) +	    echo $sysconfdir +	    ;; + +	--mandir) +	    echo $mandir +	    ;; + +	--shared-datadir) +	    echo $datadir +	    ;; + +        --incfile) +	    echo $pkglibdir/Makefile.inc +	    ;; + +	--icudata) +            pkg-config --variable=ICUDATA_NAME icu +	    ;; + +	--icudata-mode) +	    echo $PKGDATA_MODE +	    ;; + +	--icudata-install-dir) +            pkg-config --variable=ICUPKGDATA_DIR icu +	    ;; +	     +	--icudatadir) +            pkg-config --variable=ICUDATA_DIR icu +	    ;; + +	--shlib-c) +            pkg-config --variable=SHLIB_c icu +	    ;; + +	--shlib-cc) +            pkg-config --variable=SHLIB_cc icu +	    ;; + +	--version) +            echo $VERSION +            ;; + +        --unicode-version) +            echo $UNICODE_VERSION +            ;; + +	--help) +	    usage +	    exit 0 +	    ;; + +	--usage) +	    usage +	    exit 0 +	    ;; + +#	--enable-rpath=*) +#	    ENABLE_RPATH=$var +#	    loaddefs +#	    ;; + +	-?) +	    usage +	    exit 0 +	    ;; + +        *) +	    echo ${ME}: ERROR Unknown Option $arg 1>&2 +            echo 1>&2 +            shortusage 1>&2 +	    echo "### $ME: Exitting." 
1>&2 +            exit 1; +            ;; +    esac +    shift +done + +# Check once before we quit (will check last used prefix) +sanity +## END of icu-config-bottom + +exit 0 + diff --git a/icu.icu5365.dependantvowels.patch b/icu.icu5365.dependantvowels.patch new file mode 100644 index 0000000..5708018 --- /dev/null +++ b/icu.icu5365.dependantvowels.patch @@ -0,0 +1,11 @@ +--- icu/source/layout/IndicReordering.cpp.orig	2006-09-05 17:01:15.000000000 +0100 ++++ icu/source/layout/IndicReordering.cpp	2006-09-05 17:01:19.000000000 +0100 +@@ -377,7 +377,7 @@ +     {-1,  6,  1, -1, -1, -1, -1, -1, -1,  5,  9,  5,  5,  4, 12}, //  2 - consonant with nukta +     {-1,  6,  1, -1, -1, -1, -1, -1,  2,  5,  9,  5,  5,  4, 12}, //  3 - consonant +     {-1, -1, -1, -1, -1, -1,  3,  2, -1, -1, -1, -1, -1, -1,  7}, //  4 - consonant virama +-    {-1,  6,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, //  5 - dependent vowels ++    {-1,  6,  1, -1, -1, -1, -1, -1, -1,  5, -1, -1, -1, -1, -1}, //  5 - dependent vowels +     {-1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, //  6 - vowel mark +     {-1, -1, -1, -1, -1, -1,  3,  2, -1, -1, -1, -1, -1, -1, -1}, //  7 - consonant virama ZWJ, consonant ZWJ virama +     {-1,  6,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  4, -1}, //  8 - independent vowels that can take a virama diff --git a/icu.icu5418.malayam.patch b/icu.icu5418.malayam.patch new file mode 100644 index 0000000..03fbe63 --- /dev/null +++ b/icu.icu5418.malayam.patch @@ -0,0 +1,39 @@ +--- icu/source/layout/IndicClassTables.cpp.orig	2006-08-23 01:12:40.000000000 +0100 ++++ icu/source/layout/IndicClassTables.cpp	2006-09-25 09:06:38.000000000 +0100 +@@ -173,6 +173,19 @@ +     _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx  // 0CE0 - 0CEF + }; +  ++#if 1 ++//use the pango char class table here ++static const IndicClassTable::CharClass mlymCharClasses[] = ++{ ++    _xx, _xx, _mp, _mp, _xx, _iv, _iv, _iv, _iv, _iv, _iv, _iv, 
_iv, _xx, _iv, _iv, /* 0D00 - 0D0F */ ++    _iv, _xx, _iv, _iv, _iv, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, /* 0D10 - 0D1F */ ++    _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _xx, _ct, _ct, _ct, _ct, _ct, _pb, /* 0D20 - 0D2F */ ++    _pb, _cn, _ct, _ct, _ct, _pb, _ct, _ct, _ct, _ct, _xx, _xx, _xx, _xx, _dr, _dr, /* 0D30 - 0D3F */ ++    _dr, _dr, _dr, _dr, _xx, _xx, _dl, _dl, _dl, _xx, _s1, _s2, _s3, _vr, _xx, _xx, /* 0D40 - 0D4F */ ++    _xx, _xx, _xx, _xx, _xx, _xx, _xx, _dr, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, /* 0D50 - 0D5F */ ++    _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx  /* 0D60 - 0D6F */ ++}; ++#else + // FIXME: this is correct for old-style Malayalam (MAL) but not for reformed Malayalam (MLR) + // FIXME: should there be a REPH for old-style Malayalam? + static const IndicClassTable::CharClass mlymCharClasses[] = +@@ -185,6 +198,7 @@ +     _xx, _xx, _xx, _xx, _xx, _xx, _xx, _m2, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 0D50 - 0D5F +     _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx  // 0D60 - 0D6F + }; ++#endif +   + static const IndicClassTable::CharClass sinhCharClasses[] = + { +@@ -232,7 +246,7 @@ + #define TAML_SCRIPT_FLAGS (SF_MPRE_FIXUP | SF_NO_POST_BASE_LIMIT | SF_FILTER_ZERO_WIDTH) + #define TELU_SCRIPT_FLAGS (SF_MATRAS_AFTER_BASE | SF_FILTER_ZERO_WIDTH | 3) + #define KNDA_SCRIPT_FLAGS (SF_MATRAS_AFTER_BASE | SF_FILTER_ZERO_WIDTH | 3) +-#define MLYM_SCRIPT_FLAGS (SF_MPRE_FIXUP | SF_NO_POST_BASE_LIMIT | SF_FILTER_ZERO_WIDTH) ++#define MLYM_SCRIPT_FLAGS (SF_MPRE_FIXUP | SF_NO_POST_BASE_LIMIT) + #define SINH_SCRIPT_FLAGS (SF_MPRE_FIXUP | SF_NO_POST_BASE_LIMIT) +  + // diff --git a/icu.icu5431.malayam.patch b/icu.icu5431.malayam.patch new file mode 100644 index 0000000..48a549d --- /dev/null +++ b/icu.icu5431.malayam.patch @@ -0,0 +1,107 @@ +--- icu.orig/source/layout/IndicReordering.cpp	2006-12-21 09:24:42.000000000 +0000 ++++ 
icu/source/layout/IndicReordering.cpp	2006-12-21 09:16:15.000000000 +0000 +@@ -50,6 +50,14 @@ + #define distFeatureMask 0x00010000UL + #define initFeatureMask 0x00008000UL +  ++// TODO: Find better names for these! ++#define tagArray4 (loclFeatureMask | nuktFeatureMask | akhnFeatureMask | vatuFeatureMask | presFeatureMask | blwsFeatureMask | abvsFeatureMask | pstsFeatureMask | halnFeatureMask | blwmFeatureMask | abvmFeatureMask | distFeatureMask) ++#define tagArray3 (pstfFeatureMask | tagArray4) ++#define tagArray2 (halfFeatureMask | tagArray3) ++#define tagArray1 (blwfFeatureMask | tagArray2) ++#define tagArray0 (rphfFeatureMask | tagArray1) ++ ++ + class IndicReorderingOutput : public UMemory { + private: +     le_int32   fOutIndex; +@@ -154,6 +162,27 @@ +         fSMabove = fSMbelow = 0; +     } +  ++    void swapChars(int a, int b) // Swap the character and char index of output slots fOutIndex+a and fOutIndex+b; aux data is handled as noted below ++    { ++	LEErrorCode success = LE_NO_ERROR; // passed to every fGlyphStorage call below; never inspected in this method ++        LEUnicode temp_char; ++        le_uint32 temp_index; ++        FeatureMask temp_tag; ++ ++        temp_char = fOutChars[fOutIndex + b]; ++	temp_index = fGlyphStorage.getCharIndex(fOutIndex + b, success); ++        temp_tag = fGlyphStorage.getAuxData(fOutIndex + b, success); // slot b is now saved in the temp_* locals ++ ++        fOutChars[fOutIndex + b] = fOutChars[fOutIndex + a]; ++        le_uint32 toswap = fGlyphStorage.getCharIndex(fOutIndex + a, success); ++        fGlyphStorage.setCharIndex(fOutIndex + b,  toswap, success); ++        fGlyphStorage.setAuxData(fOutIndex + b, tagArray3, success); // NOTE(review): slot b gets the fixed tagArray3 mask, not slot a's previous aux data - confirm this is intended ++ ++        fOutChars[fOutIndex + a] = temp_char; ++        fGlyphStorage.setCharIndex(fOutIndex + a, temp_index, success); ++        fGlyphStorage.setAuxData(fOutIndex + a, temp_tag, success); // slot a ends up with slot b's original character, char index and aux data ++    } ++ +     void writeChar(LEUnicode ch, le_uint32 charIndex, FeatureMask charFeatures) +     { +         LEErrorCode success = LE_NO_ERROR; +@@ -335,13 +364,6 @@ +     C_DOTTED_CIRCLE = 0x25CC + }; +  +-// TODO: Find better names for these! 
+-#define tagArray4 (loclFeatureMask | nuktFeatureMask | akhnFeatureMask | vatuFeatureMask | presFeatureMask | blwsFeatureMask | abvsFeatureMask | pstsFeatureMask | halnFeatureMask | blwmFeatureMask | abvmFeatureMask | distFeatureMask) +-#define tagArray3 (pstfFeatureMask | tagArray4) +-#define tagArray2 (halfFeatureMask | tagArray3) +-#define tagArray1 (blwfFeatureMask | tagArray2) +-#define tagArray0 (rphfFeatureMask | tagArray1) +- + static const FeatureMap featureMap[] = + { +     {loclFeatureTag, loclFeatureMask}, +@@ -629,6 +651,21 @@ +                 output.writeChar(chars[i], i, tagArray4); +             } +  ++            /* for the special conjuction of Cons+0x0d4d+0x0d31 or Cons+0x0d4d+0x0d30 of Malayalam */ ++            if ((baseConsonant - 2 >= 0) && ++                (chars[baseConsonant - 1] == 0x0d4d) && ++		((chars[baseConsonant] == 0x0d31) ||  ++		 (chars[baseConsonant] == 0x0d30)) && ++                ((chars[baseConsonant - 2] >= 0x0d15) && ++                 (chars[baseConsonant - 2] <= 0x0d39)))  { ++               if (baseConsonant < 3 || chars[baseConsonant - 3] != 0x0d4d) { ++                    output.swapChars(-1, -3); ++ ++		    if (mpreFixups) ++		        mpreFixups->reduce(); ++		} ++            } ++ +             if ((classTable->scriptFlags & SF_MATRAS_AFTER_BASE) != 0) { +                 output.writeMbelow(); +                 output.writeSMbelow(); // FIXME: there are no SMs in these scripts... 
+--- icu.orig/source/layout/MPreFixups.h	2006-11-10 09:42:47.000000000 +0000 ++++ icu/source/layout/MPreFixups.h	2006-12-21 09:13:47.000000000 +0000 +@@ -31,6 +31,8 @@ +      +     void apply(LEGlyphStorage &glyphStorage); +  ++    void reduce(); ++ + private: +     FixupData *fFixupData; +     le_int32   fFixupCount; +--- icu.orig/source/layout/MPreFixups.cpp	2006-11-10 09:42:47.000000000 +0000 ++++ icu/source/layout/MPreFixups.cpp	2006-12-21 09:16:33.000000000 +0000 +@@ -40,6 +40,12 @@ +     } + } +  ++void MPreFixups::reduce() ++{ ++    if (fFixupCount > 0) ++        fFixupCount--; ++} ++ + void MPreFixups::apply(LEGlyphStorage &glyphStorage) + { +     for (le_int32 fixup = 0; fixup < fFixupCount; fixup += 1) { diff --git a/icu.icu5433.oriya.patch b/icu.icu5433.oriya.patch new file mode 100644 index 0000000..f35f5a2 --- /dev/null +++ b/icu.icu5433.oriya.patch @@ -0,0 +1,31 @@ +diff -ru icu.orig/source/layout/IndicClassTables.cpp icu/source/layout/IndicClassTables.cpp +--- icu.orig/source/layout/IndicClassTables.cpp	2006-10-03 14:27:47.000000000 +0100 ++++ icu/source/layout/IndicClassTables.cpp	2006-10-03 14:30:07.000000000 +0100 +@@ -120,6 +120,19 @@ +     _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx  // 0AE0 - 0AEF + }; +  ++#if 1 ++static const IndicClassTable::CharClass oryaCharClasses[] = ++{ ++    _xx, _ma, _mp, _mp, _xx, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _xx, _xx, _iv, /* 0B00 - 0B0F */ ++    _iv, _xx, _xx, _iv, _iv, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _ct, _bb, /* 0B10 - 0B1F */ ++    _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _xx, _bb, _bb, _bb, _bb, _bb, _pb, /* 0B20 - 0B2F */ ++    _rb, _xx, _bb, _bb, _xx, _bb, _bb, _bb, _bb, _bb, _xx, _xx, _nu, _xx, _dr, _da, /* 0B30 - 0B3F */ ++    _dr, _db, _db, _db, _xx, _xx, _xx, _dl, _s1, _xx, _xx, _s2, _s3, _vr, _xx, _xx, /* 0B40 - 0B4F */ ++    _xx, _xx, _xx, _xx, _xx, _xx, _da, _dr, _xx, _xx, _xx, _xx, _cn, _cn, _xx, _pb, /* 0B50 - 0B5F */ ++    _iv, _iv, _xx, 
_xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, /* 0B60 - 0B6F */ ++    _xx, _bb                                                                        /* 0B70 - 0B71 */ ++}; ++#else + static const IndicClassTable::CharClass oryaCharClasses[] = + { +     _xx, _ma, _mp, _mp, _xx, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _xx, _xx, _iv, // 0B00 - 0B0F +@@ -131,6 +144,7 @@ +     _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 0B60 - 0B6F +     _xx, _ct                                                                        // 0B70 - 0B71 + }; ++#endif +  + static const IndicClassTable::CharClass tamlCharClasses[] = + { diff --git a/icu.icu5465.telegu.patch b/icu.icu5465.telegu.patch new file mode 100644 index 0000000..7e80103 --- /dev/null +++ b/icu.icu5465.telegu.patch @@ -0,0 +1,29 @@ +--- icu.orig/source/layout/IndicClassTables.cpp	2007-02-05 14:44:17.000000000 +0000 ++++ icu/source/layout/IndicClassTables.cpp	2007-02-05 14:47:49.000000000 +0000 +@@ -145,6 +145,7 @@ + }; +  + // FIXME: Should some of the bb's be pb's? (KA, NA, MA, YA, VA, etc. 
(approx 13)) ++#if 0 + static const IndicClassTable::CharClass teluCharClasses[] = + { +     _xx, _mp, _mp, _mp, _xx, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _xx, _iv, _iv, // 0C00 - 0C0F +@@ -155,6 +156,18 @@ +     _xx, _xx, _xx, _xx, _xx, _da, _m2, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 0C50 - 0C5F +     _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx  // 0C60 - 0C6F + }; ++#else ++static const IndicClassTable::CharClass teluCharClasses[] = ++{    ++    _xx, _mp, _mp, _mp, _xx, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _xx, _iv, _iv, /* 0C00 - 0C0F */ ++    _iv, _xx, _iv, _iv, _iv, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, /* 0C10 - 0C1F */ ++    _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _xx, _bb, _bb, _bb, _bb, _bb, _bb, /* 0C20 - 0C2F */ ++    _bb, _bb, _bb, _bb, _xx, _bb, _bb, _bb, _bb, _bb, _xx, _xx, _xx, _xx, _da, _da, /* 0C30 - 0C3F */ ++    _da, _dr, _dr, _dr, _dr, _xx, _da, _da, _s1, _xx, _da, _da, _da, _vr, _xx, _xx, /* 0C40 - 0C4F */ ++    _xx, _xx, _xx, _xx, _xx, _da, _db, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, /* 0C50 - 0C5F */ ++    _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx  /* 0C60 - 0C6F */ ++}; ++#endif +  + // U+CC3 and U+CC4 are _lm here not _dr since the Kannada rendering + // rules want them below and to the right of the entire cluster diff --git a/icu.icu5483.backport.patch b/icu.icu5483.backport.patch new file mode 100644 index 0000000..039dee2 --- /dev/null +++ b/icu.icu5483.backport.patch @@ -0,0 +1,874 @@ +diff -ru icu.orig/source/common/ucnv2022.c icu/source/common/ucnv2022.c +--- icu.orig/source/common/ucnv2022.c	2009-06-02 11:48:38.000000000 +0100 ++++ icu/source/common/ucnv2022.c	2009-06-02 12:30:29.000000000 +0100 +@@ -84,6 +84,26 @@ + #define V_TAB   0x0B + #define SPACE   0x20 +  ++enum { ++    HWKANA_START=0xff61, ++    HWKANA_END=0xff9f ++}; ++ ++/* ++ * 94-character sets with native byte values A1..FE are encoded in ISO 2022 ++ * as 
bytes 21..7E. (Subtract 0x80.) ++ * 96-character sets with native byte values A0..FF are encoded in ISO 2022 ++ * as bytes 20..7F. (Subtract 0x80.) ++ * Do not encode C1 control codes with native bytes 80..9F ++ * as bytes 00..1F (C0 control codes). ++ */ ++enum { ++    GR94_START=0xa1, ++    GR94_END=0xfe, ++    GR96_START=0xa0, ++    GR96_END=0xff ++}; ++ + /* +  * ISO 2022 control codes must not be converted from Unicode +  * because they would mess up the byte stream. +@@ -981,22 +1001,27 @@ +  +  + /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c +- * any future change in _MBCSFromUChar32() function should be reflected in  +- * this macro ++ * any future change in _MBCSFromUChar32() function should be reflected here. ++ * @return number of bytes in *value; negative number if fallback; 0 if no mapping +  */ +-static U_INLINE void  ++static U_INLINE int32_t + MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData, +                                          UChar32 c,   +                                          uint32_t* value,  +                                          UBool useFallback,  +-                                         int32_t *length,  +                                          int outputType) + { +     const int32_t *cx; +     const uint16_t *table; +     uint32_t stage2Entry; +     uint32_t myValue; ++    int32_t length; +     const uint8_t *p; ++    /* ++     * TODO(markus): Use and require new, faster MBCS conversion table structures. ++     * Use internal version of ucnv_open() that verifies that the new structures are available, ++     * else U_INTERNAL_PROGRAM_ERROR. 
++     */ +     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ +     if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { +         table=sharedData->mbcs.fromUnicodeTable; +@@ -1005,51 +1030,60 @@ +         if(outputType==MBCS_OUTPUT_2){ +             myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); +             if(myValue<=0xff) { +-                *length=1; ++                length=1; +             } else { +-                *length=2; ++                length=2; +             } +         } else /* outputType==MBCS_OUTPUT_3 */ { +             p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); +             myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; +             if(myValue<=0xff) { +-                *length=1; ++                length=1; +             } else if(myValue<=0xffff) { +-                *length=2; ++                length=2; +             } else { +-                *length=3; ++                length=3; +             } +         } ++        /* ++         * TODO(markus): Use Shift-JIS table for JIS X 0208, to save mapping table space. ++         * Pass in parameter for type of output bytes, for validation and shifting: ++         * - Direct: Pass bytes through, but forbid control codes 00-1F (except SI/SO/ESC) and space 20? ++         *   (Need to allow some (TAB/LF/CR) or most of them for ASCII and maybe JIS X 0201.) ++         * - A1-FE: Subtract 80 after range check. ++         * - SJIS: Shift DBCS result to 21-7E x 21-7E. ++         */ +         /* is this code point assigned, or do we use fallbacks? 
*/ +-        if( (stage2Entry&(1<<(16+(c&0xf))))!=0 || +-            (FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) +-        ) { ++        if((stage2Entry&(1<<(16+(c&0xf))))!=0) { ++            /* assigned */ ++            *value=myValue; ++            return length; ++        } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) { +             /* +              * We allow a 0 byte output if the "assigned" bit is set for this entry. +              * There is no way with this data structure for fallback output +              * to be a zero byte. +              */ +-            /* assigned */ +             *value=myValue; +-            return; ++            return -length; +         } +     } +  +     cx=sharedData->mbcs.extIndexes; +     if(cx!=NULL) { +-        *length=ucnv_extSimpleMatchFromU(cx, c, value, useFallback); +-        return; ++        return ucnv_extSimpleMatchFromU(cx, c, value, useFallback); +     } +  +     /* unassigned */ +-    *length=0; ++    return 0; + } +  + /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c +- * any future change in _MBCSSingleFromUChar32() function should be reflected in  +- * this macro ++ * any future change in _MBCSSingleFromUChar32() function should be reflected here. 
++ * @param retval pointer to output byte ++ * @return 1 roundtrip byte  0 no mapping  -1 fallback byte +  */ +-static U_INLINE void  ++static U_INLINE int32_t + MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, +                                        UChar32 c,  +                                        uint32_t* retval,  +@@ -1059,20 +1093,21 @@ +     int32_t value; +     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ +     if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { +-        *retval=(uint16_t)-1; +-        return; ++        return 0; +     } +     /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ +     table=sharedData->mbcs.fromUnicodeTable; +     /* get the byte for the output */ +     value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); +     /* is this code point assigned, or do we use fallbacks? */ +-    if(useFallback ? value>=0x800 : value>=0xc00) { +-        value &=0xff; ++    *retval=(uint32_t)(value&0xff); ++    if(value>=0xf00) { ++        return 1;  /* roundtrip */ ++    } else if(useFallback ? 
value>=0x800 : value>=0xc00) { ++        return -1;  /* fallback taken */ +     } else { +-        value= -1; ++        return 0;  /* no mapping */ +     } +-    *retval=(uint16_t) value; + } +  + #ifdef U_ENABLE_GENERIC_ISO_2022 +@@ -1316,6 +1351,7 @@ +  + static void  + UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { ++    UConverter *cnv = args->converter; +     UConverterDataISO2022 *converterData; +     ISO2022State *pFromU2022State; +     uint8_t *target = (uint8_t *) args->target; +@@ -1335,14 +1371,13 @@ +     int8_t cs, g; +  +     /* set up the state */ +-    converterData     = (UConverterDataISO2022*)args->converter->extraInfo; ++    converterData     = (UConverterDataISO2022*)cnv->extraInfo; +     pFromU2022State   = &converterData->fromU2022State; +-    useFallback       = args->converter->useFallback; +  +     choiceCount = 0; +  +     /* check if the last codepoint of previous buffer was a lead surrogate*/ +-    if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) { ++    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { +         goto getTrail; +     } +  +@@ -1361,26 +1396,26 @@ +                         if(UTF_IS_SECOND_SURROGATE(trail)) { +                             source++; +                             sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); +-                            args->converter->fromUChar32=0x00; ++                            cnv->fromUChar32=0x00; +                             /* convert this supplementary code point */ +                             /* exit this condition tree */ +                         } else { +                             /* this is an unmatched lead code unit (1st surrogate) */ +                             /* callback(illegal) */ +                             *err=U_ILLEGAL_CHAR_FOUND; +-                            args->converter->fromUChar32=sourceChar; ++                            
cnv->fromUChar32=sourceChar; +                             break; +                         } +                     } else { +                         /* no more input */ +-                        args->converter->fromUChar32=sourceChar; ++                        cnv->fromUChar32=sourceChar; +                         break; +                     } +                 } else { +                     /* this is an unmatched trail code unit (2nd surrogate) */ +                     /* callback(illegal) */ +                     *err=U_ILLEGAL_CHAR_FOUND; +-                    args->converter->fromUChar32=sourceChar; ++                    cnv->fromUChar32=sourceChar; +                     break; +                 } +             } +@@ -1389,7 +1424,7 @@ +             if(IS_2022_CONTROL(sourceChar)) { +                 /* callback(illegal) */ +                 *err=U_ILLEGAL_CHAR_FOUND; +-                args->converter->fromUChar32=sourceChar; ++                cnv->fromUChar32=sourceChar; +                 break; +             } +  +@@ -1407,9 +1442,10 @@ +  +                 /* JIS7/8: try single-byte half-width Katakana before JISX208 */ +                 if(converterData->version == 3 || converterData->version == 4) { +-                    choices[choiceCount++] = cs = (int8_t)HWKANA_7BIT; +-                    csm &= ~CSM(cs); ++                    choices[choiceCount++] = (int8_t)HWKANA_7BIT; +                 } ++                /* Do not try single-byte half-width Katakana for other versions. 
*/ ++                csm &= ~CSM(HWKANA_7BIT); +  +                 /* try the current G0 charset */ +                 choices[choiceCount++] = cs = pFromU2022State->cs[0]; +@@ -1432,86 +1468,134 @@ +             } +  +             cs = g = 0; ++            /* ++             * len==0: no mapping found yet ++             * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks ++             * len>0: found a roundtrip result, done ++             */ +             len = 0; ++            /* ++             * We will turn off useFallback after finding a fallback, ++             * but we still get fallbacks from PUA code points as usual. ++             * Therefore, we will also need to check that we don't overwrite ++             * an early fallback with a later one. ++             */ ++            useFallback = cnv->useFallback; +  +-            for(i = 0; i < choiceCount && len == 0; ++i) { +-                cs = choices[i]; +-                switch(cs) { ++            for(i = 0; i < choiceCount && len <= 0; ++i) { ++                uint32_t value; ++                int32_t len2; ++                int8_t cs0 = choices[i]; ++                switch(cs0) { +                 case ASCII: +                     if(sourceChar <= 0x7f) { +                         targetValue = (uint32_t)sourceChar; +                         len = 1; ++                        cs = cs0; ++                        g = 0; +                     } +                     break; +                 case ISO8859_1: +-                    if(0x80 <= sourceChar && sourceChar <= 0xff) { ++                    if(GR96_START <= sourceChar && sourceChar <= GR96_END) { +                         targetValue = (uint32_t)sourceChar - 0x80; +                         len = 1; ++                        cs = cs0; +                         g = 2; +                     } +                     break; +                 case HWKANA_7BIT: +-                    
if((uint32_t)(0xff9f-sourceChar)<=(0xff9f-0xff61)) { +-                        targetValue = (uint32_t)(sourceChar - (0xff61 - 0x21)); +-                        len = 1; +- ++                    if((uint32_t)(HWKANA_END-sourceChar)<=(HWKANA_END-HWKANA_START)) { +                         if(converterData->version==3) { +                             /* JIS7: use G1 (SO) */ +-                            pFromU2022State->cs[1] = cs; /* do not output an escape sequence */ ++                            /* Shift U+FF61..U+FF9F to bytes 21..5F. */ ++                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21)); ++                            len = 1; ++                            pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */ +                             g = 1; +                         } else if(converterData->version==4) { +                             /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ +-                            int8_t cs0; +- +-                            targetValue += 0x80; ++                            /* Shift U+FF61..U+FF9F to bytes A1..DF. 
*/ ++                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1)); ++                            len = 1; +  +-                            cs0 = pFromU2022State->cs[0]; +-                            if(IS_JP_DBCS(cs0)) { ++                            cs = pFromU2022State->cs[0]; ++                            if(IS_JP_DBCS(cs)) { +                                 /* switch from a DBCS charset to JISX201 */ +                                 cs = (int8_t)JISX201; +-                            } else { +-                                /* stay in the current G0 charset */ +-                                cs = cs0; +                             } ++                            /* else stay in the current G0 charset */ ++                            g = 0; +                         } ++                        /* else do not use HWKANA_7BIT with other versions */ +                     } +                     break; +                 case JISX201: +                     /* G0 SBCS */ +-                    MBCS_SINGLE_FROM_UCHAR32( +-                        converterData->myConverterArray[cs], +-                        sourceChar, &targetValue, +-                        useFallback); +-                    if(targetValue <= 0x7f) { +-                        len = 1; ++                    len2 = MBCS_SINGLE_FROM_UCHAR32( ++                                converterData->myConverterArray[cs0], ++                                sourceChar, &value, ++                                useFallback); ++                    if(len2 != 0 && !(len2 < 0 && len != 0) && value <= 0x7f) { ++                        targetValue = value; ++                        len = len2; ++                        cs = cs0; ++                        g = 0; ++                        useFallback = FALSE; +                     } +                     break; +                 case ISO8859_7: +                     /* G0 SBCS forced to 7-bit output */ +-                    
MBCS_SINGLE_FROM_UCHAR32( +-                        converterData->myConverterArray[cs], +-                        sourceChar, &targetValue, +-                        useFallback); +-                    if(0x80 <= targetValue && targetValue <= 0xff) { +-                        targetValue -= 0x80; +-                        len = 1; ++                    len2 = MBCS_SINGLE_FROM_UCHAR32( ++                                converterData->myConverterArray[cs0], ++                                sourceChar, &value, ++                                useFallback); ++                    if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) { ++                        targetValue = value - 0x80; ++                        len = len2; ++                        cs = cs0; +                         g = 2; ++                        useFallback = FALSE; +                     } +                     break; +                 default: +                     /* G0 DBCS */ +-                    MBCS_FROM_UCHAR32_ISO2022( +-                        converterData->myConverterArray[cs], +-                        sourceChar, &targetValue, +-                        useFallback, &len, MBCS_OUTPUT_2); +-                    if(len != 2) { +-                        len = 0; ++                    len2 = MBCS_FROM_UCHAR32_ISO2022( ++                                converterData->myConverterArray[cs0], ++                                sourceChar, &value, ++                                useFallback, MBCS_OUTPUT_2); ++                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */ ++                        if(cs0 == KSC5601) { ++                            /* ++                             * Check for valid bytes for the encoding scheme. ++                             * This is necessary because the sub-converter (windows-949) ++                             * has a broader encoding scheme than is valid for 2022. 
++                             * ++                             * Check that the result is a 2-byte value with each byte in the range A1..FE ++                             * (strict EUC-KR DBCS) before accepting it and subtracting 0x80 from each byte ++                             * to move it to the ISO 2022 range 21..7E. ++                             */ ++                            if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && ++                                (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) ++                            ) { ++                                value -= 0x8080;  /* shift down to 21..7e byte range */ ++                            } else { ++                                break;  /* not valid for ISO 2022 */ ++                            } ++                        } ++                        targetValue = value; ++                        len = len2; ++                        cs = cs0; ++                        g = 0; ++                        useFallback = FALSE; +                     } +                     break; +                 } +             } +  +-            if(len > 0) { ++            if(len != 0) { ++                if(len < 0) { ++                    len = -len;  /* fallback */ ++                } +                 outLen = 0; /* count output bytes */ +  +                 /* write SI if necessary (only for JIS7) */ +@@ -1560,7 +1644,7 @@ +                  * then this is an error +                  */ +                 *err = U_INVALID_CHAR_FOUND; +-                args->converter->fromUChar32=sourceChar; ++                cnv->fromUChar32=sourceChar; +                 break; +             } +  +@@ -1586,7 +1670,7 @@ +                 } +             } else { +                 fromUWriteUInt8( +-                    args->converter, ++                    cnv, +                     buffer, outLen, +                     &target, (const char *)targetLimit, +                     &offsets, (int32_t)(source - args->source - 
U16_LENGTH(sourceChar)), +@@ -1615,7 +1699,7 @@ +      */ +     if( U_SUCCESS(*err) && +         (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) && +-        args->flush && source>=sourceLimit && args->converter->fromUChar32==0 ++        args->flush && source>=sourceLimit && cnv->fromUChar32==0 +     ) { +         int32_t sourceIndex; +  +@@ -1654,7 +1738,7 @@ +         } +  +         fromUWriteUInt8( +-            args->converter, ++            cnv, +             buffer, outLen, +             &target, (const char *)targetLimit, +             &offsets, sourceIndex, +@@ -1777,7 +1861,7 @@ +                     !IS_JP_DBCS(cs) +                 ) { +                     /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */ +-                    targetUniChar = mySourceChar + (0xff61 - 0xa1); ++                    targetUniChar = mySourceChar + (HWKANA_START - 0xa1); +  +                     /* return from a single-shift state to the previous one */ +                     if(pToU2022State->g >= 2) { +@@ -1818,7 +1902,7 @@ +                 case HWKANA_7BIT: +                     if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) { +                         /* 7-bit halfwidth Katakana */ +-                        targetUniChar = mySourceChar + (0xff61 - 0x21); ++                        targetUniChar = mySourceChar + (HWKANA_START - 0x21); +                     } +                     break; +                 default: +@@ -1965,9 +2049,10 @@ +                 break; +             } +  +-           /* length= ucnv_MBCSFromUChar32(converterData->currentConverter->sharedData, +-                sourceChar,&targetByteUnit,args->converter->useFallback);*/ +-            MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,&length,MBCS_OUTPUT_2); ++            length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2); ++            if(length < 0) { ++                length = -length;  /* 
fallback */ ++            } +             /* only DBCS or SBCS characters are expected*/ +             /* DB characters with high bit set to 1 are expected */ +             if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){ +@@ -2449,7 +2534,7 @@ +  + static void  + UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ +- ++    UConverter *cnv = args->converter; +     UConverterDataISO2022 *converterData; +     ISO2022State *pFromU2022State; +     uint8_t *target = (uint8_t *) args->target; +@@ -2466,14 +2551,13 @@ +     UBool useFallback; +  +     /* set up the state */ +-    converterData     = (UConverterDataISO2022*)args->converter->extraInfo; ++    converterData     = (UConverterDataISO2022*)cnv->extraInfo; +     pFromU2022State   = &converterData->fromU2022State; +-    useFallback       = args->converter->useFallback; +  +     choiceCount = 0; +  +     /* check if the last codepoint of previous buffer was a lead surrogate*/ +-    if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) { ++    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { +         goto getTrail; +     } +  +@@ -2492,26 +2576,26 @@ +                         if(UTF_IS_SECOND_SURROGATE(trail)) { +                             source++; +                             sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); +-                            args->converter->fromUChar32=0x00; ++                            cnv->fromUChar32=0x00; +                             /* convert this supplementary code point */ +                             /* exit this condition tree */ +                         } else { +                             /* this is an unmatched lead code unit (1st surrogate) */ +                             /* callback(illegal) */ +                             *err=U_ILLEGAL_CHAR_FOUND; +-                            args->converter->fromUChar32=sourceChar; ++                    
        cnv->fromUChar32=sourceChar; +                             break; +                         } +                     } else { +                         /* no more input */ +-                        args->converter->fromUChar32=sourceChar; ++                        cnv->fromUChar32=sourceChar; +                         break; +                     } +                 } else { +                     /* this is an unmatched trail code unit (2nd surrogate) */ +                     /* callback(illegal) */ +                     *err=U_ILLEGAL_CHAR_FOUND; +-                    args->converter->fromUChar32=sourceChar; ++                    cnv->fromUChar32=sourceChar; +                     break; +                 } +             } +@@ -2522,7 +2606,7 @@ +                 if(IS_2022_CONTROL(sourceChar)) { +                     /* callback(illegal) */ +                     *err=U_ILLEGAL_CHAR_FOUND; +-                    args->converter->fromUChar32=sourceChar; ++                    cnv->fromUChar32=sourceChar; +                     break; +                 } +  +@@ -2545,7 +2629,6 @@ +             } +             else{ +                 /* convert U+0080..U+10ffff */ +-                UConverterSharedData *cnv; +                 int32_t i; +                 int8_t cs, g; +  +@@ -2593,17 +2676,41 @@ +                 } +  +                 cs = g = 0; ++                /* ++                 * len==0: no mapping found yet ++                 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks ++                 * len>0: found a roundtrip result, done ++                 */ +                 len = 0; ++                /* ++                 * We will turn off useFallback after finding a fallback, ++                 * but we still get fallbacks from PUA code points as usual. ++                 * Therefore, we will also need to check that we don't overwrite ++                 * an early fallback with a later one. 
++                 */ ++                useFallback = cnv->useFallback; +  +-                for(i = 0; i < choiceCount && len == 0; ++i) { +-                    cs = choices[i]; +-                    if(cs > 0) { +-                        if(cs > CNS_11643_0) { +-                            cnv = converterData->myConverterArray[CNS_11643]; +-                            MBCS_FROM_UCHAR32_ISO2022(cnv,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_3); +-                            if(len==3) { +-                                cs = (int8_t)(CNS_11643_0 + (targetValue >> 16) - 0x80); +-                                len = 2; ++                for(i = 0; i < choiceCount && len <= 0; ++i) { ++                    int8_t cs0 = choices[i]; ++                    if(cs0 > 0) { ++                        uint32_t value; ++                        int32_t len2; ++                        if(cs0 > CNS_11643_0) { ++                            len2 = MBCS_FROM_UCHAR32_ISO2022( ++                                        converterData->myConverterArray[CNS_11643], ++                                        sourceChar, ++                                        &value, ++                                        useFallback, ++                                        MBCS_OUTPUT_3); ++                            if(len2 == 3 || (len2 == -3 && len == 0)) { ++                                targetValue = value; ++                                cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80); ++                                if(len2 >= 0) { ++                                    len = 2; ++                                } else { ++                                    len = -2; ++                                    useFallback = FALSE; ++                                } +                                 if(cs == CNS_11643_1) { +                                     g = 1; +                                 } else if(cs == CNS_11643_2) { +@@ -2617,15 +2724,25 @@ +                         
    } +                         } else { +                             /* GB2312_1 or ISO-IR-165 */ +-                            cnv = converterData->myConverterArray[cs]; +-                            MBCS_FROM_UCHAR32_ISO2022(cnv,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_2); +-                            g = 1; /* used if len == 2 */ ++                            len2 = MBCS_FROM_UCHAR32_ISO2022( ++                                        converterData->myConverterArray[cs0], ++                                        sourceChar, ++                                        &value, ++                                        useFallback, ++                                        MBCS_OUTPUT_2); ++                            if(len2 == 2 || (len2 == -2 && len == 0)) { ++                                targetValue = value; ++                                len = len2; ++                                cs = cs0; ++                                g = 1; ++                                useFallback = FALSE; ++                            } +                         } +                     } +                 } +  +-                if(len > 0) { +-                    len = 0; /* count output bytes; it must have been len == 2 */ ++                if(len != 0) { ++                    len = 0; /* count output bytes; it must have been abs(len) == 2 */ +  +                     /* write the designation sequence if necessary */ +                     if(cs != pFromU2022State->cs[g]) { +@@ -2670,7 +2787,7 @@ +                      * then this is an error +                      */ +                     *err = U_INVALID_CHAR_FOUND; +-                    args->converter->fromUChar32=sourceChar; ++                    cnv->fromUChar32=sourceChar; +                     break; +                 } +             } +@@ -2691,7 +2808,7 @@ +                 } +             } else { +                 fromUWriteUInt8( +-                    args->converter, ++                    cnv, +   
                  buffer, len, +                     &target, (const char *)targetLimit, +                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), +@@ -2720,7 +2837,7 @@ +      */ +     if( U_SUCCESS(*err) && +         pFromU2022State->g!=0 && +-        args->flush && source>=sourceLimit && args->converter->fromUChar32==0 ++        args->flush && source>=sourceLimit && cnv->fromUChar32==0 +     ) { +         int32_t sourceIndex; +  +@@ -2748,7 +2865,7 @@ +         } +  +         fromUWriteUInt8( +-            args->converter, ++            cnv, +             SHIFT_IN_STR, 1, +             &target, (const char *)targetLimit, +             &offsets, sourceIndex, +@@ -3146,7 +3263,7 @@ +         } +         if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) { +             /* include half-width Katakana for JP */ +-            sa->addRange(sa->set, 0xff61, 0xff9f); ++            sa->addRange(sa->set, HWKANA_START, HWKANA_END); +         } +         break; +     case 'c': +diff -ru icu.orig/source/common/ucnv_ext.c icu/source/common/ucnv_ext.c +--- icu.orig/source/common/ucnv_ext.c	2009-06-02 11:48:38.000000000 +0100 ++++ icu/source/common/ucnv_ext.c	2009-06-02 12:14:20.000000000 +0100 +@@ -551,6 +551,12 @@ +         return 0; +     } +  ++    /* ++     * Tests for (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0: ++     * Do not interpret values with reserved bits used, for forward compatibility, ++     * and do not even remember intermediate results with reserved bits used. 
++     */ ++ +     if(UCNV_EXT_TO_U_IS_PARTIAL(value)) { +         /* partial match, enter the loop below */ +         index=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); +@@ -575,7 +581,8 @@ +             value=*fromUSectionValues++; +             if( value!=0 && +                 (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || +-                 FROM_U_USE_FALLBACK(useFallback, firstCP)) ++                 FROM_U_USE_FALLBACK(useFallback, firstCP)) && ++                (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 +             ) { +                 /* remember longest match so far */ +                 matchValue=value; +@@ -613,8 +620,9 @@ +                     /* partial match, continue */ +                     index=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); +                 } else { +-                    if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || +-                         FROM_U_USE_FALLBACK(useFallback, firstCP) ++                    if( (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || ++                         FROM_U_USE_FALLBACK(useFallback, firstCP)) && ++                        (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 +                     ) { +                         /* full match, stop with result */ +                         matchValue=value; +@@ -632,8 +640,9 @@ +             return 0; +         } +     } else /* result from firstCP trie lookup */ { +-        if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || +-             FROM_U_USE_FALLBACK(useFallback, firstCP) ++        if( (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || ++             FROM_U_USE_FALLBACK(useFallback, firstCP)) && ++            (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 +         ) { +             /* full match, stop with result */ +             matchValue=value; +@@ -644,20 +653,18 @@ +         } +     } +  +-    if(matchValue&UCNV_EXT_FROM_U_RESERVED_MASK) { +-        /* do not interpret values with reserved bits used, for forward compatibility */ +-        return 0; +-    } +- +     /* return 
result */ +     if(matchValue==UCNV_EXT_FROM_U_SUBCHAR1) { +         return 1; /* assert matchLength==2 */ +     } +  +-    *pMatchValue=UCNV_EXT_FROM_U_MASK_ROUNDTRIP(matchValue); ++    *pMatchValue=matchValue; +     return matchLength; + } +  ++/* ++ * @param value fromUnicode mapping table value; ignores roundtrip and reserved bits ++ */ + static U_INLINE void + ucnv_extWriteFromU(UConverter *cnv, const int32_t *cx, +                    uint32_t value, +@@ -792,6 +799,10 @@ +     } + } +  ++/* ++ * Used by ISO 2022 implementation. ++ * @return number of bytes in *pValue; negative number if fallback; 0 for no mapping ++ */ + U_CFUNC int32_t + ucnv_extSimpleMatchFromU(const int32_t *cx, +                          UChar32 cp, uint32_t *pValue, +@@ -809,13 +820,15 @@ +     if(match>=2) { +         /* write result for simple, single-character conversion */ +         int32_t length; +-         ++        int isRoundtrip; ++ ++        isRoundtrip=UCNV_EXT_FROM_U_IS_ROUNDTRIP(value); +         length=UCNV_EXT_FROM_U_GET_LENGTH(value); +         value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value); +  +         if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) { +             *pValue=value; +-            return length; ++            return isRoundtrip ? length : -length; + #if 0 /* not currently used */ +         } else if(length==4) { +             /* de-serialize a 4-byte result */ +@@ -825,7 +838,7 @@ +                 ((uint32_t)result[1]<<16)| +                 ((uint32_t)result[2]<<8)| +                 result[3]; +-            return 4; ++            return isRoundtrip ? 
4 : -4; + #endif +         } +     } +diff -ru icu.orig/source/common/ucnv_ext.h icu/source/common/ucnv_ext.h +--- icu.orig/source/common/ucnv_ext.h	2009-06-02 11:48:38.000000000 +0100 ++++ icu/source/common/ucnv_ext.h	2009-06-02 12:14:20.000000000 +0100 +@@ -452,7 +452,7 @@ + #define UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) (((value)&UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)!=0) + #define UCNV_EXT_FROM_U_MASK_ROUNDTRIP(value) ((value)&~UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) +  +-/* use after masking off the roundtrip flag */ ++/* get length; masks away all other bits */ + #define UCNV_EXT_FROM_U_GET_LENGTH(value) (int32_t)(((value)>>UCNV_EXT_FROM_U_LENGTH_SHIFT)&UCNV_EXT_MAX_BYTES) +  + /* get bytes or bytes index */ +diff -ru icu.orig/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c +--- icu.orig/source/common/ucnvmbcs.c	2009-06-02 11:48:38.000000000 +0100 ++++ icu/source/common/ucnvmbcs.c	2009-06-02 12:14:20.000000000 +0100 +@@ -3785,7 +3785,8 @@ +  +     cx=sharedData->mbcs.extIndexes; +     if(cx!=NULL) { +-        return ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback); ++        length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback); ++        return length>=0 ? length : -length;  /* return abs(length); */ +     } +  +     /* unassigned */ +diff -ru icu.orig/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt +--- icu.orig/source/test/testdata/conversion.txt	2009-06-02 11:48:26.000000000 +0100 ++++ icu/source/test/testdata/conversion.txt	2009-06-02 12:14:20.000000000 +0100 +@@ -495,6 +495,46 @@ +         } +         { "UTF-16BE", :bin{ 00 }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ 00 } } +         { "UTF-16BE", :bin{ d800dc }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ d800dc } } ++        // Verify that mappings that would result in byte values outside 20..7F (for SBCS) ++        // or 21..7E (for DBCS) are not used. 
++        // ibm-9005_X110-2007.ucm (ISO 8859-7, <ESC>.F=1b2e46): ++        //   <U009F> \x9F |0 (also in ISO 8859-1) ++        //   <U0387> \xB7 |1 ++        // windows-949-2000 (KSC_5601, <ESC>$(C=1b242843): ++        //   <UC829> \xA0\xA1 |0 ++        //   <UD4FE> \xC0\x41 |0 ++        //   <UD79D> \xC8\xFE |0 ++        { ++          "JIS8",  // =ISO_2022,locale=ja,version=4 ++          "\u009f\u0387\uc829\ud4fe\ud79d", ++          :bin{       1a1b2e461b4e371a1a1b242843487e1b2842 }, ++          :intvector{ 0,1,1,1,1,1,1,2,3,4,4,4,4,4,4,4,4,4 }, ++          :int{1}, :int{1}, "", "?", "" ++        } ++        // Ticket 5483: ISO 2022 converter incorrectly using fallback mapping ++        // Verify that a roundtrip mapping is used even when a fallback mapping is ++        // available in the current state. ++        //   U+FF61 is handled in code ++        // jisx-208.ucm (<ESC>$B=1b2442): ++        //   <U30FE> \x21\x34 |0 ++        //   <UFF5D> \x21\x51 |0  and ++        // ibm-897_P100-1995.ucm (JIS X 0201, <ESC>(J=1b284a): ++        //   <UFF5D> \x7D |1 ++        // ibm-9005_X110-2007.ucm (ISO 8859-7, <ESC>.F=1b2e46): ++        //   <U03D5> \xF6 |1 ++        //   <U2015> \xAF |0 ++        //   <UFF5D> \x7D |1 (not legal for ISO 2022) ++        // windows-949-2000 (KSC_5601, <ESC>$(C=1b242843): ++        //   <UAC00> \xB0\xA1 |0 ++        //   <UFF5D> \xA3\xFD |0 ++        //   <U223C> \xA1\xAD |0 (in extension table) ++        { ++          "JIS8",  // =ISO_2022,locale=ja,version=4 ++          "a\uff61\u03d5\uff5d\uac00\u223c\uff5d\u30fe\uff5d",  // Make it switch to ISO-8859-7, KSC 5601 and JIS X 0208. 
++          :bin{       61a11b2e461b4e761b244221511b2428433021212d237d1b2442213421511b2842 }, ++          :intvector{ 0,1,2,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,4,5,5,6,6,7,7,7,7,7,8,8,8,8,8 }, ++          :int{1}, :int{1}, "", "?", "" ++        } +  +         // e4b8 is a partial sequence +         { "UTF-8", :bin{ 31e4ba8ce4b8 }, "1\u4e8c", :intvector{ 0, 1 }, :int{1}, :int{0}, "truncated", ".", :bin{ e4b8 } } diff --git a/icu.icu5488.assamese.patch b/icu.icu5488.assamese.patch new file mode 100644 index 0000000..8b5d773 --- /dev/null +++ b/icu.icu5488.assamese.patch @@ -0,0 +1,11 @@ +--- icu.orig/source/layout/IndicClassTables.cpp	2006-10-18 09:05:20.000000000 +0100 ++++ icu/source/layout/IndicClassTables.cpp	2006-11-01 09:26:58.000000000 +0000 +@@ -94,7 +94,7 @@ +     _dr, _db, _db, _db, _db, _xx, _xx, _l1, _dl, _xx, _xx, _s1, _s2, _vr, _xx, _xx, // 09C0 - 09CF +     _xx, _xx, _xx, _xx, _xx, _xx, _xx, _m2, _xx, _xx, _xx, _xx, _cn, _cn, _xx, _cn, // 09D0 - 09DF +     _iv, _iv, _dv, _dv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 09E0 - 09EF +-    _ct, _ct, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx                           // 09F0 - 09FA ++    _rv, _ct, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx                           /* 09F0 - 09FA */ + }; +  + static const IndicClassTable::CharClass punjCharClasses[] = diff --git a/icu.icu5500.devicetablecrash.patch b/icu.icu5500.devicetablecrash.patch new file mode 100644 index 0000000..16ea5b7 --- /dev/null +++ b/icu.icu5500.devicetablecrash.patch @@ -0,0 +1,11 @@ +--- icu.orig/source/layout/DeviceTables.cpp	2006-10-18 09:05:20.000000000 +0100 ++++ icu/source/layout/DeviceTables.cpp	2006-11-08 09:08:09.000000000 +0000 +@@ -22,7 +22,7 @@ +     le_uint16 format = SWAPW(deltaFormat) - 1; +     le_int16 result = 0; +      +-    if (ppem >= start && ppem <= SWAPW(endSize)) { ++    if (ppem >= start && ppem <= SWAPW(endSize) && format < sizeof(fieldBits)/sizeof(fieldBits[0])) { +         le_uint16 sizeIndex = 
ppem - start; +         le_uint16 bits = fieldBits[format]; +         le_uint16 count = 16 / bits; diff --git a/icu.icu5501.sinhala.biggerexpand.patch b/icu.icu5501.sinhala.biggerexpand.patch new file mode 100644 index 0000000..6013780 --- /dev/null +++ b/icu.icu5501.sinhala.biggerexpand.patch @@ -0,0 +1,11 @@ +--- icu.orig/source/layout/IndicClassTables.cpp	2006-10-18 09:05:20.000000000 +0100 ++++ icu/source/layout/IndicClassTables.cpp	2006-11-08 11:20:55.000000000 +0000 +@@ -284,7 +284,7 @@ +  + static const IndicClassTable mlymClassTable = {0x0D00, 0x0D6F, 3, MLYM_SCRIPT_FLAGS, mlymCharClasses, mlymSplitTable}; +  +-static const IndicClassTable sinhClassTable = {0x0D80, 0x0DF4, 3, SINH_SCRIPT_FLAGS, sinhCharClasses, sinhSplitTable}; ++static const IndicClassTable sinhClassTable = {0x0D80, 0x0DF4, 4, SINH_SCRIPT_FLAGS, sinhCharClasses, sinhSplitTable}; +  + // + // IndicClassTable addresses diff --git a/icu.icu5506.multiplevowels.patch b/icu.icu5506.multiplevowels.patch new file mode 100644 index 0000000..a58ec64 --- /dev/null +++ b/icu.icu5506.multiplevowels.patch @@ -0,0 +1,61 @@ +diff -ur icu.orig/source/layout/IndicReordering.cpp icu/source/layout/IndicReordering.cpp +--- icu.orig/source/layout/IndicReordering.cpp	2006-11-10 09:42:44.000000000 +0000 ++++ icu/source/layout/IndicReordering.cpp	2006-11-10 09:47:05.000000000 +0000 +@@ -395,7 +395,7 @@ +     {-1,  6,  1, -1, -1, -1, -1, -1, -1,  5,  9,  5,  5,  4, 12}, //  2 - consonant with nukta +     {-1,  6,  1, -1, -1, -1, -1, -1,  2,  5,  9,  5,  5,  4, 12}, //  3 - consonant +     {-1, -1, -1, -1, -1, -1,  3,  2, -1, -1, -1, -1, -1, -1,  7}, //  4 - consonant virama +-    {-1,  6,  1, -1, -1, -1, -1, -1, -1,  5, -1, -1, -1, -1, -1}, //  5 - dependent vowels ++    {-1,  6,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, //  5 - dependent vowels +     {-1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, //  6 - vowel mark +     {-1, -1, -1, -1, -1, -1,  3,  2, -1, -1, -1, -1, -1, -1, -1}, //  7 
- consonant virama ZWJ, consonant ZWJ virama +     {-1,  6,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  4, -1}, //  8 - independent vowels that can take a virama +@@ -423,6 +423,48 @@ +  +         state = stateTable[state][charClass & CF_CLASS_MASK]; +  ++	/*for the components of split matra*/	 ++	if ((charCount >= cursor + 3) && ++	    (chars[cursor] == 0x0DD9 && chars[cursor + 1] == 0x0DCF && chars[cursor + 2] == 0x0DCA)) {  /*for 3 split matra of Sinhala*/ ++	    return cursor + 3; ++	}         ++	else if ((charCount >= cursor + 3) && ++	         (chars[cursor] == 0x0CC6 && chars[cursor + 1] == 0x0CC2 && chars[cursor + 2] == 0x0CD5)) {  /*for 3 split matra of Kannada*/ ++	    return cursor + 3; ++	} ++        /*for 2 split matra*/	 ++	else if (charCount >= cursor + 2) { ++	        /*for Bengali*/ ++            if ((chars[cursor] == 0x09C7 && chars[cursor + 1] == 0x09BE) ||	        ++	        (chars[cursor] == 0x09C7 && chars[cursor + 1] == 0x09D7) ||		 ++	        /*for Oriya*/ ++	        (chars[cursor] == 0x0B47 && chars[cursor + 1] == 0x0B3E) ||		 ++	        (chars[cursor] == 0x0B47 && chars[cursor + 1] == 0x0B56) ||		 ++	        (chars[cursor] == 0x0B47 && chars[cursor + 1] == 0x0B57) || ++	        /*for Tamil*/ ++	        (chars[cursor] == 0x0BC6 && chars[cursor + 1] == 0x0BBE) ||		 ++	        (chars[cursor] == 0x0BC6 && chars[cursor + 1] == 0x0BD7) ||		 ++	        (chars[cursor] == 0x0BC7 && chars[cursor + 1] == 0x0BBE) ||		 ++	        /*for Malayalam*/ ++	        (chars[cursor] == 0x0D46 && chars[cursor + 1] == 0x0D3E) ||	 ++	        (chars[cursor] == 0x0D46 && chars[cursor + 1] == 0x0D57) ||		 ++	        (chars[cursor] == 0x0D47 && chars[cursor + 1] == 0x0D3E) ||	 ++	        /*for Sinhala*/ ++	        (chars[cursor] == 0x0DD9 && chars[cursor + 1] == 0x0DCA) ||		 ++	        (chars[cursor] == 0x0DD9 && chars[cursor + 1] == 0x0DCF) ||		 ++	        (chars[cursor] == 0x0DD9 && chars[cursor + 1] == 0x0DDF) ||		 ++	        (chars[cursor] == 0x0DDC && 
chars[cursor + 1] == 0x0DCA) ||		 ++	        /*for Telugu*/ ++	        (chars[cursor] == 0x0C46 && chars[cursor + 1] == 0x0C56) ||	 ++	        /*for Kannada*/ ++	        (chars[cursor] == 0x0CBF && chars[cursor + 1] == 0x0CD5) ||		     ++	        (chars[cursor] == 0x0CC6 && chars[cursor + 1] == 0x0CD5) ||		 ++	        (chars[cursor] == 0x0CC6 && chars[cursor + 1] == 0x0CD6) ||		 ++	        (chars[cursor] == 0x0CC6 && chars[cursor + 1] == 0x0CC2) ||		 ++	        (chars[cursor] == 0x0CCA && chars[cursor + 1] == 0x0CD5)) ++		    return cursor + 2; ++	} ++ +         if (state < 0) { +             break; +         } diff --git a/icu.icu5557.safety.patch b/icu.icu5557.safety.patch new file mode 100644 index 0000000..682caa1 --- /dev/null +++ b/icu.icu5557.safety.patch @@ -0,0 +1,14 @@ +--- icu.orig/source/layout/CoverageTables.cpp	2007-01-09 12:57:41.000000000 +0000 ++++ icu/source/layout/CoverageTables.cpp	2007-01-09 12:59:09.000000000 +0000 +@@ -44,6 +44,11 @@ +     le_uint16 count = SWAPW(glyphCount); +     le_uint8 bit = OpenTypeUtilities::highBit(count); +     le_uint16 power = 1 << bit; ++ ++    if (count == 0) { ++        return -1; ++    } ++ +     le_uint16 extra = count - power; +     le_uint16 probe = power; +     le_uint16 index = 0; diff --git a/icu.icu5594.gujarati.patch b/icu.icu5594.gujarati.patch new file mode 100644 index 0000000..b21418d --- /dev/null +++ b/icu.icu5594.gujarati.patch @@ -0,0 +1,14 @@ +--- icu.orig/source/layout/IndicClassTables.cpp	2007-02-09 14:26:04.000000000 +0000 ++++ icu/source/layout/IndicClassTables.cpp	2007-02-13 15:41:52.000000000 +0000 +@@ -117,7 +117,11 @@ +     _rv, _xx, _ct, _ct, _xx, _ct, _ct, _ct, _ct, _ct, _xx, _xx, _nu, _xx, _dr, _dl, // 0AB0 - 0ABF +     _dr, _db, _db, _db, _db, _da, _xx, _da, _da, _dr, _xx, _dr, _dr, _vr, _xx, _xx, // 0AC0 - 0ACF +     _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 0AD0 - 0ADF ++#if 1 ++    _iv, _xx, _db, _db, _xx, _xx, _xx, _xx, _xx, _xx, _xx, 
_xx, _xx, _xx, _xx, _xx  // 0AE0 - 0AEF ++#else +     _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx  // 0AE0 - 0AEF ++#endif + }; +  + #if 1 diff --git a/icu.icu5691.backport.patch b/icu.icu5691.backport.patch new file mode 100644 index 0000000..906ecd3 --- /dev/null +++ b/icu.icu5691.backport.patch @@ -0,0 +1,730 @@ +diff -ru icu.6175/source/common/ucnv2022.c icu/source/common/ucnv2022.c +--- icu.6175/source/common/ucnv2022.c	2009-06-02 15:47:31.000000000 +0100 ++++ icu/source/common/ucnv2022.c	2009-06-02 16:03:15.000000000 +0100 +@@ -754,6 +754,7 @@ +     UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo); +     uint32_t key = myData2022->key; +     int32_t offset = 0; ++    int8_t initialToULength = _this->toULength; +     char c; +  +     value = VALID_NON_TERMINAL_2022; +@@ -806,7 +807,6 @@ +         return; +     } else if (value == INVALID_2022 ) { +         *err = U_ILLEGAL_ESCAPE_SEQUENCE; +-        return; +     } else /* value == VALID_TERMINAL_2022 */ { +         switch(var){ + #ifdef U_ENABLE_GENERIC_ISO_2022 +@@ -938,6 +938,35 @@ +     } +     if(U_SUCCESS(*err)) { +         _this->toULength = 0; ++    } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { ++        if(_this->toULength>1) { ++            /* ++             * Ticket 5691: consistent illegal sequences: ++             * - We include at least the first byte (ESC) in the illegal sequence. ++             * - If any of the non-initial bytes could be the start of a character, ++             *   we stop the illegal sequence before the first one of those. ++             *   In escape sequences, all following bytes are "printable", that is, ++             *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS), ++             *   they are valid single/lead bytes. ++             *   For simplicity, we always only report the initial ESC byte as the ++             *   illegal sequence and back out all other bytes we looked at. 
++             */ ++            /* Back out some bytes. */ ++            int8_t backOutDistance=_this->toULength-1; ++            int8_t bytesFromThisBuffer=_this->toULength-initialToULength; ++            if(backOutDistance<=bytesFromThisBuffer) { ++                /* same as initialToULength<=1 */ ++                *source-=backOutDistance; ++            } else { ++                /* Back out bytes from the previous buffer: Need to replay them. */ ++                _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); ++                /* same as -(initialToULength-1) */ ++                /* preToULength is negative! */ ++                uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength); ++                *source-=bytesFromThisBuffer; ++            } ++            _this->toULength=1; ++        } +     } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) { +         _this->toUCallbackReason = UCNV_UNASSIGNED; +     } +@@ -1973,6 +2002,7 @@ +         mySourceChar = args->converter->toUBytes[0]; +         args->converter->toULength = 0; +         cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; ++        targetUniChar = missingCharMarker; +         goto getTrailByte; +     } +  +@@ -2102,17 +2132,44 @@ +                 default: +                     /* G0 DBCS */ +                     if(mySource < mySourceLimit) { +-                        char trailByte; ++                        int leadIsOk, trailIsOk; ++                        uint8_t trailByte; + getTrailByte: +-                        trailByte = *mySource++; +-                        if(cs == JISX208) { +-                            _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf); +-                        } else { +-                            tempBuf[0] = (char)mySourceChar; +-                            tempBuf[1] = trailByte; ++                        trailByte = (uint8_t)*mySource; ++                        /* ++                         * Ticket 5691: 
consistent illegal sequences: ++                         * - We include at least the first byte in the illegal sequence. ++                         * - If any of the non-initial bytes could be the start of a character, ++                         *   we stop the illegal sequence before the first one of those. ++                         * ++                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is ++                         * an ESC/SO/SI, we report only the first byte as the illegal sequence. ++                         * Otherwise we convert or report the pair of bytes. ++                         */ ++                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); ++                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); ++                        if (leadIsOk && trailIsOk) { ++                            ++mySource; ++                            uint32_t tmpSourceChar = (mySourceChar << 8) | trailByte; ++                            if(cs == JISX208) { ++                                _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf); ++                                mySourceChar = tmpSourceChar; ++                            } else { ++                                /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. 
*/ ++                                mySourceChar = tmpSourceChar; ++                                if (cs == KSC5601) { ++                                    tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */ ++                                } ++                                tempBuf[0] = (char)(tmpSourceChar >> 8); ++                                tempBuf[1] = (char)(tmpSourceChar); ++                            } ++                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); ++                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { ++                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */ ++                            ++mySource; ++                            /* add another bit so that the code below writes 2 bytes in case of error */ ++                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; +                         } +-                        mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); +-                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); +                     } else { +                         args->converter->toUBytes[0] = (uint8_t)mySourceChar; +                         args->converter->toULength = 1; +@@ -2254,7 +2311,12 @@ +             } +             /* only DBCS or SBCS characters are expected*/ +             /* DB characters with high bit set to 1 are expected */ +-            if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){ ++            if( length > 2 || length==0 || ++                (length == 1 && targetByteUnit > 0x7f) || ++                (length == 2 && ++                    ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) || ++                    (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1))) ++            ) { +                 
targetByteUnit=missingCharMarker; +             } +             if (targetByteUnit != missingCharMarker){ +@@ -2583,17 +2645,34 @@ +             myData->isEmptySegment = FALSE;	/* Any invalid char errors will be detected separately, so just reset this */ +             if(myData->toU2022State.g == 1) { +                 if(mySource < mySourceLimit) { +-                    char trailByte; ++                    int leadIsOk, trailIsOk; ++                    uint8_t trailByte; + getTrailByte: +-                    trailByte = *mySource++; +-                    tempBuf[0] = (char)(mySourceChar + 0x80); +-                    tempBuf[1] = (char)(trailByte + 0x80); +-                    mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); +-                    if((mySourceChar & 0x8080) == 0) { ++                    targetUniChar = missingCharMarker; ++                    trailByte = (uint8_t)*mySource; ++                    /* ++                     * Ticket 5691: consistent illegal sequences: ++                     * - We include at least the first byte in the illegal sequence. ++                     * - If any of the non-initial bytes could be the start of a character, ++                     *   we stop the illegal sequence before the first one of those. ++                     * ++                     * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is ++                     * an ESC/SO/SI, we report only the first byte as the illegal sequence. ++                     * Otherwise we convert or report the pair of bytes. 
++                     */ ++                    leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); ++                    trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); ++                    if (leadIsOk && trailIsOk) { ++                        ++mySource; ++                        tempBuf[0] = (char)(mySourceChar + 0x80); ++                        tempBuf[1] = (char)(trailByte + 0x80); +                         targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); +-                    } else { +-                        /* illegal bytes > 0x7f */ +-                        targetUniChar = missingCharMarker; ++                        mySourceChar = (mySourceChar << 8) | trailByte; ++                    } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { ++                        /* report a pair of illegal bytes if the second byte is not a DBCS starter */ ++                        ++mySource; ++                        /* add another bit so that the code below writes 2 bytes in case of error */ ++                        mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; +                     } +                 } else { +                     args->converter->toUBytes[0] = (uint8_t)mySourceChar; +@@ -2601,8 +2680,10 @@ +                     break; +                 } +             } +-            else{ ++            else if(mySourceChar <= 0x7f) { +                 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); ++            } else { ++                targetUniChar = 0xffff; +             } +             if(targetUniChar < 0xfffe){ +                 if(args->offsets) { +@@ -3099,6 +3180,7 @@ +         /* continue with a partial double-byte character */ +         mySourceChar = args->converter->toUBytes[0]; +         args->converter->toULength = 0; ++        targetUniChar = missingCharMarker; +         goto getTrailByte; +     } +  +@@ -3178,29 +3260,50 @@ +              
           UConverterSharedData *cnv; +                         StateEnum tempState; +                         int32_t tempBufLen; +-                        char trailByte; ++                        int leadIsOk, trailIsOk; ++                        uint8_t trailByte; + getTrailByte: +-                        trailByte = *mySource++; +-                        tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; +-                        if(tempState > CNS_11643_0) { +-                            cnv = myData->myConverterArray[CNS_11643]; +-                            tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); +-                            tempBuf[1] = (char) (mySourceChar); +-                            tempBuf[2] = trailByte; +-                            tempBufLen = 3; +- +-                        }else{ +-                            cnv = myData->myConverterArray[tempState]; +-                            tempBuf[0] = (char) (mySourceChar); +-                            tempBuf[1] = trailByte; +-                            tempBufLen = 2; ++                        trailByte = (uint8_t)*mySource; ++                        /* ++                         * Ticket 5691: consistent illegal sequences: ++                         * - We include at least the first byte in the illegal sequence. ++                         * - If any of the non-initial bytes could be the start of a character, ++                         *   we stop the illegal sequence before the first one of those. ++                         * ++                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is ++                         * an ESC/SO/SI, we report only the first byte as the illegal sequence. ++                         * Otherwise we convert or report the pair of bytes. 
++                         */ ++                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); ++                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); ++                        if (leadIsOk && trailIsOk) { ++                            ++mySource; ++                            tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; ++                            if(tempState >= CNS_11643_0) { ++                                cnv = myData->myConverterArray[CNS_11643]; ++                                tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); ++                                tempBuf[1] = (char) (mySourceChar); ++                                tempBuf[2] = (char) trailByte; ++                                tempBufLen = 3; ++ ++                            }else{ ++                                cnv = myData->myConverterArray[tempState]; ++                                tempBuf[0] = (char) (mySourceChar); ++                                tempBuf[1] = (char) trailByte; ++                                tempBufLen = 2; ++                            } ++                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); ++                            mySourceChar = (mySourceChar << 8) | trailByte; ++                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { ++                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */ ++                            ++mySource; ++                            /* add another bit so that the code below writes 2 bytes in case of error */ ++                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; +                         } +-                        mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); +                         if(pToU2022State->g>=2) { +                             /* return from a single-shift state to the previous one */ + 
                            pToU2022State->g=pToU2022State->prevG; +                         } +-                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); +                     } else { +                         args->converter->toUBytes[0] = (uint8_t)mySourceChar; +                         args->converter->toULength = 1; +diff -ru icu.6175/source/common/ucnvhz.c icu/source/common/ucnvhz.c +--- icu.6175/source/common/ucnvhz.c	2009-06-02 15:47:31.000000000 +0100 ++++ icu/source/common/ucnvhz.c	2009-06-02 15:57:18.000000000 +0100 +@@ -196,10 +196,30 @@ +                      /* if the first byte is equal to TILDE and the trail byte +                      * is not a valid byte then it is an error condition +                      */ +-                    mySourceChar = 0x7e00 | mySourceChar; +-                    targetUniChar = 0xffff; ++                    /* ++                     * Ticket 5691: consistent illegal sequences: ++                     * - We include at least the first byte in the illegal sequence. ++                     * - If any of the non-initial bytes could be the start of a character, ++                     *   we stop the illegal sequence before the first one of those. ++                     */ +                     myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */ +-                    break; ++                    *err = U_ILLEGAL_ESCAPE_SEQUENCE; ++                    args->converter->toUBytes[0] = UCNV_TILDE; ++                    if( myData->isStateDBCS ? ++                            (0x21 <= mySourceChar && mySourceChar <= 0x7e) : ++                            mySourceChar <= 0x7f ++                    ) { ++                        /* The current byte could be the start of a character: Back it out. 
*/ ++                        args->converter->toULength = 1; ++                        --mySource; ++                    } else { ++                        /* Include the current byte in the illegal sequence. */ ++                        args->converter->toUBytes[1] = mySourceChar; ++                        args->converter->toULength = 2; ++                    } ++                    args->target = myTarget; ++                    args->source = mySource; ++                    return; +                 } +             } else if(myData->isStateDBCS) { +                 if(args->converter->toUnicodeStatus == 0x00){ +@@ -215,19 +235,36 @@ +                 } +                 else{ +                     /* trail byte */ ++                    int leadIsOk, trailIsOk; +                     uint32_t leadByte = args->converter->toUnicodeStatus & 0xff; +-                    if( (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21) && +-                        (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21) +-                    ) { ++                    targetUniChar = 0xffff; ++                    /* ++                     * Ticket 5691: consistent illegal sequences: ++                     * - We include at least the first byte in the illegal sequence. ++                     * - If any of the non-initial bytes could be the start of a character, ++                     *   we stop the illegal sequence before the first one of those. ++                     * ++                     * In HZ DBCS, if the second byte is in the 21..7e range, ++                     * we report only the first byte as the illegal sequence. ++                     * Otherwise we convert or report the pair of bytes. 
++                     */ ++                    leadIsOk = (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21); ++                    trailIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); ++                    if (leadIsOk && trailIsOk) { +                         tempBuf[0] = (char) (leadByte+0x80) ; +                         tempBuf[1] = (char) (mySourceChar+0x80); +                         targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData, +                             tempBuf, 2, args->converter->useFallback); ++                        mySourceChar= (leadByte << 8) | mySourceChar; ++                    } else if (trailIsOk) { ++                        /* report a single illegal byte and continue with the following DBCS starter byte */ ++                        --mySource; ++                        mySourceChar = (int32_t)leadByte; +                     } else { +-                        targetUniChar = 0xffff; ++                        /* report a pair of illegal bytes if the second byte is not a DBCS starter */ ++                        /* add another bit so that the code below writes 2 bytes in case of error */ ++                        mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar; +                     } +-                    /* add another bit so that the code below writes 2 bytes in case of error */ +-                    mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar; +                     args->converter->toUnicodeStatus =0x00; +                 } +             } +diff -ru icu.6175/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c +--- icu.6175/source/common/ucnvmbcs.c	2009-06-02 15:47:31.000000000 +0100 ++++ icu/source/common/ucnvmbcs.c	2009-06-02 15:56:07.000000000 +0100 +@@ -1697,6 +1697,65 @@ +     pArgs->offsets=offsets; + } +  ++static UBool ++hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) { ++    const int32_t *row=stateTable[state]; ++    int32_t b, entry; ++    /* First test for 
final entries in this state for some commonly valid byte values. */ ++    entry=row[0xa1]; ++    if( !MBCS_ENTRY_IS_TRANSITION(entry) && ++        MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL ++    ) { ++        return TRUE; ++    } ++    entry=row[0x41]; ++    if( !MBCS_ENTRY_IS_TRANSITION(entry) && ++        MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL ++    ) { ++        return TRUE; ++    } ++    /* Then test for final entries in this state. */ ++    for(b=0; b<=0xff; ++b) { ++        entry=row[b]; ++        if( !MBCS_ENTRY_IS_TRANSITION(entry) && ++            MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL ++        ) { ++            return TRUE; ++        } ++    } ++    /* Then recurse for transition entries. */ ++    for(b=0; b<=0xff; ++b) { ++        entry=row[b]; ++        if( MBCS_ENTRY_IS_TRANSITION(entry) && ++            hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry)) ++        ) { ++            return TRUE; ++        } ++    } ++    return FALSE; ++} ++ ++/* ++ * Is byte b a single/lead byte in this state? ++ * Recurse for transition states, because here we don't want to say that ++ * b is a lead byte if all byte sequences that start with b are illegal. 
++ */ ++static UBool ++isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) { ++    const int32_t *row=stateTable[state]; ++    int32_t entry=row[b]; ++    if(MBCS_ENTRY_IS_TRANSITION(entry)) {   /* lead byte */ ++        return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry)); ++    } else { ++        uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); ++        if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) { ++            return FALSE;   /* SI/SO are illegal for DBCS-only conversion */ ++        } else { ++            return action!=MBCS_STATE_ILLEGAL; ++        } ++    } ++} ++ + U_CFUNC void + ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, +                           UErrorCode *pErrorCode) { +@@ -2052,6 +2111,34 @@ +             sourceIndex=nextSourceIndex; +         } else if(U_FAILURE(*pErrorCode)) { +             /* callback(illegal) */ ++            if(byteIndex>1) { ++                /* ++                 * Ticket 5691: consistent illegal sequences: ++                 * - We include at least the first byte in the illegal sequence. ++                 * - If any of the non-initial bytes could be the start of a character, ++                 *   we stop the illegal sequence before the first one of those. ++                 */ ++                UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0); ++                int8_t i; ++                for(i=1; ++                    i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]); ++                    ++i) {} ++                if(i<byteIndex) { ++                    /* Back out some bytes. 
*/ ++                    int8_t backOutDistance=byteIndex-i; ++                    int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source); ++                    byteIndex=i;  /* length of reported illegal byte sequence */ ++                    if(backOutDistance<=bytesFromThisBuffer) { ++                        source-=backOutDistance; ++                    } else { ++                        /* Back out bytes from the previous buffer: Need to replay them. */ ++                        cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); ++                        /* preToULength is negative! */ ++                        uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength); ++                        source=(const uint8_t *)pArgs->source; ++                    } ++                } ++            } +             break; +         } else /* unassigned sequences indicated with byteIndex>0 */ { +             /* try an extension mapping */ +@@ -2062,7 +2149,7 @@ +                               &offsets, sourceIndex, +                               pArgs->flush, +                               pErrorCode); +-            sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs->source); ++            sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source); +  +             if(U_FAILURE(*pErrorCode)) { +                 /* not mappable or buffer overflow */ +@@ -2353,15 +2440,37 @@ +  +     if(c<0) { +         if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) { +-            *pErrorCode=U_TRUNCATED_CHAR_FOUND; +-        } +-        if(U_FAILURE(*pErrorCode)) { +             /* incomplete character byte sequence */ +             uint8_t *bytes=cnv->toUBytes; +             cnv->toULength=(int8_t)(source-lastSource); +             do { +                 *bytes++=*lastSource++; +             } while(lastSource<source); ++            *pErrorCode=U_TRUNCATED_CHAR_FOUND; ++        } else 
if(U_FAILURE(*pErrorCode)) { ++            /* callback(illegal) */ ++            /* ++             * Ticket 5691: consistent illegal sequences: ++             * - We include at least the first byte in the illegal sequence. ++             * - If any of the non-initial bytes could be the start of a character, ++             *   we stop the illegal sequence before the first one of those. ++             */ ++            UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0); ++            uint8_t *bytes=cnv->toUBytes; ++            *bytes++=*lastSource++;     /* first byte */ ++            if(lastSource==source) { ++                cnv->toULength=1; ++            } else /* lastSource<source: multi-byte character */ { ++                int8_t i; ++                for(i=1; ++                    lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource); ++                    ++i ++                ) { ++                    *bytes++=*lastSource++; ++                } ++                cnv->toULength=i; ++                source=lastSource; ++            } +         } else { +             /* no output because of empty input or only state changes */ +             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; +diff -ru icu.6175/source/test/cintltst/nccbtst.c icu/source/test/cintltst/nccbtst.c +--- icu.6175/source/test/cintltst/nccbtst.c	2009-06-02 15:47:18.000000000 +0100 ++++ icu/source/test/cintltst/nccbtst.c	2009-06-02 15:47:38.000000000 +0100 +@@ -2497,13 +2497,13 @@ +  +  +     static const uint8_t text943[] = { +-        0x82, 0xa9, 0x82, 0x20, /*0xc8,*/  0x61, 0x8a, 0xbf, 0x8e, 0x9a }; +-    static const UChar toUnicode943sub[] = { 0x304b, 0xfffd, /*0xff88,*/ 0x0061, 0x6f22,  0x5b57}; +-    static const UChar toUnicode943skip[]= { 0x304b, /*0xff88,*/ 0x0061, 0x6f22,  0x5b57}; ++        0x82, 0xa9, 0x82, 0x20, 0x61, 0x8a, 0xbf, 0x8e, 0x9a }; ++    static const UChar toUnicode943sub[] = { 0x304b, 0x1a, 0x20, 0x0061, 0x6f22,  0x5b57 }; ++    static 
const UChar toUnicode943skip[]= { 0x304b, 0x20, 0x0061, 0x6f22,  0x5b57 }; +     static const UChar toUnicode943stop[]= { 0x304b}; +  +-    static const int32_t  fromIBM943Offssub[]  = {0, 2, 4, 5, 7}; +-    static const int32_t  fromIBM943Offsskip[] = { 0, 4, 5, 7}; ++    static const int32_t  fromIBM943Offssub[]  = { 0, 2, 3, 4, 5, 7 }; ++    static const int32_t  fromIBM943Offsskip[] = { 0, 3, 4, 5, 7 }; +     static const int32_t  fromIBM943Offsstop[] = { 0}; +  +     gInBufferSize = inputsize; +@@ -2537,9 +2537,9 @@ + { +     static const uint8_t sampleText[] = { +         0x82, 0xa9, 0x61, 0x62, 0x63 , 0x82, +-        0xff, /*0x82, 0xa9,*/ 0x32, 0x33}; +-    static const UChar toUnicode943sub[] = {0x304b, 0x0061, 0x0062, 0x0063,  0xfffd,/*0x304b,*/ 0x0032, 0x0033}; +-    static const int32_t  fromIBM943Offssub[]  = {0, 2, 3, 4, 5, 7, 8}; ++        0xff, 0x32, 0x33}; ++    static const UChar toUnicode943sub[] = { 0x304b, 0x0061, 0x0062, 0x0063, 0x1a, 0x1a, 0x0032, 0x0033 }; ++    static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 6, 7, 8 }; +     /*checking illegal value for ibm-943 with substitute*/  +     gInBufferSize = inputsize; +     gOutBufferSize = outputsize; +diff -ru icu.6175/source/test/cintltst/nucnvtst.c icu/source/test/cintltst/nucnvtst.c +--- icu.6175/source/test/cintltst/nucnvtst.c	2009-06-02 15:47:18.000000000 +0100 ++++ icu/source/test/cintltst/nucnvtst.c	2009-06-02 15:47:38.000000000 +0100 +@@ -2606,7 +2606,7 @@ +     TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source"); +     /*Test for the condition where there is an invalid character*/ +     { +-        static const uint8_t source2[]={0xa1, 0x01}; ++        static const uint8_t source2[]={0xa1, 0x80}; +         TestNextUCharError(cnv, (const char*)source2, (const char*)source2+sizeof(source2), U_ZERO_ERROR, "an invalid character"); +     } +     /*Test for the condition where we have a truncated char*/ +@@ -3899,11 +3899,11 @@ + 
TestISO_2022_KR() { +     /* test input */ +     static const uint16_t in[]={ +-                    0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F66,0x9F67,0x9F6A,0x000A,0x000D +-                   ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC02,0xAC04 ++                    0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F67,0x9F6A,0x000A,0x000D ++                   ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC04 +                    ,0xAC07,0xAC08,0xAC09,0x0025,0x0026,0x0027,0x000A,0x000D,0x0028,0x0029 +                    ,0x002A,0x002B,0x002C,0x002D,0x002E,0x53C3,0x53C8,0x53C9,0x53CA,0x53CB +-                   ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53DF,0x53E1,0x53E2 ++                   ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53E1,0x53E2 +                    ,0x53E3,0x53E4,0x000A,0x000D}; +     const UChar* uSource; +     const UChar* uSourceLimit; +diff -ru icu.6175/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt +--- icu.6175/source/test/testdata/conversion.txt	2009-06-02 15:47:18.000000000 +0100 ++++ icu/source/test/testdata/conversion.txt	2009-06-02 15:57:41.000000000 +0100 +@@ -48,12 +48,144 @@ +     toUnicode { +       Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" } +       Cases { ++        // Test ticket 5691: consistent illegal sequences ++        // The following test cases are for illegal character byte sequences. ++        // ++        // Unfortunately, we cannot use the Shift-JIS examples from the ticket ++        // comments because our Shift-JIS table is Windows-compatible and ++        // therefore has no illegal single bytes. Same for GBK. ++        // Instead, we use the stricter GB 18030 also for 2-byte examples. ++        // The byte sequences are generally slightly different from the ticket ++        // comment, simply using assigned characters rather than just ++        // theoretically valid sequences. 
++        { ++          "gb18030", ++          :bin{ 618140813c81ff7a }, ++          "a\u4e02\\x81<\\x81\\xFFz", ++          :intvector{ 0,1,3,3,3,3,4,5,5,5,5,5,5,5,5,7 }, ++          :int{1}, :int{0}, "", "&C", :bin{""} ++        } ++        { ++          "EUC-JP", ++          :bin{ 618fb0a98fb03c8f3cb0a97a }, ++          "a\u4e28\\x8F\\xB0<\\x8F<\u9022z", ++          :intvector{ 0,1,4,4,4,4,5,5,5,5,6,7,7,7,7,8,9,11 }, ++          :int{1}, :int{0}, "", "&C", :bin{""} ++        } ++        { ++          "gb18030", ++          :bin{ 618130fc318130fc8181303c3e813cfc817a }, ++          "a\u05ed\\x810\u9f07\\x810<>\\x81<\u9f07z", ++          :intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17 }, ++          :int{1}, :int{0}, "", "&C", :bin{""} ++        } ++        { ++          "UTF-8", ++          :bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a }, ++          "a\U00040042\\xF1\\x80\\x81<\\xF1\\x80\\x81\\xFF\\xF1\\x80\\xFF<\\xF1\\xFF<>z", ++          :intvector{ 0,1,1,5,5,5,5,5,5,5,5,5,5,5,5,8,9,9,9,9,9,9,9,9,9,9,9,9,12,12,12,12,13,13,13,13,13,13,13,13,15,15,15,15,16,17,17,17,17,18,18,18,18,19,20,21 }, ++          :int{1}, :int{0}, "", "&C", :bin{""} ++        } ++        { ++          "ISO-2022-JP", ++          :bin{ 1b24424141af4142affe41431b2842 }, ++          "\u758f\\xAF\u758e\\xAF\\xFE\u790e", ++          :intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 }, ++          :int{1}, :int{0}, "", "&C", :bin{""} ++        } ++        { ++          "ibm-25546", ++          :bin{ 411b242943420e4141af4142affe41430f5a }, ++          "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ", ++          :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 }, ++          :int{1}, :int{0}, "", "&C", :bin{""} ++        } ++        { ++          "ISO-2022-KR", ++          :bin{ 411b242943420e4141af4142affe41430f5a }, ++          "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ", ++          :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 }, ++          :int{1}, :int{0}, "", "&C", 
:bin{""} ++        } ++        { ++          "ISO-2022-CN", ++          :bin{ 411b242941420e4141af4142affe41430f5a }, ++          "AB\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z", ++          :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 }, ++          :int{1}, :int{0}, "", "&C", :bin{""} ++        } ++        { ++          "HZ", ++          :bin{ 417e7b4141af4142affe41437e7d5a }, ++          "A\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z", ++          :intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 }, ++          :int{1}, :int{0}, "", "&C", :bin{""} ++        } ++        // Test ticket 5691: consistent illegal sequences ++        // The following test cases are for illegal escape/designator/shift sequences. ++        // ++        // ISO-2022-JP and -CN with illegal escape sequences. ++        { ++          "ISO-2022-JP", ++          :bin{ 611b24201b244241411b283f1b28427a }, ++          "a\\x1B$ \u758f\\x1B\u2538z", ++          :intvector{ 0,1,1,1,1,2,3,7,9,9,9,9,10,15 }, ++          :int{1}, :int{0}, "", "&C", :bin{""} ++        } ++        { ++          "ISO-2022-CN", ++          :bin{ 611b2429201b2429410e41410f7a }, ++          "a\\x1B$) \u4eaez", ++          :intvector{ 0,1,1,1,1,2,3,4,10,13 }, ++          :int{1}, :int{0}, "", "&C", :bin{""} ++        } ++        // Test ticket 5691: ISO-2022-JP-2 with illegal single-shift SS2 and SS3 sequences. ++        // The first ESC N comes before its designator sequence, the last sequence is ESC+space. 
++        { ++          "ISO-2022-JP-2", ++          :bin{ 4e1b4e4e1b2e414e1b4e4e4e1b204e }, ++          "N\\x1BNNN\xceN\\x1B N", ++          :intvector{ 0,1,1,1,1,2,3,7,10,11,12,12,12,12,13,14 }, ++          :int{1}, :int{0}, "", "&C", :bin{""} ++        } ++        { ++          "ISO-2022-CN-EXT", ++          :bin{ 4e1b4e4e1b242a484e1b4e4e4e4e1b204e }, ++          "N\\x1BNNN\u8f0eN\\x1B N", ++          :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 }, ++          :int{1}, :int{0}, "", "&C", :bin{""} ++        } ++        { ++          "ISO-2022-CN-EXT", ++          :bin{ 4f1b4f4f1b242b494f1b4f4f4f4f1b204f }, ++          "O\\x1BOOO\u492bO\\x1B O", ++          :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 }, ++          :int{1}, :int{0}, "", "&C", :bin{""} ++        } ++        // Test ticket 5691: HZ with illegal tilde sequences. ++        { ++          "HZ", ++          :bin{ 417e20427e21437e80447e7b41417e207e41427e7f41437e7d5a }, ++          "A\\x7E B\\x7E!C\\x7E\\x80D\u4eae\\x7E\\x20\\x7E\u8c05\\x7E\\x7F\u64a9Z", ++          :intvector{ 0,1,1,1,1,2,3,4,4,4,4,5,6,7,7,7,7,7,7,7,7,9,                          // SBCS ++                      12,14,14,14,14,14,14,14,14,16,16,16,16,17,19,19,19,19,19,19,19,19,21, // DBCS ++                      25 },                                                                 // SBCS ++          :int{1}, :int{0}, "", "&C", :bin{""} ++        } ++        // Test ticket 5691: Example from Peter Edberg. 
++        { ++          "ISO-2022-JP", ++          :bin{ 1b244230212f7e742630801b284a621b2458631b2842648061 }, ++          "\u4e9c\ufffd\u7199\ufffdb\ufffd$Xcd\ufffda", ++          :intvector{ 3,5,7,9,14,15,16,17,18,22,23,24 }, ++          :int{1}, :int{0}, "", "?", :bin{""} ++        } +         // test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e +         { +           "HZ", +-          :bin{ 7e7b21212120217e217f772100007e217e7d207e7e807e0a2b }, +-          "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd ~\ufffd+", +-          :intvector{ 2,4,6,8,10,12,14,18,19,21,24 }, ++          :bin{ 7e7b21212120217e217f772100007e217e7e7d207e7e807e0a2b }, ++          "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd\u3013 ~\ufffd+", ++          :intvector{ 2,4,6,8,10,12,14,15,19,20,22,25 }, +           :int{1}, :int{1}, "", "?", :bin{""} +         } +         // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and +@@ -61,8 +193,8 @@ +         { +           "ISO-2022-JP", +           :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 }, +-          "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e", +-          :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 }, ++          "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e", ++          :intvector{ 3,4,5,9,11,12,14,16,17,19,21,23,25,27 }, +           :int{1}, :int{1}, "", "?", :bin{""} +         } +         // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets() +@@ -341,7 +473,7 @@ +         { +           "ISO-2022-CN-EXT", +           :bin{ 411b4e2121 }, "\x41", :intvector{ 0 }, +-          :int{1}, :int{1}, "illesc", ".", :bin{ 1b4e } ++          :int{1}, :int{1}, "illesc", ".", :bin{ 1b } +         } +         // G3 designator: recognized, but not supported for -CN (only for -CN-EXT) +         { diff --git a/icu.icu5797.backport.patch b/icu.icu5797.backport.patch new file mode 100644 
index 0000000..39e3f77 --- /dev/null +++ b/icu.icu5797.backport.patch @@ -0,0 +1,749 @@ +diff -ru icu.5483/source/common/ucnv2022.c icu/source/common/ucnv2022.c +--- icu.5483/source/common/ucnv2022.c	2009-06-02 12:47:41.000000000 +0100 ++++ icu/source/common/ucnv2022.c	2009-06-02 13:18:23.000000000 +0100 +@@ -473,8 +473,7 @@ +             if(jpCharsetMasks[version]&CSM(ISO8859_7)) { +                 myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode); +             } +-            myConverterData->myConverterArray[JISX201]      = ucnv_loadSharedData("JISX0201", NULL, errorCode); +-            myConverterData->myConverterArray[JISX208]      = ucnv_loadSharedData("jisx-208", NULL, errorCode); ++            myConverterData->myConverterArray[JISX208]      = ucnv_loadSharedData("Shift-JIS", NULL, errorCode); +             if(jpCharsetMasks[version]&CSM(JISX212)) { +                 myConverterData->myConverterArray[JISX212]  = ucnv_loadSharedData("jisx-212", NULL, errorCode); +             } +@@ -1045,14 +1044,6 @@ +                 length=3; +             } +         } +-        /* +-         * TODO(markus): Use Shift-JIS table for JIS X 0208, to save mapping table space. +-         * Pass in parameter for type of output bytes, for validation and shifting: +-         * - Direct: Pass bytes through, but forbid control codes 00-1F (except SI/SO/ESC) and space 20? +-         *   (Need to allow some (TAB/LF/CR) or most of them for ASCII and maybe JIS X 0201.) +-         * - A1-FE: Subtract 80 after range check. +-         * - SJIS: Shift DBCS result to 21-7E x 21-7E. +-         */ +         /* is this code point assigned, or do we use fallbacks? 
*/ +         if((stage2Entry&(1<<(16+(c&0xf))))!=0) { +             /* assigned */ +@@ -1110,6 +1101,23 @@ +     } + } +  ++/* ++ * Check that the result is a 2-byte value with each byte in the range A1..FE ++ * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte ++ * to move it to the ISO 2022 range 21..7E. ++ * Return 0 if out of range. ++ */ ++static U_INLINE uint32_t ++_2022FromGR94DBCS(uint32_t value) { ++    if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && ++        (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) ++    ) { ++        return value - 0x8080;  /* shift down to 21..7e byte range */ ++    } else { ++        return 0;  /* not valid for ISO 2022 */ ++    } ++} ++ + #ifdef U_ENABLE_GENERIC_ISO_2022 +  + /********************************************************************************** +@@ -1238,7 +1246,7 @@ +     } +     else{ +         cnv->toUBytes[0] =(char) sourceChar; +-        cnv->toULength = 2; ++        cnv->toULength = 1; +     } +  +     if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){ +@@ -1332,6 +1340,181 @@ +     3  /* length of <ESC>(I  HWKANA_7BIT */ + }; +  ++/* Map 00..7F to Unicode according to JIS X 0201. */ ++static U_INLINE uint32_t ++jisx201ToU(uint32_t value) { ++    if(value < 0x5c) { ++        return value; ++    } else if(value == 0x5c) { ++        return 0xa5; ++    } else if(value == 0x7e) { ++        return 0x203e; ++    } else /* value <= 0x7f */ { ++        return value; ++    } ++} ++ ++/* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. 
*/ ++static U_INLINE uint32_t ++jisx201FromU(uint32_t value) { ++    if(value<=0x7f) { ++        if(value!=0x5c && value!=0x7e) { ++            return value; ++        } ++    } else if(value==0xa5) { ++        return 0x5c; ++    } else if(value==0x203e) { ++        return 0x7e; ++    } ++    return 0xfffe; ++} ++ ++/* ++ * Take a valid Shift-JIS byte pair, check that it is in the range corresponding ++ * to JIS X 0208, and convert it to a pair of 21..7E bytes. ++ * Return 0 if the byte pair is out of range. ++ */ ++static U_INLINE uint32_t ++_2022FromSJIS(uint32_t value) { ++    uint8_t trail; ++ ++    if(value > 0xEFFC) { ++        return 0;  /* beyond JIS X 0208 */ ++    } ++ ++    trail = (uint8_t)value; ++ ++    value &= 0xff00;  /* lead byte */ ++    if(value <= 0x9f00) { ++        value -= 0x7000; ++    } else /* 0xe000 <= value <= 0xef00 */ { ++        value -= 0xb000; ++    } ++    value <<= 1; ++ ++    if(trail <= 0x9e) { ++        value -= 0x100; ++        if(trail <= 0x7e) { ++            value |= trail - 0x1f; ++        } else { ++            value |= trail - 0x20; ++        } ++    } else /* trail <= 0xfc */ { ++        value |= trail - 0x7e; ++    } ++    return value; ++} ++ ++/* ++ * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS. ++ * If either byte is outside 21..7E make sure that the result is not valid ++ * for Shift-JIS so that the converter catches it. ++ * Some invalid byte values already turn into equally invalid Shift-JIS ++ * byte values and need not be tested explicitly. 
++ */ ++static U_INLINE void ++_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) { ++    if(c1&1) { ++        ++c1; ++        if(c2 <= 0x5f) { ++            c2 += 0x1f; ++        } else if(c2 <= 0x7e) { ++            c2 += 0x20; ++        } else { ++            c2 = 0;  /* invalid */ ++        } ++    } else { ++        if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) { ++            c2 += 0x7e; ++        } else { ++            c2 = 0;  /* invalid */ ++        } ++    } ++    c1 >>= 1; ++    if(c1 <= 0x2f) { ++        c1 += 0x70; ++    } else if(c1 <= 0x3f) { ++        c1 += 0xb0; ++    } else { ++        c1 = 0;  /* invalid */ ++    } ++    bytes[0] = (char)c1; ++    bytes[1] = (char)c2; ++} ++ ++/* ++ * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS) ++ * Katakana. ++ * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks ++ * because Shift-JIS roundtrips half-width Katakana to single bytes. ++ * These were the only fallbacks in ICU's jisx-208.ucm file. 
++ */ ++static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = { ++    0x2123,  /* U+FF61 */ ++    0x2156, ++    0x2157, ++    0x2122, ++    0x2126, ++    0x2572, ++    0x2521, ++    0x2523, ++    0x2525, ++    0x2527, ++    0x2529, ++    0x2563, ++    0x2565, ++    0x2567, ++    0x2543, ++    0x213C,  /* U+FF70 */ ++    0x2522, ++    0x2524, ++    0x2526, ++    0x2528, ++    0x252A, ++    0x252B, ++    0x252D, ++    0x252F, ++    0x2531, ++    0x2533, ++    0x2535, ++    0x2537, ++    0x2539, ++    0x253B, ++    0x253D, ++    0x253F,  /* U+FF80 */ ++    0x2541, ++    0x2544, ++    0x2546, ++    0x2548, ++    0x254A, ++    0x254B, ++    0x254C, ++    0x254D, ++    0x254E, ++    0x254F, ++    0x2552, ++    0x2555, ++    0x2558, ++    0x255B, ++    0x255E, ++    0x255F,  /* U+FF90 */ ++    0x2560, ++    0x2561, ++    0x2562, ++    0x2564, ++    0x2566, ++    0x2568, ++    0x2569, ++    0x256A, ++    0x256B, ++    0x256C, ++    0x256D, ++    0x256F, ++    0x2573, ++    0x212B, ++    0x212C   /* U+FF9F */ ++}; ++ + /* + * The iteration over various code pages works this way: + * i)   Get the currentState from myConverterData->currentState +@@ -1504,7 +1687,7 @@ +                     } +                     break; +                 case HWKANA_7BIT: +-                    if((uint32_t)(HWKANA_END-sourceChar)<=(HWKANA_END-HWKANA_START)) { ++                    if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { +                         if(converterData->version==3) { +                             /* JIS7: use G1 (SO) */ +                             /* Shift U+FF61..U+FF9F to bytes 21..5F. 
*/ +@@ -1531,13 +1714,34 @@ +                     break; +                 case JISX201: +                     /* G0 SBCS */ +-                    len2 = MBCS_SINGLE_FROM_UCHAR32( ++                    value = jisx201FromU(sourceChar); ++                    if(value <= 0x7f) { ++                        targetValue = value; ++                        len = 1; ++                        cs = cs0; ++                        g = 0; ++                        useFallback = FALSE; ++                    } ++                    break; ++                case JISX208: ++                    /* G0 DBCS from Shift-JIS table */ ++                    len2 = MBCS_FROM_UCHAR32_ISO2022( +                                 converterData->myConverterArray[cs0], +                                 sourceChar, &value, +-                                useFallback); +-                    if(len2 != 0 && !(len2 < 0 && len != 0) && value <= 0x7f) { +-                        targetValue = value; +-                        len = len2; ++                                useFallback, MBCS_OUTPUT_2); ++                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */ ++                        value = _2022FromSJIS(value); ++                        if(value != 0) { ++                            targetValue = value; ++                            len = len2; ++                            cs = cs0; ++                            g = 0; ++                            useFallback = FALSE; ++                        } ++                    } else if(len == 0 && useFallback && ++                              (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { ++                        targetValue = hwkana_fb[sourceChar - HWKANA_START]; ++                        len = -2; +                         cs = cs0; +                         g = 0; +                         useFallback = FALSE; +@@ -1569,17 +1773,10 @@ +                              * Check for valid 
bytes for the encoding scheme. +                              * This is necessary because the sub-converter (windows-949) +                              * has a broader encoding scheme than is valid for 2022. +-                             * +-                             * Check that the result is a 2-byte value with each byte in the range A1..FE +-                             * (strict EUC-KR DBCS) before accepting it and subtracting 0x80 from each byte +-                             * to move it to the ISO 2022 range 21..7E. +                              */ +-                            if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && +-                                (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) +-                            ) { +-                                value -= 0x8080;  /* shift down to 21..7e byte range */ +-                            } else { +-                                break;  /* not valid for ISO 2022 */ ++                            value = _2022FromGR94DBCS(value); ++                            if(value == 0) { ++                                break; +                             } +                         } +                         targetValue = value; +@@ -1755,7 +1952,7 @@ + static void  + UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, +                                                UErrorCode* err){ +-    char tempBuf[3]; ++    char tempBuf[2]; +     const char *mySource = (char *) args->source; +     UChar *myTarget = args->target; +     const char *mySourceLimit = args->sourceLimit; +@@ -1893,10 +2090,7 @@ +                     break; +                 case JISX201: +                     if(mySourceChar <= 0x7f) { +-                        targetUniChar = +-                            _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP( +-                                myData->myConverterArray[cs], +-                                mySourceChar); ++                        targetUniChar = 
jisx201ToU(mySourceChar); +                     } +                     break; +                 case HWKANA_7BIT: +@@ -1910,8 +2104,13 @@ +                     if(mySource < mySourceLimit) { +                         char trailByte; + getTrailByte: +-                        tempBuf[0] = (char) (mySourceChar); +-                        tempBuf[1] = trailByte = *mySource++; ++                        trailByte = *mySource++; ++                        if(cs == JISX208) { ++                            _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf); ++                        } else { ++                            tempBuf[0] = (char)mySourceChar; ++                            tempBuf[1] = trailByte; ++                        } +                         mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); +                         targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); +                     } else { +@@ -3254,6 +3453,9 @@ +     /* open a set and initialize it with code points that are algorithmically round-tripped */ +     switch(cnvData->locale[0]){ +     case 'j': ++        /* include JIS X 0201 which is hardcoded */ ++        sa->add(sa->set, 0xa5); ++        sa->add(sa->set, 0x203e); +         if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) { +             /* include Latin-1 for some variants of JP */ +             sa->addRange(sa->set, 0, 0xff); +@@ -3262,6 +3464,11 @@ +             sa->addRange(sa->set, 0, 0x7f); +         } +         if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) { ++            /* ++             * TODO(markus): If and when ucnv_getUnicodeSet() supports fallbacks, ++             * we need to include half-width Katakana for all JP variants because ++             * JIS X 0208 has hardcoded fallbacks for them. 
++             */ +             /* include half-width Katakana for JP */ +             sa->addRange(sa->set, HWKANA_START, HWKANA_END); +         } +@@ -3281,15 +3488,7 @@ +         break; +     } +  +-    /* +-     * Version-specific for CN: +-     * CN version 0 does not map CNS planes 3..7 although +-     * they are all available in the CNS conversion table; +-     * CN version 1 does map them all. +-     * The two versions create different Unicode sets. +-     */ +-    for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { +-        if(cnvData->myConverterArray[i]!=NULL) { ++#if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ +             if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && +                 cnvData->version==0 && i==CNS_11643 +             ) { +@@ -3299,9 +3498,33 @@ +                         sa, UCNV_ROUNDTRIP_SET, +                         0, 0x81, 0x82, +                         pErrorCode); ++            } ++#endif ++ ++    for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { ++        UConverterSetFilter filter; ++        if(cnvData->myConverterArray[i]!=NULL) { ++            if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && ++                cnvData->version==0 && i==CNS_11643 ++            ) { ++                /* ++                 * Version-specific for CN: ++                 * CN version 0 does not map CNS planes 3..7 although ++                 * they are all available in the CNS conversion table; ++                 * CN version 1 (-EXT) does map them all. ++                 * The two versions create different Unicode sets. ++                 */ ++                filter=UCNV_SET_FILTER_2022_CN; ++            } else if(cnvData->locale[0]=='j' && i==JISX208) { ++                /* ++                 * Only add code points that map to Shift-JIS codes ++                 * corresponding to JIS X 0208. 
++                 */ ++                filter=UCNV_SET_FILTER_SJIS; +             } else { +-                ucnv_MBCSGetUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, pErrorCode); ++                filter=UCNV_SET_FILTER_NONE; +             } ++            ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode); +         } +     } +  +diff -ru icu.5483/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c +--- icu.5483/source/common/ucnvmbcs.c	2009-06-02 12:47:41.000000000 +0100 ++++ icu/source/common/ucnvmbcs.c	2009-06-02 12:48:08.000000000 +0100 +@@ -340,6 +340,8 @@ +  + /* Miscellaneous ------------------------------------------------------------ */ +  ++#if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ ++ + /* similar to ucnv_MBCSGetNextUChar() but recursive */ + static void + _getUnicodeSetForBytes(const UConverterSharedData *sharedData, +@@ -432,11 +434,14 @@ +         pErrorCode); + } +  ++#endif ++ + U_CFUNC void +-ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, +-                             const USetAdder *sa, +-                             UConverterUnicodeSet which, +-                             UErrorCode *pErrorCode) { ++ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, ++                                         const USetAdder *sa, ++                                         UConverterUnicodeSet which, ++                                         UConverterSetFilter filter, ++                                         UErrorCode *pErrorCode) { +     const UConverterMBCSTable *mbcsTable; +     const uint16_t *table; +  +@@ -490,50 +495,26 @@ +                 c+=1024; /* empty stage 2 block */ +             } +         } +-    } else if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY) { +-        /* ignore single-byte results */ ++    } else { +         const 
uint32_t *stage2; +-        const uint16_t *stage3, *results; +- +-        results=(const uint16_t *)mbcsTable->fromUnicodeBytes; +- +-        for(st1=0; st1<maxStage1; ++st1) { +-            st2=table[st1]; +-            if(st2>(maxStage1>>1)) { +-                stage2=(const uint32_t *)table+st2; +-                for(st2=0; st2<64; ++st2) { +-                    if((st3=stage2[st2])!=0) { +-                        /* read the stage 3 block */ +-                        stage3=results+16*(uint32_t)(uint16_t)st3; ++        const uint8_t *stage3, *bytes; ++        uint32_t st3Multiplier; ++        uint32_t value; +  +-                        /* get the roundtrip flags for the stage 3 block */ +-                        st3>>=16; ++        bytes=mbcsTable->fromUnicodeBytes; +  +-                        /* +-                         * Add code points for which the roundtrip flag is set. +-                         * Once we get a set for fallback mappings, we have to check +-                         * non-roundtrip stage 3 results for whether they are 0. +-                         * See ucnv_MBCSFromUnicodeWithOffsets() for details. +-                         * +-                         * Ignore single-byte results (<0x100). 
+-                         */ +-                        do { +-                            if((st3&1)!=0 && *stage3>=0x100) { +-                                sa->add(sa->set, c); +-                            } +-                            st3>>=1; +-                            ++stage3; +-                        } while((++c&0xf)!=0); +-                    } else { +-                        c+=16; /* empty stage 3 block */ +-                    } +-                } +-            } else { +-                c+=1024; /* empty stage 2 block */ +-            } ++        switch(mbcsTable->outputType) { ++        case MBCS_OUTPUT_3: ++        case MBCS_OUTPUT_4_EUC: ++            st3Multiplier=3; ++            break; ++        case MBCS_OUTPUT_4: ++            st3Multiplier=4; ++            break; ++        default: ++            st3Multiplier=2; ++            break; +         } +-    } else { +-        const uint32_t *stage2; +  +         for(st1=0; st1<maxStage1; ++st1) { +             st2=table[st1]; +@@ -541,6 +522,9 @@ +                 stage2=(const uint32_t *)table+st2; +                 for(st2=0; st2<64; ++st2) { +                     if((st3=stage2[st2])!=0) { ++                        /* read the stage 3 block */ ++                        stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3; ++ +                         /* get the roundtrip flags for the stage 3 block */ +                         st3>>=16; +  +@@ -550,12 +534,49 @@ +                          * non-roundtrip stage 3 results for whether they are 0. +                          * See ucnv_MBCSFromUnicodeWithOffsets() for details. 
+                          */ +-                        do { +-                            if(st3&1) { +-                                sa->add(sa->set, c); +-                            } +-                            st3>>=1; +-                        } while((++c&0xf)!=0); ++                        switch(filter) { ++                        case UCNV_SET_FILTER_NONE: ++                            do { ++                                if(st3&1) { ++                                    sa->add(sa->set, c); ++                                } ++                                st3>>=1; ++                            } while((++c&0xf)!=0); ++                            break; ++                        case UCNV_SET_FILTER_DBCS_ONLY: ++                             /* Ignore single-byte results (<0x100). */ ++                            do { ++                                if((st3&1)!=0 && *((const uint16_t *)stage3)>=0x100) { ++                                    sa->add(sa->set, c); ++                                } ++                                st3>>=1; ++                                stage3+=2;  /* +=st3Multiplier */ ++                            } while((++c&0xf)!=0); ++                            break; ++                        case UCNV_SET_FILTER_2022_CN: ++                             /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */ ++                            do { ++                                if((st3&1)!=0 && ((value=*stage3)==0x81 || value==0x82)) { ++                                    sa->add(sa->set, c); ++                                } ++                                st3>>=1; ++                                stage3+=3;  /* +=st3Multiplier */ ++                            } while((++c&0xf)!=0); ++                            break; ++                        case UCNV_SET_FILTER_SJIS: ++                             /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. 
*/ ++                            do { ++                                if((st3&1)!=0 && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) { ++                                    sa->add(sa->set, c); ++                                } ++                                st3>>=1; ++                                stage3+=2;  /* +=st3Multiplier */ ++                            } while((++c&0xf)!=0); ++                            break; ++                        default: ++                            *pErrorCode=U_INTERNAL_PROGRAM_ERROR; ++                            return; ++                        } +                     } else { +                         c+=16; /* empty stage 3 block */ +                     } +@@ -569,6 +590,19 @@ +     ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode); + } +  ++U_CFUNC void ++ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, ++                                 const USetAdder *sa, ++                                 UConverterUnicodeSet which, ++                                 UErrorCode *pErrorCode) { ++    ucnv_MBCSGetFilteredUnicodeSetForUnicode( ++        sharedData, sa, which, ++        sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? ++            UCNV_SET_FILTER_DBCS_ONLY : ++            UCNV_SET_FILTER_NONE, ++        pErrorCode); ++} ++ + static void + ucnv_MBCSGetUnicodeSet(const UConverter *cnv, +                    const USetAdder *sa, +diff -ru icu.5483/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h +--- icu.5483/source/common/ucnvmbcs.h	2009-06-02 12:47:41.000000000 +0100 ++++ icu/source/common/ucnvmbcs.h	2009-06-02 12:48:08.000000000 +0100 +@@ -363,6 +363,7 @@ + ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, +                           UErrorCode *pErrorCode); +  ++#if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. 
*/ + /* +  * Internal function returning a UnicodeSet for toUnicode() conversion. +  * Currently only used for ISO-2022-CN, and only handles roundtrip mappings. +@@ -377,6 +378,7 @@ +                            UConverterUnicodeSet which, +                            uint8_t state, int32_t lowByte, int32_t highByte, +                            UErrorCode *pErrorCode); ++#endif +  + /* +  * Internal function returning a UnicodeSet for toUnicode() conversion. +@@ -388,9 +390,30 @@ +  */ + U_CFUNC void + ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, +-                             const USetAdder *sa, +-                             UConverterUnicodeSet which, +-                             UErrorCode *pErrorCode); ++                                 const USetAdder *sa, ++                                 UConverterUnicodeSet which, ++                                 UErrorCode *pErrorCode); ++ ++typedef enum UConverterSetFilter { ++    UCNV_SET_FILTER_NONE, ++    UCNV_SET_FILTER_DBCS_ONLY, ++    UCNV_SET_FILTER_2022_CN, ++    UCNV_SET_FILTER_SJIS, ++    UCNV_SET_FILTER_COUNT ++} UConverterSetFilter; ++ ++/* ++ * Same as ucnv_MBCSGetUnicodeSetForUnicode() but ++ * the set can be filtered by encoding scheme. ++ * Used by stateful converters which share regular conversion tables ++ * but only use a subset of their mappings. 
++ */ ++U_CFUNC void ++ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, ++                                         const USetAdder *sa, ++                                         UConverterUnicodeSet which, ++                                         UConverterSetFilter filter, ++                                         UErrorCode *pErrorCode); +  + #endif +  +diff -ru icu.5483/source/test/cintltst/nucnvtst.c icu/source/test/cintltst/nucnvtst.c +--- icu.5483/source/test/cintltst/nucnvtst.c	2009-06-02 12:47:25.000000000 +0100 ++++ icu/source/test/cintltst/nucnvtst.c	2009-06-02 12:58:02.000000000 +0100 +@@ -3202,7 +3202,7 @@ +         0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x000D, 0x000A, +         0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x000D, 0x000A, +         0x3005, 0x3006, 0x3007, 0x30FC, 0x2015, 0x2010, 0xFF0F, 0x005C, 0x000D, 0x000A, +-        0x301C, 0x2016, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A, ++        0x3013, 0x2018, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A, +         0x201D, 0x3014, 0x000D, 0x000A, +         0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A, +         0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A, +@@ -3730,7 +3730,7 @@ +         0x52C8, 0x52CC, 0x52CF, 0x52D1, 0x52D4, 0x52D6, 0x52DB, 0x52DC, 0x000D, 0x000A, +         0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x000D, 0x000A, +         0x3005, 0x3006, 0x3007, 0x30FC, 0x2015, 0x2010, 0xFF0F, 0x005C, 0x000D, 0x000A, +-        0x301C, 0x2016, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A, ++        0x3013, 0x2018, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A, +         0x201D, 0x000D, 0x000A, +         0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A, +         0x4F94, 0x4F97, 0x52BA, 0x52BB, 0x52BD, 0x52C0, 0x52C4, 0x52C6, 0x000D, 0x000A, +diff -ru 
icu.5483/source/test/cintltst/udatatst.c icu/source/test/cintltst/udatatst.c +--- icu.5483/source/test/cintltst/udatatst.c	2009-06-02 12:47:25.000000000 +0100 ++++ icu/source/test/cintltst/udatatst.c	2009-06-02 13:09:15.000000000 +0100 +@@ -1260,6 +1260,11 @@ +     {"gb18030",                  "cnv", ucnv_swap}, +     /* MBCS conversion table file with extension */ +     {"*test4x",                  "cnv", ucnv_swap}, ++    /*  ++     * MBCS conversion table file without extension,  ++     * to test swapping and preflighting of UTF-8-friendly mbcsIndex[].  ++     */  ++    {"jisx-212",                 "cnv", ucnv_swap},  + #endif +  + #if !UCONFIG_NO_CONVERSION +diff -ru icu.5483/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt +--- icu.5483/source/test/testdata/conversion.txt	2009-06-02 12:47:25.000000000 +0100 ++++ icu/source/test/testdata/conversion.txt	2009-06-02 12:49:51.000000000 +0100 +@@ -48,6 +48,15 @@ +     toUnicode { +       Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" } +       Cases { ++        // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and ++        // using the Shift-JIS table for JIS X 0208 (ticket #5797) ++        { ++          "ISO-2022-JP", ++          :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 }, ++          "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e", ++          :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 }, ++          :int{1}, :int{1}, "", "?", :bin{""} ++        } +         // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets() +         { +           "ISO-8859-3", +@@ -495,6 +504,15 @@ +         } +         { "UTF-16BE", :bin{ 00 }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ 00 } } +         { "UTF-16BE", :bin{ d800dc }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ d800dc } } ++        // 
improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and ++        // using the Shift-JIS table for JIS X 0208 (ticket #5797) ++        { ++          "ISO-2022-JP", ++          "\u203e\xa5\u4e00\ufa10\u6f3e\u0391", ++          :bin{       1b284a7e5c1b2442306c222e5f2126211b2842 }, ++          :intvector{ 0,0,0,0,1,2,2,2,2,2,3,3,4,4,5,5,5,5,5 }, ++          :int{1}, :int{0}, "", "?=\u3013", ""  // U+3013 Geta Mark converts to 222e ++        } +         // Verify that mappings that would result in byte values outside 20..7F (for SBCS) +         // or 21..7E (for DBCS) are not used. +         // ibm-9005_X110-2007.ucm (ISO 8859-7, <ESC>.F=1b2e46): +@@ -1273,13 +1291,13 @@ +         // versions of ISO-2022-JP +         { +           "ISO-2022-JP", +-          "[\x00-\x0d\x10-\x1a\x1c-\x7f\u0391-\u03a1\uff61-\uff9f\u4e00\u4e01\uffe5]", +-          "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\uffe6-\U0010ffff]", ++          "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e00\u4e01\uffe5]", ++          "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\ufa2d\uffe6-\U0010ffff]", +           :int{0} +         } +         { +           "ISO-2022-JP-2", +-          "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0385-\u038a\u0390-\u03a1\uff61-\uff9f\u4e00-\u4e05\uffe6]", ++          "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]", +           "[\x0e\x0f\x1b\uffe7-\U0010ffff]", +           :int{0} +         } diff --git a/icu.icu6001.backport.patch b/icu.icu6001.backport.patch new file mode 100644 index 0000000..11b2ee3 --- /dev/null +++ b/icu.icu6001.backport.patch @@ -0,0 +1,741 @@ +diff -ru icu.5797/source/common/ucnv2022.c icu/source/common/ucnv2022.c +--- icu.5797/source/common/ucnv2022.c	2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/ucnv2022.c	2009-06-02 15:05:10.000000000 +0100 +@@ -3399,11 +3399,19 @@ +             /* include ASCII for JP */ +   
          sa->addRange(sa->set, 0, 0x7f); +         } +-        if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) { ++        if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { +             /* +-             * TODO(markus): If and when ucnv_getUnicodeSet() supports fallbacks, +-             * we need to include half-width Katakana for all JP variants because +-             * JIS X 0208 has hardcoded fallbacks for them. ++             * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0 ++             * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8) ++             * use half-width Katakana. ++             * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode) ++             * half-width Katakana via the ESC ( I sequence. ++             * However, we only emit (fromUnicode) half-width Katakana according to the ++             * definition of each variant. ++             * ++             * When including fallbacks, ++             * we need to include half-width Katakana Unicode code points for all JP variants because ++             * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana). +              */ +             /* include half-width Katakana for JP */ +             sa->addRange(sa->set, HWKANA_START, HWKANA_END); +@@ -3457,6 +3465,12 @@ +                  * corresponding to JIS X 0208. +                  */ +                 filter=UCNV_SET_FILTER_SJIS; ++            } else if(i==KSC5601) { ++                /* ++                 * Some of the KSC 5601 tables (convrtrs.txt has this alias on multiple tables) ++                 * are broader than GR94. 
++                 */ ++                filter=UCNV_SET_FILTER_GR94DBCS; +             } else { +                 filter=UCNV_SET_FILTER_NONE; +             } +@@ -3472,6 +3486,9 @@ +     sa->remove(sa->set, 0x0e); +     sa->remove(sa->set, 0x0f); +     sa->remove(sa->set, 0x1b); ++ ++    /* ISO 2022 converters do not convert C1 controls either */ ++    sa->removeRange(sa->set, 0x80, 0x9f); + } +  + static const UConverterImpl _ISO2022Impl={ +diff -ru icu.5797/source/common/ucnv_ext.c icu/source/common/ucnv_ext.c +--- icu.5797/source/common/ucnv_ext.c	2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/ucnv_ext.c	2009-06-02 15:12:21.000000000 +0100 +@@ -946,7 +946,7 @@ + ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData, +                             const int32_t *cx, +                             const USetAdder *sa, +-                            UConverterUnicodeSet which, ++                            UBool useFallback, +                             int32_t minLength, +                             UChar32 c, +                             UChar s[UCNV_EXT_MAX_UCHARS], int32_t length, +@@ -966,7 +966,7 @@ +     value=*fromUSectionValues++; +  +     if( value!=0 && +-        UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) && ++        (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || useFallback) && +         UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength +     ) { +         if(c>=0) { +@@ -987,12 +987,14 @@ +             /* no mapping, do nothing */ +         } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { +             ucnv_extGetUnicodeSetString( +-                sharedData, cx, sa, which, minLength, ++                sharedData, cx, sa, useFallback, minLength, +                 U_SENTINEL, s, length+1, +                 (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value), +                 pErrorCode); +-        } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))== +-                           UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) && ++   
     } else if((useFallback ? ++                      (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 : ++                      ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))== ++                          UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) && +                   UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength +         ) { +             sa->addString(sa->set, s, length+1); +@@ -1004,6 +1006,7 @@ + ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData, +                       const USetAdder *sa, +                       UConverterUnicodeSet which, ++                      UConverterSetFilter filter, +                       UErrorCode *pErrorCode) { +     const int32_t *cx; +     const uint16_t *stage12, *stage3, *ps2, *ps3; +@@ -1011,6 +1014,7 @@ +  +     uint32_t value; +     int32_t st1, stage1Length, st2, st3, minLength; ++    UBool useFallback; +  +     UChar s[UCNV_EXT_MAX_UCHARS]; +     UChar32 c; +@@ -1027,12 +1031,20 @@ +  +     stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]; +  ++    useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); ++ +     /* enumerate the from-Unicode trie table */ +     c=0; /* keep track of the current code point while enumerating */ +  +-    if(sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) { ++    if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY || ++        filter==UCNV_SET_FILTER_DBCS_ONLY || ++        filter==UCNV_SET_FILTER_SJIS || ++        filter==UCNV_SET_FILTER_GR94DBCS ++    ) { +         /* DBCS-only, ignore single-byte results */ +         minLength=2; ++    } else if(filter==UCNV_SET_FILTER_2022_CN) { ++        minLength=3; +     } else { +         minLength=1; +     } +@@ -1064,14 +1076,41 @@ +                             length=0; +                             U16_APPEND_UNSAFE(s, length, c); +                             ucnv_extGetUnicodeSetString( +-                                sharedData, cx, sa, which, minLength, ++                                sharedData, cx, sa, 
useFallback, minLength, +                                 c, s, length, +                                 (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value), +                                 pErrorCode); +-                        } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))== +-                                           UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) && ++                        } else if((useFallback ? ++                                      (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 : ++                                      ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))== ++                                          UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) && +                                   UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength +                         ) { ++                            switch(filter) { ++                            case UCNV_SET_FILTER_2022_CN: ++                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) { ++                                    continue; ++                                } ++                                break; ++                            case UCNV_SET_FILTER_SJIS: ++                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (value=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) { ++                                    continue; ++                                } ++                                break; ++                            case UCNV_SET_FILTER_GR94DBCS: ++                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && ++                                     (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value)) - 0xa1a1)<=(0xfefe - 0xa1a1) && ++                                     (uint8_t)(value - 0xa1)<=(0xfe - 0xa1))) { ++                                    continue; ++                                } ++                                break; ++         
                   default: ++                                /* ++                                 * UCNV_SET_FILTER_NONE, ++                                 * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength ++                                 */ ++                                break; ++                            } +                             sa->add(sa->set, c); +                         } +                     } while((++c&0xf)!=0); +diff -ru icu.5797/source/common/ucnv_ext.h icu/source/common/ucnv_ext.h +--- icu.5797/source/common/ucnv_ext.h	2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/ucnv_ext.h	2009-06-02 15:05:10.000000000 +0100 +@@ -382,10 +382,20 @@ +                            UConverterFromUnicodeArgs *pArgs, int32_t srcIndex, +                            UErrorCode *pErrorCode); +  ++/* ++ * Add code points and strings to the set according to the extension mappings. ++ * Limitation on the UConverterSetFilter: ++ * The filters currently assume that they are used with 1:1 mappings. ++ * They only apply to single input code points, and then they pass through ++ * only mappings with single-charset-code results. ++ * For example, the Shift-JIS filter only works for 2-byte results and tests ++ * that those 2 bytes are in the JIS X 0208 range of Shift-JIS. 
++ */ + U_CFUNC void + ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData, +                       const USetAdder *sa, +                       UConverterUnicodeSet which, ++                      UConverterSetFilter filter, +                       UErrorCode *pErrorCode); +  + /* toUnicode helpers -------------------------------------------------------- */ +diff -ru icu.5797/source/common/ucnvhz.c icu/source/common/ucnvhz.c +--- icu.5797/source/common/ucnvhz.c	2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/ucnvhz.c	2009-06-02 15:05:10.000000000 +0100 +@@ -528,6 +528,7 @@ +     sa->add(sa->set, 0x7e); +  +     /* add all of the code points that the sub-converter handles */ ++    /* ucnv_MBCSGetFilteredUnicodeSetForUnicode(((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, sa, which, UCNV_SET_FILTER_GR94DBCS, pErrorCode); */ +     ((UConverterDataHZ*)cnv->extraInfo)-> +         gbConverter->sharedData->impl-> +             getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter, +diff -ru icu.5797/source/common/ucnv_lmb.c icu/source/common/ucnv_lmb.c +--- icu.5797/source/common/ucnv_lmb.c	2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/ucnv_lmb.c	2009-06-02 15:09:13.000000000 +0100 +@@ -536,7 +536,7 @@ +     NULL,\ +     NULL,\ +     _LMBCSSafeClone,\ +-    _LMBCSGetUnicodeSet\ ++    ucnv_getCompleteUnicodeSet\ + };\ + static const UConverterStaticData _LMBCSStaticData##n={\ +   sizeof(UConverterStaticData),\ +@@ -662,15 +662,14 @@ +     return &newLMBCS->cnv; + } +  +-static void +-_LMBCSGetUnicodeSet(const UConverter *cnv, +-                   const USetAdder *sa, +-                   UConverterUnicodeSet which, +-                   UErrorCode *pErrorCode) { +-    /* all but U+F6xx, see LMBCS explanation above (search for F6xx) */ +-    sa->addRange(sa->set, 0, 0xf5ff); +-    sa->addRange(sa->set, 0xf700, 0x10ffff); +-} ++/* ++ * There used to be a _LMBCSGetUnicodeSet() function here (up to svn revision 20117) ++ * 
which added all code points except for U+F6xx ++ * because those cannot be represented in the Unicode group. ++ * However, it turns out that windows-950 has roundtrips for all of U+F6xx ++ * which means that LMBCS can convert all Unicode code points after all. ++ * We now simply use ucnv_getCompleteUnicodeSet(). ++ */ +  + /*  +    Here's the basic helper function that we use when converting from +diff -ru icu.5797/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c +--- icu.5797/source/common/ucnvmbcs.c	2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/ucnvmbcs.c	2009-06-02 15:12:40.000000000 +0100 +@@ -463,9 +463,23 @@ +  +     if(mbcsTable->outputType==MBCS_OUTPUT_1) { +         const uint16_t *stage2, *stage3, *results; ++        uint16_t minValue; +  +         results=(const uint16_t *)mbcsTable->fromUnicodeBytes; +  ++        /* ++         * Set a threshold variable for selecting which mappings to use. ++         * See ucnv_MBCSSingleFromBMPWithOffsets() and ++         * MBCS_SINGLE_RESULT_FROM_U() for details. ++         */ ++        if(which==UCNV_ROUNDTRIP_SET) { ++            /* use only roundtrips */ ++            minValue=0xf00; ++        } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ { ++            /* use all roundtrip and fallback results */ ++            minValue=0x800; ++        } ++ +         for(st1=0; st1<maxStage1; ++st1) { +             st2=table[st1]; +             if(st2>maxStage1) { +@@ -475,15 +489,8 @@ +                         /* read the stage 3 block */ +                         stage3=results+st3; +  +-                        /* +-                         * Add code points for which the roundtrip flag is set. +-                         * Once we get a set for fallback mappings, we have to use +-                         * a threshold variable with a value of 0x800. +-                         * See ucnv_MBCSSingleFromBMPWithOffsets() and +-                         * MBCS_SINGLE_RESULT_FROM_U() for details. 
+-                         */ +                         do { +-                            if(*stage3++>=0xf00) { ++                            if(*stage3++>=minValue) { +                                 sa->add(sa->set, c); +                             } +                         } while((++c&0xf)!=0); +@@ -500,9 +507,12 @@ +         const uint8_t *stage3, *bytes; +         uint32_t st3Multiplier; +         uint32_t value; ++        UBool useFallback; +  +         bytes=mbcsTable->fromUnicodeBytes; +  ++        useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); ++ +         switch(mbcsTable->outputType) { +         case MBCS_OUTPUT_3: +         case MBCS_OUTPUT_4_EUC: +@@ -529,9 +539,8 @@ +                         st3>>=16; +  +                         /* +-                         * Add code points for which the roundtrip flag is set. +-                         * Once we get a set for fallback mappings, we have to check +-                         * non-roundtrip stage 3 results for whether they are 0. ++                         * Add code points for which the roundtrip flag is set, ++                         * or which map to non-zero bytes if we use fallbacks. +                          * See ucnv_MBCSFromUnicodeWithOffsets() for details. 
+                          */ +                         switch(filter) { +@@ -539,6 +548,23 @@ +                             do { +                                 if(st3&1) { +                                     sa->add(sa->set, c); ++                                    stage3+=st3Multiplier; ++                                } else if(useFallback) { ++                                    uint8_t b=0; ++                                    switch(st3Multiplier) { ++                                    case 4: ++                                        b|=*stage3++; ++                                    case 3: ++                                        b|=*stage3++; ++                                    case 2: ++                                        b|=stage3[0]|stage3[1]; ++                                        stage3+=2; ++                                    default: ++                                        break; ++                                    } ++                                    if(b!=0) { ++                                        sa->add(sa->set, c); ++                                    } +                                 } +                                 st3>>=1; +                             } while((++c&0xf)!=0); +@@ -546,7 +572,7 @@ +                         case UCNV_SET_FILTER_DBCS_ONLY: +                              /* Ignore single-byte results (<0x100). */ +                             do { +-                                if((st3&1)!=0 && *((const uint16_t *)stage3)>=0x100) { ++                                if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) { +                                     sa->add(sa->set, c); +                                 } +                                 st3>>=1; +@@ -556,7 +582,7 @@ +                         case UCNV_SET_FILTER_2022_CN: +                              /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. 
*/ +                             do { +-                                if((st3&1)!=0 && ((value=*stage3)==0x81 || value==0x82)) { ++                                if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) { +                                     sa->add(sa->set, c); +                                 } +                                 st3>>=1; +@@ -566,7 +592,20 @@ +                         case UCNV_SET_FILTER_SJIS: +                              /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */ +                             do { +-                                if((st3&1)!=0 && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) { ++                                if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) { ++                                    sa->add(sa->set, c); ++                                } ++                                st3>>=1; ++                                stage3+=2;  /* +=st3Multiplier */ ++                            } while((++c&0xf)!=0); ++                            break; ++                        case UCNV_SET_FILTER_GR94DBCS: ++                            /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). 
*/ ++                            do { ++                                if( ((st3&1)!=0 || useFallback) && ++                                    (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) && ++                                    (uint8_t)(value - 0xa1)<=(0xfe - 0xa1) ++                                ) { +                                     sa->add(sa->set, c); +                                 } +                                 st3>>=1; +@@ -587,7 +626,7 @@ +         } +     } +  +-    ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode); ++    ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode); + } +  + U_CFUNC void +diff -ru icu.5797/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h +--- icu.5797/source/common/ucnvmbcs.h	2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/ucnvmbcs.h	2009-06-02 15:05:10.000000000 +0100 +@@ -399,6 +399,7 @@ +     UCNV_SET_FILTER_DBCS_ONLY, +     UCNV_SET_FILTER_2022_CN, +     UCNV_SET_FILTER_SJIS, ++    UCNV_SET_FILTER_GR94DBCS, +     UCNV_SET_FILTER_COUNT + } UConverterSetFilter; +  +diff -ru icu.5797/source/common/ucnv_set.c icu/source/common/ucnv_set.c +--- icu.5797/source/common/ucnv_set.c	2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/ucnv_set.c	2009-06-02 15:05:10.000000000 +0100 +@@ -1,7 +1,7 @@ + /* + ******************************************************************************* + * +-*   Copyright (C) 2003-2005, International Business Machines ++*   Copyright (C) 2003-2007, International Business Machines + *   Corporation and others.  All Rights Reserved. 
+ * + ******************************************************************************* +@@ -52,7 +52,8 @@ +             uset_add, +             uset_addRange, +             uset_addString, +-            uset_remove ++            uset_remove, ++            uset_removeRange +         }; +         sa.set=setFillIn; +  +diff -ru icu.5797/source/common/unicode/ucnv.h icu/source/common/unicode/ucnv.h +--- icu.5797/source/common/unicode/ucnv.h	2009-06-02 14:45:30.000000000 +0100 ++++ icu/source/common/unicode/ucnv.h	2009-06-02 15:05:10.000000000 +0100 +@@ -870,6 +870,8 @@ + typedef enum UConverterUnicodeSet { +     /** Select the set of roundtrippable Unicode code points. @stable ICU 2.6 */ +     UCNV_ROUNDTRIP_SET, ++    /** Select the set of Unicode code points with roundtrip or fallback mappings. @draft ICU 4.0 */ ++    UCNV_ROUNDTRIP_AND_FALLBACK_SET, +     /** Number of UConverterUnicodeSet selectors. @stable ICU 2.6 */ +     UCNV_SET_COUNT + } UConverterUnicodeSet; +@@ -878,11 +880,16 @@ + /** +  * Returns the set of Unicode code points that can be converted by an ICU converter. +  * +- * The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET): ++ * Returns one of several kinds of set: ++ * ++ * 1. UCNV_ROUNDTRIP_SET ++ * +  * The set of all Unicode code points that can be roundtrip-converted +- * (converted without any data loss) with the converter. ++ * (converted without any data loss) with the converter (ucnv_fromUnicode()). +  * This set will not include code points that have fallback mappings +  * or are only the result of reverse fallback mappings. ++ * This set will also not include PUA code points with fallbacks, although ++ * ucnv_fromUnicode() will always use those mappings despite ucnv_setFallback(). 
+  * See UTR #22 "Character Mapping Markup Language" +  * at http://www.unicode.org/reports/tr22/ +  * +@@ -893,6 +900,12 @@ +  *   by comparing its roundtrip set with the set of ExemplarCharacters from +  *   ICU's locale data or other sources +  * ++ * 2. UCNV_ROUNDTRIP_AND_FALLBACK_SET ++ * ++ * The set of all Unicode code points that can be converted with the converter (ucnv_fromUnicode()) ++ * when fallbacks are turned on (see ucnv_setFallback()). ++ * This set includes all code points with roundtrips and fallbacks (but not reverse fallbacks). ++ * +  * In the future, there may be more UConverterUnicodeSet choices to select +  * sets with different properties. +  * +diff -ru icu.5797/source/common/uset_imp.h icu/source/common/uset_imp.h +--- icu.5797/source/common/uset_imp.h	2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/uset_imp.h	2009-06-02 15:05:10.000000000 +0100 +@@ -36,6 +36,9 @@ + typedef void U_CALLCONV + USetRemove(USet *set, UChar32 c); +  ++typedef void U_CALLCONV ++USetRemoveRange(USet *set, UChar32 start, UChar32 end); ++ + /** +  * Interface for adding items to a USet, to keep low-level code from +  * statically depending on the USet implementation. 
+@@ -47,6 +50,7 @@ +     USetAddRange *addRange; +     USetAddString *addString; +     USetRemove *remove; ++    USetRemoveRange *removeRange; + }; + typedef struct USetAdder USetAdder; +  +diff -ru icu.5797/source/test/intltest/convtest.cpp icu/source/test/intltest/convtest.cpp +--- icu.5797/source/test/intltest/convtest.cpp	2009-06-02 14:45:18.000000000 +0100 ++++ icu/source/test/intltest/convtest.cpp	2009-06-02 15:09:31.000000000 +0100 +@@ -59,6 +59,7 @@ +         case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break; +         case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break; +         case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break; ++        case 3: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break; +         default: name=""; break; //needed to end loop +     } + } +@@ -454,6 +455,183 @@ +     } + } +  ++U_CDECL_BEGIN ++static void U_CALLCONV ++getUnicodeSetCallback(const void *context, ++                      UConverterFromUnicodeArgs *fromUArgs, ++                      const UChar* codeUnits, ++                      int32_t length, ++                      UChar32 codePoint, ++                      UConverterCallbackReason reason, ++                      UErrorCode *pErrorCode) { ++    if(reason<=UCNV_IRREGULAR) { ++        ((UnicodeSet *)context)->remove(codePoint);  // the converter cannot convert this code point ++        *pErrorCode=U_ZERO_ERROR;                    // skip ++    }  // else ignore the reset, close and clone calls. ++} ++U_CDECL_END ++ ++// Compare ucnv_getUnicodeSet() with the set of characters that can be converted. ++void ++ConversionTest::TestGetUnicodeSet2() { ++    // Build a string with all code points. 
++    UChar32 cpLimit; ++    int32_t s0Length; ++    if(quick) { ++        cpLimit=s0Length=0x10000;  // BMP only ++    } else { ++        cpLimit=0x110000; ++        s0Length=0x10000+0x200000;  // BMP + surrogate pairs ++    } ++    UChar *s0=new UChar[s0Length]; ++    if(s0==NULL) { ++        return; ++    } ++    UChar *s=s0; ++    UChar32 c; ++    UChar c2; ++    // low BMP ++    for(c=0; c<=0xd7ff; ++c) { ++        *s++=(UChar)c; ++    } ++    // trail surrogates ++    for(c=0xdc00; c<=0xdfff; ++c) { ++        *s++=(UChar)c; ++    } ++    // lead surrogates ++    // (after trails so that there is not even one surrogate pair in between) ++    for(c=0xd800; c<=0xdbff; ++c) { ++        *s++=(UChar)c; ++    } ++    // high BMP ++    for(c=0xe000; c<=0xffff; ++c) { ++        *s++=(UChar)c; ++    } ++    // supplementary code points = surrogate pairs ++    if(cpLimit==0x110000) { ++        for(c=0xd800; c<=0xdbff; ++c) { ++            for(c2=0xdc00; c2<=0xdfff; ++c2) { ++                *s++=(UChar)c; ++                *s++=c2; ++            } ++        } ++    } ++ ++    static const char *const cnvNames[]={ ++        "UTF-8", ++        "UTF-7", ++        "UTF-16", ++        "US-ASCII", ++        "ISO-8859-1", ++        "windows-1252", ++        "Shift-JIS", ++        "ibm-1390",  // EBCDIC_STATEFUL table ++        "ibm-16684",  // DBCS-only extension table based on EBCDIC_STATEFUL table ++        // "HZ", TODO(markus): known bug, the set incorrectly contains [\u02CA\u02CB\u02D9\u2010\u2013\u2015...] 
++        "ISO-2022-JP", ++        "JIS7", ++        "ISO-2022-CN", ++        "ISO-2022-CN-EXT", ++        "LMBCS" ++    }; ++    char buffer[1024]; ++    int32_t i; ++    for(i=0; i<LENGTHOF(cnvNames); ++i) { ++        UErrorCode errorCode=U_ZERO_ERROR; ++        UConverter *cnv=cnv_open(cnvNames[i], errorCode); ++        if(U_FAILURE(errorCode)) { ++            errln("failed to open converter %s - %s", cnvNames[i], u_errorName(errorCode)); ++            continue; ++        } ++        UnicodeSet expected; ++        ucnv_setFromUCallBack(cnv, getUnicodeSetCallback, &expected, NULL, NULL, &errorCode); ++        if(U_FAILURE(errorCode)) { ++            errln("failed to set the callback on converter %s - %s", cnvNames[i], u_errorName(errorCode)); ++            ucnv_close(cnv); ++            continue; ++        } ++        UConverterUnicodeSet which; ++        for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUnicodeSet)((int)which+1)) { ++            if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { ++                ucnv_setFallback(cnv, TRUE); ++            } ++            expected.add(0, cpLimit-1); ++            s=s0; ++            UBool flush; ++            do { ++                char *t=buffer; ++                flush=(UBool)(s==s0+s0Length); ++                ucnv_fromUnicode(cnv, &t, buffer+sizeof(buffer), (const UChar **)&s, s0+s0Length, NULL, flush, &errorCode); ++                if(U_FAILURE(errorCode)) { ++                    if(errorCode==U_BUFFER_OVERFLOW_ERROR) { ++                        errorCode=U_ZERO_ERROR; ++                        continue; ++                    } else { ++                        break;  // unexpected error, should not occur ++                    } ++                } ++            } while(!flush); ++            UnicodeSet set; ++            ucnv_getUnicodeSet(cnv, (USet *)&set, which, &errorCode); ++            if(cpLimit<0x110000) { ++                set.remove(cpLimit, 0x10ffff); ++            } ++            
if(which==UCNV_ROUNDTRIP_SET) { ++                // ignore PUA code points because they will be converted even if they ++                // are fallbacks and when other fallbacks are turned off, ++                // but ucnv_getUnicodeSet(UCNV_ROUNDTRIP_SET) delivers true roundtrips ++                expected.remove(0xe000, 0xf8ff); ++                expected.remove(0xf0000, 0xffffd); ++                expected.remove(0x100000, 0x10fffd); ++                set.remove(0xe000, 0xf8ff); ++                set.remove(0xf0000, 0xffffd); ++                set.remove(0x100000, 0x10fffd); ++            } ++            if(set!=expected) { ++                // First try to see if we have different sets because ucnv_getUnicodeSet() ++                // added strings: The above conversion method does not tell us what strings might be convertible. ++                // Remove strings from the set and compare again. ++                // Unfortunately, there are no good, direct set methods for finding out whether there are strings ++                // in the set, nor for enumerating or removing just them. ++                // Intersect all code points with the set. The intersection will not contain strings. ++                UnicodeSet temp(0, 0x10ffff); ++                temp.retainAll(set); ++                set=temp; ++            } ++            if(set!=expected) { ++                UnicodeSet diffSet; ++                UnicodeString out; ++ ++                // are there items that must be in the set but are not? 
++                (diffSet=expected).removeAll(set); ++                if(!diffSet.isEmpty()) { ++                    diffSet.toPattern(out, TRUE); ++                    if(out.length()>100) { ++                        out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis)); ++                    } ++                    errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d", ++                            cnvNames[i], which); ++                    errln(out); ++                } ++ ++                // are there items that must not be in the set but are? ++                (diffSet=set).removeAll(expected); ++                if(!diffSet.isEmpty()) { ++                    diffSet.toPattern(out, TRUE); ++                    if(out.length()>100) { ++                        out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis)); ++                    } ++                    errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - which set: %d", ++                            cnvNames[i], which); ++                    errln(out); ++                } ++            } ++        } ++    } ++ ++    delete [] s0; ++} ++ + // open testdata or ICU data converter ------------------------------------- *** +  + UConverter * +diff -ru icu.5797/source/test/intltest/convtest.h icu/source/test/intltest/convtest.h +--- icu.5797/source/test/intltest/convtest.h	2009-06-02 14:45:18.000000000 +0100 ++++ icu/source/test/intltest/convtest.h	2009-06-02 15:05:10.000000000 +0100 +@@ -64,6 +64,7 @@ +     void TestToUnicode(); +     void TestFromUnicode(); +     void TestGetUnicodeSet(); ++    void TestGetUnicodeSet2(); +  + private: +     UBool +diff -ru icu.5797/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt +--- icu.5797/source/test/testdata/conversion.txt	2009-06-02 14:45:18.000000000 +0100 ++++ icu/source/test/testdata/conversion.txt	2009-06-02 15:25:04.000000000 +0100 +@@ -1198,16 +1198,29 @@ +         // versions of 
ISO-2022-JP +         { +           "ISO-2022-JP", +-          "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e00\u4e01\uffe5]", +-          "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\ufa2d\uffe6-\U0010ffff]", ++          "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2015\u203e\u4e00\u4e01\uffe5]", ++          "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u2014\u301c\u4e02\u4e27-\u4e29\u4fe0\u663b\u9eb5\ufa0e-\ufa2d\uff61-\uff9f\uffe4\uffe6-\U0010ffff]", +           :int{0} +         } +         { +           "ISO-2022-JP-2", +-          "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]", +-          "[\x0e\x0f\x1b\uffe7-\U0010ffff]", ++          "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uffe6]", ++          "[\x0e\x0f\x1b\uff61-\uff9f\uffe4\uffe7-\U0010ffff]", +           :int{0} +         } ++        { ++          "JIS7", ++          "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uff61-\uff9f\uffe6]", ++          "[\x0e\x0f\x1b\uffe4\uffe7-\U0010ffff]", ++          :int{0} ++        } ++        // with fallbacks ++        { ++          "ISO-2022-JP", ++          "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2014\u2015\u203e\u301c\u4e00\u4e01\u4fe0\u9eb5\uff61-\uff9f\uffe5]", ++          "[\x0e\x0f\x1b\xa6\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\u663b\ufa0e-\ufa2d\uffe4\uffe6-\U0010ffff]", ++          :int{1} ++        } +  +         // versions of ISO-2022-CN +         { +@@ -1223,6 +1236,14 @@ +           :int{0} +         } +  ++        // LMBCS ++        { ++          "LMBCS", ++          "[\x00-\U0010ffff]", ++          "[]", ++          :int{0} ++        } ++ +         // DBCS-only +         { +           "ibm-971", diff --git a/icu.icu6002.backport.patch b/icu.icu6002.backport.patch new file mode 100644 index 
0000000..51f0d75 --- /dev/null +++ b/icu.icu6002.backport.patch @@ -0,0 +1,397 @@ +diff -ru icu.6001/source/common/ucnv_ext.c icu/source/common/ucnv_ext.c +--- icu.6001/source/common/ucnv_ext.c	2009-06-02 15:29:01.000000000 +0100 ++++ icu/source/common/ucnv_ext.c	2009-06-02 15:29:18.000000000 +0100 +@@ -1036,15 +1036,13 @@ +     /* enumerate the from-Unicode trie table */ +     c=0; /* keep track of the current code point while enumerating */ +  +-    if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY || +-        filter==UCNV_SET_FILTER_DBCS_ONLY || +-        filter==UCNV_SET_FILTER_SJIS || +-        filter==UCNV_SET_FILTER_GR94DBCS ++    if(filter==UCNV_SET_FILTER_2022_CN) { ++        minLength=3; ++    } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY || ++               filter!=UCNV_SET_FILTER_NONE +     ) { +         /* DBCS-only, ignore single-byte results */ +         minLength=2; +-    } else if(filter==UCNV_SET_FILTER_2022_CN) { +-        minLength=3; +     } else { +         minLength=1; +     } +@@ -1104,6 +1102,13 @@ +                                     continue; +                                 } +                                 break; ++                            case UCNV_SET_FILTER_HZ: ++                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && ++                                     (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfdfe - 0xa1a1) && ++                                     (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) { ++                                    continue; ++                                } ++                                break; +                             default: +                                 /* +                                  * UCNV_SET_FILTER_NONE, +diff -ru icu.6001/source/common/ucnvhz.c icu/source/common/ucnvhz.c +--- icu.6001/source/common/ucnvhz.c	2009-06-02 15:29:01.000000000 +0100 ++++ icu/source/common/ucnvhz.c	2009-06-02 15:29:15.000000000 +0100 +@@ -72,7 
+72,7 @@ +     cnv->extraInfo = uprv_malloc(sizeof(UConverterDataHZ)); +     if(cnv->extraInfo != NULL){ +         uprv_memset(cnv->extraInfo, 0, sizeof(UConverterDataHZ)); +-        ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("ibm-1386",errorCode); ++        ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("GBK",errorCode); +     } +     else { +         *errorCode = U_MEMORY_ALLOCATION_ERROR; +@@ -141,7 +141,7 @@ +     UChar *myTarget = args->target; +     const char *mySourceLimit = args->sourceLimit; +     UChar32 targetUniChar = 0x0000; +-    UChar mySourceChar = 0x0000; ++    int32_t mySourceChar = 0x0000; +     UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo); +     tempBuf[0]=0;  +     tempBuf[1]=0; +@@ -156,90 +156,71 @@ +              +             mySourceChar= (unsigned char) *mySource++; +  +-            switch(mySourceChar){ ++            if(args->converter->mode == UCNV_TILDE) { ++                /* second byte after ~ */ ++                args->converter->mode=0; ++                switch(mySourceChar) { +                 case 0x0A: +-                    if(args->converter->mode ==UCNV_TILDE){ +-                        args->converter->mode=0; +-                         +-                    } +-                    *(myTarget++)=(UChar)mySourceChar; ++                    /* no output for ~\n (line-continuation marker) */ +                     continue; +-             +                 case UCNV_TILDE: +-                    if(args->converter->mode ==UCNV_TILDE){ +-                        *(myTarget++)=(UChar)mySourceChar; +-                        args->converter->mode=0; +-                        continue; +-                         ++                    if(args->offsets) { ++                        args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2); +                     } +-                    else if(args->converter->toUnicodeStatus !=0){ +-                        
args->converter->mode=0; +-                        break; +-                    } +-                    else{ +-                        args->converter->mode = UCNV_TILDE; +-                        continue; +-                    } +-                 +-                 ++                    *(myTarget++)=(UChar)mySourceChar; ++                    continue; +                 case UCNV_OPEN_BRACE: +-                    if(args->converter->mode == UCNV_TILDE){ +-                        args->converter->mode=0; +-                        myData->isStateDBCS = TRUE; +-                        continue; +-                    } +-                    else{ +-                        break; +-                    } +-                +-                 ++                    myData->isStateDBCS = TRUE; ++                    continue; +                 case UCNV_CLOSE_BRACE: +-                    if(args->converter->mode == UCNV_TILDE){ +-                        args->converter->mode=0; +-                         myData->isStateDBCS = FALSE; +-                        continue; +-                    } +-                    else{ +-                        break; +-                    } +-                 ++                    myData->isStateDBCS = FALSE; ++                    continue; +                 default: +                      /* if the first byte is equal to TILDE and the trail byte +                      * is not a valid byte then it is an error condition +                      */ +-                    if(args->converter->mode == UCNV_TILDE){ +-                        args->converter->mode=0; +-                        mySourceChar= (UChar)(((UCNV_TILDE+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80)); +-                        goto SAVE_STATE; +-                    } +-                     ++                    mySourceChar = 0x7e00 | mySourceChar; ++                    targetUniChar = 0xffff; +                     break; +- +-            } +-              +-            
if(myData->isStateDBCS){ ++                } ++            } else if(myData->isStateDBCS) { +                 if(args->converter->toUnicodeStatus == 0x00){ +-                    args->converter->toUnicodeStatus = (UChar) mySourceChar; ++                    /* lead byte */ ++                    if(mySourceChar == UCNV_TILDE) { ++                        args->converter->mode = UCNV_TILDE; ++                    } else { ++                        /* add another bit to distinguish a 0 byte from not having seen a lead byte */ ++                        args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100); ++                    } +                     continue; +                 } +                 else{ +-                    tempBuf[0] = (char) (args->converter->toUnicodeStatus+0x80) ; +-                    tempBuf[1] = (char) (mySourceChar+0x80); +-                    mySourceChar= (UChar)(((args->converter->toUnicodeStatus+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80)); ++                    /* trail byte */ ++                    uint32_t leadByte = args->converter->toUnicodeStatus & 0xff; ++                    if( (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21) && ++                        (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21) ++                    ) { ++                        tempBuf[0] = (char) (leadByte+0x80) ; ++                        tempBuf[1] = (char) (mySourceChar+0x80); ++                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData, ++                            tempBuf, 2, args->converter->useFallback); ++                    } else { ++                        targetUniChar = 0xffff; ++                    } ++                    /* add another bit so that the code below writes 2 bytes in case of error */ ++                    mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar; +                     args->converter->toUnicodeStatus =0x00; +-                    targetUniChar = 
ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData, +-                        tempBuf, 2, args->converter->useFallback); +                 } +             } +             else{ +-                if(args->converter->fromUnicodeStatus == 0x00){ +-                    targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData, +-                        mySource - 1, 1, args->converter->useFallback); +-                } +-                else{ +-                    goto SAVE_STATE; ++                if(mySourceChar == UCNV_TILDE) { ++                    args->converter->mode = UCNV_TILDE; ++                    continue; ++                } else if(mySourceChar <= 0x7f) { ++                    targetUniChar = (UChar)mySourceChar;  /* ASCII */ ++                } else { ++                    targetUniChar = 0xffff; +                 } +- +             } +             if(targetUniChar < 0xfffe){ +                 if(args->offsets) { +@@ -248,26 +229,17 @@ +  +                 *(myTarget++)=(UChar)targetUniChar; +             } +-            else if(targetUniChar>=0xfffe){ +-SAVE_STATE: ++            else /* targetUniChar>=0xfffe */ { +                 if(targetUniChar == 0xfffe){ +                     *err = U_INVALID_CHAR_FOUND; +                 } +                 else{ +                     *err = U_ILLEGAL_CHAR_FOUND; +                 } +-                if(myData->isStateDBCS){ +-                    /* this should never occur since isStateDBCS is set to true  +-                     * only after tempBuf[0] and tempBuf[1] +-                     * are set to the input ..  
just to please BEAM  +-                     */ +-                    if(tempBuf[0]==0 || tempBuf[1]==0){ +-                        *err = U_INTERNAL_PROGRAM_ERROR; +-                    }else{ +-                        args->converter->toUBytes[0] = (uint8_t)(tempBuf[0]-0x80); +-                        args->converter->toUBytes[1] = (uint8_t)(tempBuf[1]-0x80); +-                        args->converter->toULength=2; +-                    } ++                if(mySourceChar > 0xff){ ++                    args->converter->toUBytes[0] = (uint8_t)(mySourceChar >> 8); ++                    args->converter->toUBytes[1] = (uint8_t)mySourceChar; ++                    args->converter->toULength=2; +                 } +                 else{ +                     args->converter->toUBytes[0] = (uint8_t)mySourceChar; +@@ -328,16 +300,21 @@ +                 escSeq = TILDE_ESCAPE; +                 CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex); +                 continue; +-            } +-            else{ ++            } else if(mySourceChar <= 0x7f) { ++                length = 1; ++                targetUniChar = mySourceChar; ++            } else { +                 length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->sharedData, +                     mySourceChar,&targetUniChar,args->converter->useFallback); +- +-            } +-            /* only DBCS or SBCS characters are expected*/ +-            /* DB haracters with high bit set to 1 are expected */ +-            if(length > 2 || length==0 ||(((targetUniChar & 0x8080) != 0x8080)&& length==2)){ +-                targetUniChar= missingCharMarker; ++                /* we can only use lead bytes 21..7D and trail bytes 21..7E */ ++                if( length == 2 && ++                    (uint16_t)(targetUniChar - 0xa1a1) <= (0xfdfe - 0xa1a1) && ++                    (uint8_t)(targetUniChar - 0xa1) <= (0xfe - 0xa1) ++                ) { ++                    targetUniChar -= 
0x8080; ++                } else { ++                    targetUniChar = missingCharMarker; ++                } +             } +             if (targetUniChar != missingCharMarker){ +                myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool)(targetUniChar>0x00FF);      +@@ -360,22 +337,22 @@ +              +                 if(isTargetUCharDBCS){ +                     if( myTargetIndex <targetLength){ +-                        myTarget[myTargetIndex++] =(char) ((targetUniChar >> 8) -0x80); ++                        myTarget[myTargetIndex++] =(char) (targetUniChar >> 8); +                         if(offsets){ +                             *(offsets++) = mySourceIndex-1; +                         } +                         if(myTargetIndex < targetLength){ +-                            myTarget[myTargetIndex++] =(char) ((targetUniChar & 0x00FF) -0x80); ++                            myTarget[myTargetIndex++] =(char) targetUniChar; +                             if(offsets){ +                                 *(offsets++) = mySourceIndex-1; +                             } +                         }else{ +-                            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80); ++                            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar; +                             *err = U_BUFFER_OVERFLOW_ERROR; +                         }  +                     }else{ +-                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) ((targetUniChar >> 8) -0x80); +-                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80); ++                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) (targetUniChar >> 8); ++                        
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar; +                         *err = U_BUFFER_OVERFLOW_ERROR; +                     } +  +@@ -524,15 +501,14 @@ +                   const USetAdder *sa, +                   UConverterUnicodeSet which, +                   UErrorCode *pErrorCode) { +-    /* the tilde '~' is hardcoded in the converter */ +-    sa->add(sa->set, 0x7e); ++    /* HZ converts all of ASCII */ ++    sa->addRange(sa->set, 0, 0x7f); +  +     /* add all of the code points that the sub-converter handles */ +-    /* ucnv_MBCSGetFilteredUnicodeSetForUnicode(((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, sa, which, UCNV_SET_FILTER_GR94DBCS, pErrorCode); */ +-    ((UConverterDataHZ*)cnv->extraInfo)-> +-        gbConverter->sharedData->impl-> +-            getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter, +-                          sa, which, pErrorCode); ++    ucnv_MBCSGetFilteredUnicodeSetForUnicode( ++        ((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, ++        sa, which, UCNV_SET_FILTER_HZ, ++        pErrorCode); + } +  + static const UConverterImpl _HZImpl={ +diff -ru icu.6001/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c +--- icu.6001/source/common/ucnvmbcs.c	2009-06-02 15:29:01.000000000 +0100 ++++ icu/source/common/ucnvmbcs.c	2009-06-02 15:35:01.000000000 +0100 +@@ -612,6 +612,19 @@ +                                 stage3+=2;  /* +=st3Multiplier */ +                             } while((++c&0xf)!=0); +                             break; ++                        case UCNV_SET_FILTER_HZ: ++                            /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). 
*/ ++                            do { ++                                if( ((st3&1)!=0 || useFallback) && ++                                    (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) && ++                                    (uint8_t)(value - 0xa1)<=(0xfe - 0xa1) ++                                ) { ++                                    sa->add(sa->set, c); ++                                } ++                                st3>>=1; ++                                stage3+=2;  /* +=st3Multiplier */ ++                            } while((++c&0xf)!=0); ++                            break; +                         default: +                             *pErrorCode=U_INTERNAL_PROGRAM_ERROR; +                             return; +diff -ru icu.6001/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h +--- icu.6001/source/common/ucnvmbcs.h	2009-06-02 15:29:01.000000000 +0100 ++++ icu/source/common/ucnvmbcs.h	2009-06-02 15:29:15.000000000 +0100 +@@ -400,6 +400,7 @@ +     UCNV_SET_FILTER_2022_CN, +     UCNV_SET_FILTER_SJIS, +     UCNV_SET_FILTER_GR94DBCS, ++    UCNV_SET_FILTER_HZ, +     UCNV_SET_FILTER_COUNT + } UConverterSetFilter; +  +diff -ru icu.6001/source/test/cintltst/ncnvtst.c icu/source/test/cintltst/ncnvtst.c +--- icu.6001/source/test/cintltst/ncnvtst.c	2009-06-02 15:28:46.000000000 +0100 ++++ icu/source/test/cintltst/ncnvtst.c	2009-06-02 15:29:15.000000000 +0100 +@@ -1928,7 +1928,7 @@ + #if !UCONFIG_NO_LEGACY_CONVERSION +         { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff }, +         { "windows-1251", 0, 0x7f, 0x410, 0x44f, 0x3000, 0xd7ff }, +-        { "HZ", 0x410, 0x44f, 0x4e00, 0x4eff, 0xac00, 0xd7ff }, ++        /* HZ test case fixed and moved to intltest's conversion.txt, ticket #6002 */ +         { "shift-jis", 0x3041, 0x3093, 0x30a1, 0x30f3, 0x900, 0x1cff } + #else +         { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff } +diff -ru icu.6001/source/test/intltest/convtest.cpp 
icu/source/test/intltest/convtest.cpp +--- icu.6001/source/test/intltest/convtest.cpp	2009-06-02 15:28:46.000000000 +0100 ++++ icu/source/test/intltest/convtest.cpp	2009-06-02 15:29:15.000000000 +0100 +@@ -527,7 +527,7 @@ +         "Shift-JIS", +         "ibm-1390",  // EBCDIC_STATEFUL table +         "ibm-16684",  // DBCS-only extension table based on EBCDIC_STATEFUL table +-        // "HZ", TODO(markus): known bug, the set incorrectly contains [\u02CA\u02CB\u02D9\u2010\u2013\u2015...] ++        "HZ", +         "ISO-2022-JP", +         "JIS7", +         "ISO-2022-CN", +diff -ru icu.6001/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt +--- icu.6001/source/test/testdata/conversion.txt	2009-06-02 15:28:46.000000000 +0100 ++++ icu/source/test/testdata/conversion.txt	2009-06-02 15:29:15.000000000 +0100 +@@ -48,6 +48,14 @@ +     toUnicode { +       Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" } +       Cases { ++        // test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e ++        { ++          "HZ", ++          :bin{ 7e7b21212120217e217f772100007e217e7d207e7e807e0a2b }, ++          "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd ~\ufffd+", ++          :intvector{ 2,4,6,8,10,12,14,18,19,21,24 }, ++          :int{1}, :int{1}, "", "?", :bin{""} ++        } +         // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and +         // using the Shift-JIS table for JIS X 0208 (ticket #5797) +         { +@@ -1244,6 +1252,14 @@ +           :int{0} +         } +  ++        // HZ ++        { ++          "HZ", ++          "[\u0410-\u044f\u4e00\u4e01\u4e03]", ++          "[\u4e02\u4e04-\u4e06\uac00-\ud7ff]", ++          :int{0} ++        } ++         +         // DBCS-only +         { +           "ibm-971", diff --git a/icu.icu6175.emptysegments.patch b/icu.icu6175.emptysegments.patch new file mode 100644 index 0000000..bb40bd5 --- 
/dev/null +++ b/icu.icu6175.emptysegments.patch @@ -0,0 +1,535 @@ +diff -ru icu.6002/source/common/ucnv2022.c icu/source/common/ucnv2022.c +--- icu.6002/source/common/ucnv2022.c	2009-06-02 15:38:08.000000000 +0100 ++++ icu/source/common/ucnv2022.c	2009-06-02 15:40:20.000000000 +0100 +@@ -201,6 +201,7 @@ + #ifdef U_ENABLE_GENERIC_ISO_2022 +     UBool isFirstBuffer; + #endif ++    UBool isEmptySegment; +     char name[30]; +     char locale[3]; + }UConverterDataISO2022; +@@ -609,6 +610,7 @@ +     if(choice<=UCNV_RESET_TO_UNICODE) { +         uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); +         myConverterData->key = 0; ++        myConverterData->isEmptySegment = FALSE; +     } +     if(choice!=UCNV_RESET_TO_UNICODE) { +         uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); +@@ -814,6 +816,7 @@ +             if(chosenConverterName == NULL) { +                 /* SS2 or SS3 */ +                 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; ++                _this->toUCallbackReason = UCNV_UNASSIGNED; +                 return; +             } +  +@@ -935,6 +938,8 @@ +     } +     if(U_SUCCESS(*err)) { +         _this->toULength = 0; ++    } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) { ++        _this->toUCallbackReason = UCNV_UNASSIGNED; +     } + } +  +@@ -1986,6 +1991,7 @@ +                     continue; +                 } else { +                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ ++                    myData->isEmptySegment = FALSE;	/* reset this, we have a different error */ +                     break; +                 } +  +@@ -1997,21 +2003,39 @@ +                     continue; +                 } else { +                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ ++                    myData->isEmptySegment = FALSE;	/* reset this, we have a different error */ +                     break; +                 } +  +             case ESC_2022: +                 mySource--; + escape: +-            
    changeState_2022(args->converter,&(mySource),  +-                    mySourceLimit, ISO_2022_JP,err); ++                { ++                    const char * mySourceBefore = mySource; ++                    int8_t toULengthBefore = args->converter->toULength; ++ ++                    changeState_2022(args->converter,&(mySource), ++                        mySourceLimit, ISO_2022_JP,err); ++ ++                    /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */ ++                    if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { ++                        *err = U_ILLEGAL_ESCAPE_SEQUENCE; ++                        args->converter->toUCallbackReason = UCNV_IRREGULAR; ++                        args->converter->toULength = toULengthBefore + (mySource - mySourceBefore); ++                    } ++                } +  +                 /* invalid or illegal escape sequence */ +                 if(U_FAILURE(*err)){ +                     args->target = myTarget; +                     args->source = mySource; ++                    myData->isEmptySegment = FALSE;	/* Reset to avoid future spurious errors */ +                     return; +                 } ++                /* If we successfully completed an escape sequence, we begin a new segment, empty so far */ ++                if(myData->key==0) { ++                    myData->isEmptySegment = TRUE; ++                } +                 continue; +  +             /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */ +@@ -2028,6 +2052,7 @@ +                 /* falls through */ +             default: +                 /* convert one or two bytes */ ++                myData->isEmptySegment = FALSE; +                 cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; +                 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 && +                     !IS_JP_DBCS(cs) +@@ 
-2524,15 +2549,27 @@ +  +             if(mySourceChar==UCNV_SI){ +                 myData->toU2022State.g = 0; ++                if (myData->isEmptySegment) { ++                    myData->isEmptySegment = FALSE;	/* we are handling it, reset to avoid future spurious errors */ ++                    *err = U_ILLEGAL_ESCAPE_SEQUENCE; ++                    args->converter->toUCallbackReason = UCNV_IRREGULAR; ++                    args->converter->toUBytes[0] = mySourceChar; ++                    args->converter->toULength = 1; ++                    args->target = myTarget; ++                    args->source = mySource; ++                    return; ++                } +                 /*consume the source */ +                 continue; +             }else if(mySourceChar==UCNV_SO){ +                 myData->toU2022State.g = 1; ++                myData->isEmptySegment = TRUE;	/* Begin a new segment, empty so far */ +                 /*consume the source */ +                 continue; +             }else if(mySourceChar==ESC_2022){ +                 mySource--; + escape: ++                myData->isEmptySegment = FALSE;	/* Any invalid ESC sequences will be detected separately, so just reset this */ +                 changeState_2022(args->converter,&(mySource),  +                                 mySourceLimit, ISO_2022_KR, err); +                 if(U_FAILURE(*err)){ +@@ -2543,6 +2580,7 @@ +                 continue; +             }    +  ++            myData->isEmptySegment = FALSE;	/* Any invalid char errors will be detected separately, so just reset this */ +             if(myData->toU2022State.g == 1) { +                 if(mySource < mySourceLimit) { +                     char trailByte; +@@ -3075,27 +3113,52 @@ +             switch(mySourceChar){ +             case UCNV_SI: +                 pToU2022State->g=0; ++                if (myData->isEmptySegment) { ++                    myData->isEmptySegment = FALSE;	/* we are handling it, reset to avoid future spurious 
errors */ ++                    *err = U_ILLEGAL_ESCAPE_SEQUENCE; ++                    args->converter->toUCallbackReason = UCNV_IRREGULAR; ++                    args->converter->toUBytes[0] = mySourceChar; ++                    args->converter->toULength = 1; ++                    args->target = myTarget; ++                    args->source = mySource; ++                    return; ++                } +                 continue; +  +             case UCNV_SO: +                 if(pToU2022State->cs[1] != 0) { +                     pToU2022State->g=1; ++                    myData->isEmptySegment = TRUE;	/* Begin a new segment, empty so far */ +                     continue; +                 } else { +                     /* illegal to have SO before a matching designator */ ++                    myData->isEmptySegment = FALSE;	/* Handling a different error, reset this to avoid future spurious errs */ +                     break; +                 } +  +             case ESC_2022: +                 mySource--; + escape: +-                changeState_2022(args->converter,&(mySource),  +-                    mySourceLimit, ISO_2022_CN,err); ++                { ++                    const char * mySourceBefore = mySource; ++                    int8_t toULengthBefore = args->converter->toULength; ++ ++                    changeState_2022(args->converter,&(mySource), ++                        mySourceLimit, ISO_2022_CN,err); ++ ++                    /* After SO there must be at least one character before a designator (designator error handled separately) */ ++                    if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { ++                        *err = U_ILLEGAL_ESCAPE_SEQUENCE; ++                        args->converter->toUCallbackReason = UCNV_IRREGULAR; ++                        args->converter->toULength = toULengthBefore + (mySource - mySourceBefore); ++                    } ++                } +  +                 /* invalid or illegal escape 
sequence */ +                 if(U_FAILURE(*err)){ +                     args->target = myTarget; +                     args->source = mySource; ++                    myData->isEmptySegment = FALSE;	/* Reset to avoid future spurious errors */ +                     return; +                 } +                 continue; +@@ -3109,6 +3172,7 @@ +                 /* falls through */ +             default: +                 /* convert one or two bytes */ ++                myData->isEmptySegment = FALSE; +                 if(pToU2022State->g != 0) { +                     if(mySource < mySourceLimit) { +                         UConverterSharedData *cnv; +diff -ru icu.6002/source/common/ucnv_bld.c icu/source/common/ucnv_bld.c +--- icu.6002/source/common/ucnv_bld.c	2009-06-02 15:38:05.000000000 +0100 ++++ icu/source/common/ucnv_bld.c	2009-06-02 15:38:31.000000000 +0100 +@@ -914,6 +914,7 @@ +     myUConverter->subCharLen = mySharedConverterData->staticData->subCharLen; +     myUConverter->subChars = (uint8_t *)myUConverter->subUChars; +     uprv_memcpy(myUConverter->subChars, mySharedConverterData->staticData->subChar, myUConverter->subCharLen); ++    myUConverter->toUCallbackReason = UCNV_ILLEGAL; /* default reason to invoke (*fromCharErrorBehaviour) */ +  +     if(mySharedConverterData->impl->open != NULL) { +         mySharedConverterData->impl->open(myUConverter, realName, locale, options, err); +diff -ru icu.6002/source/common/ucnv_bld.h icu/source/common/ucnv_bld.h +--- icu.6002/source/common/ucnv_bld.h	2009-06-02 15:38:08.000000000 +0100 ++++ icu/source/common/ucnv_bld.h	2009-06-02 15:38:31.000000000 +0100 +@@ -226,6 +226,9 @@ +     char preToU[UCNV_EXT_MAX_BYTES]; +     int8_t preFromULength, preToULength;    /* negative: replay */ +     int8_t preToUFirstLength;               /* length of first character */ ++ ++    /* new fields for ICU 4.0 */ ++    UConverterCallbackReason toUCallbackReason; /* (*fromCharErrorBehaviour) reason, set when error is detected */ + }; 
+  + U_CDECL_END /* end of UConverter */ +diff -ru icu.6002/source/common/ucnv.c icu/source/common/ucnv.c +--- icu.6002/source/common/ucnv.c	2009-06-02 15:38:05.000000000 +0100 ++++ icu/source/common/ucnv.c	2009-06-02 15:38:31.000000000 +0100 +@@ -1473,11 +1473,14 @@ +             cnv->toULength=0; +  +             /* call the callback function */ ++            if(cnv->toUCallbackReason==UCNV_ILLEGAL && *err==U_INVALID_CHAR_FOUND) { ++                cnv->toUCallbackReason = UCNV_UNASSIGNED; ++            } +             cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, +                 cnv->invalidCharBuffer, errorInputLength, +-                (*err==U_INVALID_CHAR_FOUND || *err==U_UNSUPPORTED_ESCAPE_SEQUENCE) ? +-                    UCNV_UNASSIGNED : UCNV_ILLEGAL, ++                cnv->toUCallbackReason, +                 err); ++            cnv->toUCallbackReason = UCNV_ILLEGAL; /* reset to default value */ +  +             /* +              * loop back to the offset handling +diff -ru icu.6002/source/common/ucnvhz.c icu/source/common/ucnvhz.c +--- icu.6002/source/common/ucnvhz.c	2009-06-02 15:38:08.000000000 +0100 ++++ icu/source/common/ucnvhz.c	2009-06-02 15:38:31.000000000 +0100 +@@ -59,6 +59,7 @@ +     UBool isEscapeAppended; +     UBool isStateDBCS; +     UBool isTargetUCharDBCS; ++    UBool isEmptySegment; + }UConverterDataHZ; +  +  +@@ -98,6 +99,7 @@ +         cnv->mode=0; +         if(cnv->extraInfo != NULL){ +             ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE; ++            ((UConverterDataHZ*)cnv->extraInfo)->isEmptySegment = FALSE; +         } +     } +     if(choice!=UCNV_RESET_TO_UNICODE) { +@@ -130,6 +132,10 @@ + *   from-GB code '~}' ($7E7D) is outside the defined GB range.) + * + *   Source: RFC 1842 ++* ++*   Note that the formal syntax in RFC 1842 is invalid. 
I assume that the ++*   intended definition of single-byte-segment is as follows (pedberg): ++*   single-byte-segment = single-byte-seq 1*single-byte-char + */ +  +  +@@ -168,12 +174,23 @@ +                         args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2); +                     } +                     *(myTarget++)=(UChar)mySourceChar; ++                    myData->isEmptySegment = FALSE; +                     continue; +                 case UCNV_OPEN_BRACE: +-                    myData->isStateDBCS = TRUE; +-                    continue; +                 case UCNV_CLOSE_BRACE: +-                    myData->isStateDBCS = FALSE; ++                    myData->isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE); ++                    if (myData->isEmptySegment) { ++                        myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ ++                        *err = U_ILLEGAL_ESCAPE_SEQUENCE; ++                        args->converter->toUCallbackReason = UCNV_IRREGULAR; ++                        args->converter->toUBytes[0] = UCNV_TILDE; ++                        args->converter->toUBytes[1] = mySourceChar; ++                        args->converter->toULength = 2; ++                        args->target = myTarget; ++                        args->source = mySource; ++                        return; ++                    } ++                    myData->isEmptySegment = TRUE; +                     continue; +                 default: +                      /* if the first byte is equal to TILDE and the trail byte +@@ -181,6 +198,7 @@ +                      */ +                     mySourceChar = 0x7e00 | mySourceChar; +                     targetUniChar = 0xffff; ++                    myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */ +                     break; +                 } +             } else if(myData->isStateDBCS) { +@@ 
-191,6 +209,7 @@ +                     } else { +                         /* add another bit to distinguish a 0 byte from not having seen a lead byte */ +                         args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100); ++                        myData->isEmptySegment = FALSE; /* the segment has something, either valid or will produce a different error, so reset this */ +                     } +                     continue; +                 } +@@ -218,8 +237,10 @@ +                     continue; +                 } else if(mySourceChar <= 0x7f) { +                     targetUniChar = (UChar)mySourceChar;  /* ASCII */ ++                    myData->isEmptySegment = FALSE; /* the segment has something valid */ +                 } else { +                     targetUniChar = 0xffff; ++                    myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */ +                 } +             } +             if(targetUniChar < 0xfffe){ +diff -ru icu.6002/source/test/cintltst/nucnvtst.c icu/source/test/cintltst/nucnvtst.c +--- icu.6002/source/test/cintltst/nucnvtst.c	2009-06-02 15:37:53.000000000 +0100 ++++ icu/source/test/cintltst/nucnvtst.c	2009-06-02 15:40:52.000000000 +0100 +@@ -81,6 +81,7 @@ + static void TestJitterbug2411(void); + #endif +  ++static void TestJitterbug6175(void); + static void TestRoundTrippingAllUTF(void); + static void TestConv(const uint16_t in[], +                      int len, +@@ -294,6 +295,7 @@ + #if !UCONFIG_NO_LEGACY_CONVERSION +    addTest(root, &TestJitterbug2346, "tsconv/nucnvtst/TestJitterbug2346"); +    addTest(root, &TestJitterbug2411, "tsconv/nucnvtst/TestJitterbug2411"); ++   addTest(root, &TestJitterbug6175, "tsconv/nucnvtst/TestJitterbug6175"); + #endif +  + } +@@ -4454,6 +4456,70 @@ +     free(offsets); + } +  ++/* Tests for empty segments in ISO-2022-JP/KR/CN, HZ, check that UConverterCallbackReason is UCNV_IRREGULAR */ ++typedef struct { ++    const 
char *    converterName; ++    const char *    inputText; ++    int             inputTextLength; ++} EmptySegmentTest; ++ ++/* Callback for TestJitterbug6175, should only get called for empty segment errors */ ++static void UCNV_TO_U_CALLBACK_EMPTYSEGMENT( const void *context, UConverterToUnicodeArgs *toArgs, const char* codeUnits, ++                                             int32_t length, UConverterCallbackReason reason, UErrorCode * err ) { ++    if (reason > UCNV_IRREGULAR) { ++        return; ++    } ++    if (reason != UCNV_IRREGULAR) { ++        log_err("toUnicode callback invoked for empty segment but reason is not UCNV_IRREGULAR\n"); ++    } ++    /* Standard stuff below from UCNV_TO_U_CALLBACK_SUBSTITUTE */ ++    *err = U_ZERO_ERROR; ++    ucnv_cbToUWriteSub(toArgs,0,err); ++} ++ ++enum { kEmptySegmentToUCharsMax = 64 }; ++static void TestJitterbug6175(void) { ++    static const char  iso2022jp_a[] = { 0x61, 0x62, 0x1B,0x24,0x42, 0x1B,0x28,0x42, 0x63, 0x64, 0x0D, 0x0A }; ++    static const char  iso2022kr_a[] = { 0x1B,0x24,0x29,0x43, 0x61, 0x0E, 0x0F, 0x62, 0x0D, 0x0A }; ++    static const char  iso2022cn_a[] = { 0x61, 0x1B,0x24,0x29,0x41, 0x62, 0x0E, 0x0F, 0x1B,0x24,0x2A,0x48, 0x1B,0x4E, 0x6A,0x65, 0x63, 0x0D, 0x0A }; ++    static const char  iso2022cn_b[] = { 0x61, 0x1B,0x24,0x29,0x41, 0x62, 0x0E, 0x1B,0x24,0x29,0x47, 0x68,0x64, 0x0F, 0x63, 0x0D, 0x0A }; ++    static const char  hzGB2312_a[]  = { 0x61, 0x62, 0x7E,0x7B, 0x7E,0x7D, 0x63, 0x64 }; ++    static const EmptySegmentTest emptySegmentTests[] = { ++        /* converterName inputText    inputTextLength */ ++        { "ISO-2022-JP", iso2022jp_a, sizeof(iso2022jp_a) }, ++        { "ISO-2022-KR", iso2022kr_a, sizeof(iso2022kr_a) }, ++        { "ISO-2022-CN", iso2022cn_a, sizeof(iso2022cn_a) }, ++        { "ISO-2022-CN", iso2022cn_b, sizeof(iso2022cn_b) }, ++        { "HZ-GB-2312",  hzGB2312_a,  sizeof(hzGB2312_a)  }, ++        /* terminator: */ ++        { NULL,          NULL,        0,             
     } ++    }; ++    const EmptySegmentTest * testPtr; ++    for (testPtr = emptySegmentTests; testPtr->converterName != NULL; ++testPtr) { ++        UErrorCode   err = U_ZERO_ERROR; ++        UConverter * cnv = ucnv_open(testPtr->converterName, &err); ++        if (U_FAILURE(err)) { ++            log_data_err("Unable to open %s converter: %s\n", testPtr->converterName, u_errorName(err)); ++            return; ++        } ++        ucnv_setToUCallBack(cnv, UCNV_TO_U_CALLBACK_EMPTYSEGMENT, NULL, NULL, NULL, &err); ++        if (U_FAILURE(err)) { ++            log_data_err("Unable to setToUCallBack for %s converter: %s\n", testPtr->converterName, u_errorName(err)); ++            ucnv_close(cnv); ++            return; ++        } ++        { ++            UChar         toUChars[kEmptySegmentToUCharsMax]; ++            UChar *       toUCharsPtr = toUChars; ++            const UChar * toUCharsLimit = toUCharsPtr + kEmptySegmentToUCharsMax; ++            const char *  inCharsPtr = testPtr->inputText; ++            const char *  inCharsLimit = inCharsPtr + testPtr->inputTextLength; ++            ucnv_toUnicode(cnv, &toUCharsPtr, toUCharsLimit, &inCharsPtr, inCharsLimit, NULL, TRUE, &err); ++        } ++        ucnv_close(cnv); ++    } ++} ++ + static void + TestEBCDIC_STATEFUL() { +     /* test input */ +diff -ru icu.6002/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt +--- icu.6002/source/test/testdata/conversion.txt	2009-06-02 15:37:54.000000000 +0100 ++++ icu/source/test/testdata/conversion.txt	2009-06-02 15:40:52.000000000 +0100 +@@ -199,6 +199,21 @@ +           :intvector{ 0, 5, 7, 9, 9, 9, 9, 9, 9, 9, 9, 12 }, +           :int{1}, :int{1}, "", "&", :bin{""} +         } ++        // empty segment (using substitution and stop) ++        { ++          "ISO-2022-KR", ++          :bin{ 1b242943610e0f620d0a }, ++          "a\uFFFDb\u000D\u000A", ++          :intvector{ 4, 6, 7, 8, 9 }, ++          :int{1}, :int{1}, "", "?", :bin{""} ++        
} ++        { ++          "ISO-2022-KR", ++          :bin{ 1b242943610e0f620d0a }, ++          "a", ++          :intvector{ 4 }, ++          :int{1}, :int{1}, "illesc", ".", :bin{"0f"} ++        } +  +         // ISO-2022-JP +  +@@ -249,6 +264,21 @@ +           :bin{ 41c15c1b284a5cc242 }, "A\uff81\\\xa5\uff82B", :intvector{ 0, 1, 2, 6, 7, 8 }, +           :int{1}, :int{1}, "", ".", :bin{""} +         } ++        // empty segment (using substitution and stop) ++        { ++          "ISO-2022-JP", ++          :bin{ 61621b24421b284263640d0a }, ++          "ab\uFFFDcd\u000D\u000A", ++          :intvector{ 0, 1, 5, 8, 9, 10, 11 }, ++          :int{1}, :int{1}, "", "?", :bin{""} ++        } ++        { ++          "ISO-2022-JP", ++          :bin{ 61621b24421b284263640d0a }, ++          "ab", ++          :intvector{ 0, 1 }, ++          :int{1}, :int{1}, "illesc", ".", :bin{"1b2842"} ++        } +  +         // ISO-2022-CN +  +@@ -319,6 +349,36 @@ +           :bin{ 411b242b491b4f2121 }, "\x41", :intvector{ 0 }, +           :int{1}, :int{1}, "unsuppesc", ".", :bin{ 1b242b49 } +         } ++        // empty segment 1 (using substitution and stop) ++        { ++          "ISO-2022-CN", ++          :bin{ 611b242941620e0f1b242a481b4e6a65630d0a }, ++          "ab\uFFFD\u994Cc\u000D\u000A", ++          :intvector{ 0, 5, 7, 14, 16, 17, 18 }, ++          :int{1}, :int{1}, "", "?", :bin{""} ++        } ++        { ++          "ISO-2022-CN", ++          :bin{ 611b242941620e0f1b242a481b4e6a65630d0a }, ++          "ab", ++          :intvector{ 0, 5 }, ++          :int{1}, :int{1}, "illesc", ".", :bin{"0f"} ++        } ++        // empty segment 2 (using substitution and stop) ++        { ++          "ISO-2022-CN", ++          :bin{ 611b242941620e1b24294768640f630d0a }, ++          "ab\uFFFD\u5F70c\u000D\u000A", ++          :intvector{ 0, 5, 7, 11, 14, 15, 16 }, ++          :int{1}, :int{1}, "", "?", :bin{""} ++        } ++        { ++          "ISO-2022-CN", ++          :bin{ 
611b242941620e1b24294768640f630d0a }, ++          "ab", ++          :intvector{ 0, 5 }, ++          :int{1}, :int{1}, "illesc", ".", :bin{"1b242947"} ++        } +  +         // ISO-2022 SBCS +         // [U_ENABLE_GENERIC_ISO_2022] +@@ -333,6 +393,39 @@ +         //  :int{1}, :int{1}, "", ".", :bin{""} +         //} +  ++        // HZ-GB-2312 ++ ++        // empty segment 1 (using substitution and stop) ++        { ++          "HZ-GB-2312", ++          :bin{ 61627e7b7e7d6364 }, ++          "ab\uFFFDcd", ++          :intvector{ 0, 1, 4, 6, 7 }, ++          :int{1}, :int{1}, "", "?", :bin{""} ++        } ++        { ++          "HZ-GB-2312", ++          :bin{ 61627e7b7e7d63640d0a }, ++          "ab", ++          :intvector{ 0, 1 }, ++          :int{1}, :int{1}, "illesc", ".", :bin{"7e7d"} ++        } ++        // empty segment 2 & legal redundant switches (using substitution and stop) ++        { ++          "HZ-GB-2312", ++          :bin{ 61627e7b323b3f557e7b7e7b523b7e7d63647e7d65667e7d7e7d }, ++          "ab\u4E0D\u7A7A\uFFFD\u4E00cdef\uFFFD", ++          :intvector{ 0, 1, 4, 6, 10, 12, 16, 17, 20, 21, 24 }, ++          :int{1}, :int{1}, "", "?", :bin{""} ++        } ++        { ++          "HZ-GB-2312", ++          :bin{ 61627e7b323b3f557e7b7e7b523b7e7d63647e7d65667e7d7e7d }, ++          "ab\u4E0D\u7A7A", ++          :intvector{ 0, 1, 4, 6 }, ++          :int{1}, :int{1}, "illesc", ".", :bin{"7e7b"} ++        } ++ +         // DBCS-only extensions +         { +           "ibm-970", diff --git a/icu.icuXXXX.malayalam.bysyllable.patch b/icu.icuXXXX.malayalam.bysyllable.patch new file mode 100644 index 0000000..d0cd1b1 --- /dev/null +++ b/icu.icuXXXX.malayalam.bysyllable.patch @@ -0,0 +1,250 @@ +diff -ruN icu.orig/source/layout/IndicReordering.h icu/source/layout/IndicReordering.h +--- icu.orig/source/layout/IndicReordering.h	2007-04-27 10:28:22.000000000 +0100 ++++ icu/source/layout/IndicReordering.h	2007-04-27 10:39:22.000000000 +0100 +@@ -142,6 +142,7 @@ +     // 
do not instantiate +     IndicReordering(); +  ++public: +     static le_int32 findSyllable(const IndicClassTable *classTable, const LEUnicode *chars, le_int32 prev, le_int32 charCount); +  + }; +diff -ruN icu.orig/source/layout/LayoutEngine.cpp icu/source/layout/LayoutEngine.cpp +--- icu.orig/source/layout/LayoutEngine.cpp	2007-04-27 10:28:22.000000000 +0100 ++++ icu/source/layout/LayoutEngine.cpp	2007-04-27 10:39:22.000000000 +0100 +@@ -14,6 +14,7 @@ + #include "CanonShaping.h" + #include "HanLayoutEngine.h" + #include "HangulLayoutEngine.h" ++#include "MalayalamLayoutEngine.h" + #include "IndicLayoutEngine.h" + #include "KhmerLayoutEngine.h" + #include "ThaiLayoutEngine.h" +@@ -451,11 +452,13 @@ +  +     if (gsubTable != NULL && gsubTable->coversScript(scriptTag = OpenTypeLayoutEngine::getScriptTag(scriptCode))) { +         switch (scriptCode) { ++        case mlymScriptCode: ++            result = new MalayalamOpenTypeLayoutEngine(fontInstance, scriptCode, languageCode, typoFlags, gsubTable); ++	    break; +         case bengScriptCode: +         case devaScriptCode: +         case gujrScriptCode: +         case kndaScriptCode: +-        case mlymScriptCode: +         case oryaScriptCode: +         case guruScriptCode: +         case tamlScriptCode: +@@ -512,11 +515,13 @@ +             result = new GXLayoutEngine(fontInstance, scriptCode, languageCode, morphTable); +         } else { +             switch (scriptCode) { ++            case mlymScriptCode: ++                result = new MalayalamOpenTypeLayoutEngine(fontInstance, scriptCode, languageCode, typoFlags); ++	        break; +             case bengScriptCode: +             case devaScriptCode: +             case gujrScriptCode: +             case kndaScriptCode: +-            case mlymScriptCode: +             case oryaScriptCode: +             case guruScriptCode: +             case tamlScriptCode: +diff -ruN icu.orig/source/layout/LEGlyphStorage.h icu/source/layout/LEGlyphStorage.h +--- 
icu.orig/source/layout/LEGlyphStorage.h	2007-04-27 10:28:22.000000000 +0100 ++++ icu/source/layout/LEGlyphStorage.h	2007-04-27 10:43:54.000000000 +0100 +@@ -413,6 +413,8 @@ +      */ +     void adoptGlyphArray(LEGlyphStorage &from); +  ++    void appendGlyphStorage(LEGlyphStorage &from); ++ +     /** +      * Delete the char indices array and replace it with the one +      * in <code>from</code>. Set the char indices array pointer +diff -ruN icu.orig/source/layout/Makefile.in icu/source/layout/Makefile.in +--- icu.orig/source/layout/Makefile.in	2007-04-27 10:28:22.000000000 +0100 ++++ icu/source/layout/Makefile.in	2007-04-27 10:39:22.000000000 +0100 +@@ -66,6 +66,7 @@ + ArabicLayoutEngine.o \ + GXLayoutEngine.o \ + HanLayoutEngine.o \ ++MalayalamLayoutEngine.o \ + IndicLayoutEngine.o \ + LayoutEngine.o \ + ContextualGlyphSubstProc.o \ +diff -ruN icu.orig/source/layout/MalayalamLayoutEngine.cpp icu/source/layout/MalayalamLayoutEngine.cpp +--- icu.orig/source/layout/MalayalamLayoutEngine.cpp	1970-01-01 01:00:00.000000000 +0100 ++++ icu/source/layout/MalayalamLayoutEngine.cpp	2007-04-27 10:44:26.000000000 +0100 +@@ -0,0 +1,126 @@ ++ ++/* ++ * ++ * (C) Copyright IBM Corp. 
1998-2005 - All Rights Reserved ++ * ++ */ ++ ++#include "LETypes.h" ++#include "LayoutEngine.h" ++#include "OpenTypeLayoutEngine.h" ++#include "MalayalamLayoutEngine.h" ++#include "ScriptAndLanguageTags.h" ++ ++#include "GlyphSubstitutionTables.h" ++#include "GlyphDefinitionTables.h" ++#include "GlyphPositioningTables.h" ++ ++#include "GDEFMarkFilter.h" ++#include "LEGlyphStorage.h" ++ ++#include "IndicReordering.h" ++ ++#include <stdio.h> ++ ++U_NAMESPACE_BEGIN ++ ++UOBJECT_DEFINE_RTTI_IMPLEMENTATION(MalayalamOpenTypeLayoutEngine) ++ ++void LEGlyphStorage::appendGlyphStorage(LEGlyphStorage &from) ++{ ++    if (fInsertionList) applyInsertions(); ++    if (from.fInsertionList) from.applyInsertions(); ++    if ((!fInsertionList) && (from.fInsertionList)) ++    { ++        fInsertionList = from.fInsertionList; ++        from.fInsertionList = NULL; ++    } ++ ++    if (!from.fGlyphCount) ++        return; ++ ++    le_int32 newGlyphCount = fGlyphCount + from.fGlyphCount; ++ ++    fGlyphs = (LEGlyphID*)LE_GROW_ARRAY(fGlyphs, newGlyphCount); ++    LE_ARRAY_COPY(fGlyphs+fGlyphCount, from.fGlyphs, from.fGlyphCount); ++ ++    le_int32 nLargestIndex = 0; ++    if (fGlyphCount) ++    { ++        for (le_int32 i = 0; i < fGlyphCount; ++i) ++        { ++            if (fCharIndices[i] > nLargestIndex) ++                nLargestIndex = fCharIndices[i]; ++        } ++        nLargestIndex+=1; ++    } ++    fCharIndices = (le_int32 *)LE_GROW_ARRAY(fCharIndices, newGlyphCount); ++    for (le_int32 i = 0; i < from.fGlyphCount; ++i) ++        fCharIndices[fGlyphCount+i] = from.fCharIndices[i] + nLargestIndex; ++ ++    fAuxData = (le_uint32 *)LE_GROW_ARRAY(fAuxData, newGlyphCount); ++    LE_ARRAY_COPY(fAuxData+fGlyphCount, from.fAuxData, from.fGlyphCount); ++ ++    fGlyphCount = newGlyphCount; ++} ++ ++le_int32 MalayalamOpenTypeLayoutEngine::glyphPostProcessing(LEGlyphStorage &tempGlyphStorage, LEGlyphStorage &glyphStorage, LEErrorCode &success) ++{ ++    if (LE_FAILURE(success)) { ++ 
       return 0; ++    } ++ ++    glyphStorage.appendGlyphStorage(tempGlyphStorage); ++ ++    return glyphStorage.getGlyphCount(); ++} ++ ++ ++le_int32 MalayalamOpenTypeLayoutEngine::computeGlyphs(const LEUnicode chars[], le_int32 offset, le_int32 count, le_int32 max, le_bool rightToLeft, LEGlyphStorage &glyphStorage, LEErrorCode &success) ++{ ++    if (LE_FAILURE(success)) { ++        return 0; ++    } ++ ++    if (chars == NULL || offset < 0 || count < 0 || max < 0 || offset >= max || offset + count > max) { ++        success = LE_ILLEGAL_ARGUMENT_ERROR; ++        return 0; ++    } ++ ++    le_int32 outGlyphCount=0; ++ ++    const IndicClassTable *classTable = IndicClassTable::getScriptClassTable(fScriptCode); ++    le_int32 prev = 0; ++    while (prev < count) ++    {  ++        le_int32 outCharCount=0, fakeGlyphCount=0; ++        LEUnicode *outChars = NULL; ++        LEGlyphStorage fakeGlyphStorage; ++ ++        le_int32 syllable = IndicReordering::findSyllable(classTable, chars+offset, prev, count); ++        outCharCount = characterProcessing(chars+prev, offset, syllable-prev, max, rightToLeft, outChars, fakeGlyphStorage, success); ++ ++        if (LE_FAILURE(success)) { ++            return 0; ++        } ++ ++        if (outChars != NULL) { ++            fakeGlyphCount = glyphProcessing(outChars, 0, outCharCount, outCharCount, rightToLeft, fakeGlyphStorage, success); ++            LE_DELETE_ARRAY(outChars); // FIXME: a subclass may have allocated this, in which case this delete might not work... 
++        } else { ++            fakeGlyphCount = glyphProcessing(chars+prev, offset, syllable-prev, max, rightToLeft, fakeGlyphStorage, success); ++        } ++ ++        if (LE_FAILURE(success)) { ++            return 0; ++        } ++ ++        outGlyphCount = glyphPostProcessing(fakeGlyphStorage, glyphStorage, success); ++ ++        prev = syllable; ++    } ++ ++    return outGlyphCount; ++} ++ ++U_NAMESPACE_END +diff -ruN icu.orig/source/layout/MalayalamLayoutEngine.h icu/source/layout/MalayalamLayoutEngine.h +--- icu.orig/source/layout/MalayalamLayoutEngine.h	1970-01-01 01:00:00.000000000 +0100 ++++ icu/source/layout/MalayalamLayoutEngine.h	2007-04-27 10:39:52.000000000 +0100 +@@ -0,0 +1,41 @@ ++ ++/* ++ * ++ * (C) Copyright IBM Corp. 1998-2005 - All Rights Reserved ++ * ++ */ ++ ++#ifndef __MALAYALAMLAYOUTENGINE_H ++#define __MALAYALAMLAYOUTENGINE_H ++ ++#include "IndicLayoutEngine.h" ++ ++U_NAMESPACE_BEGIN ++ ++class MalayalamOpenTypeLayoutEngine : public IndicOpenTypeLayoutEngine ++{ ++public: ++    MalayalamOpenTypeLayoutEngine(const LEFontInstance *fontInstance, le_int32 scriptCode, le_int32 languageCode, ++                            le_int32 typoFlags, const GlyphSubstitutionTableHeader *gsubTable) : ++        IndicOpenTypeLayoutEngine(fontInstance, scriptCode, languageCode, typoFlags, gsubTable) ++ ++    {} ++ ++    MalayalamOpenTypeLayoutEngine(const LEFontInstance *fontInstance, le_int32 scriptCode, le_int32 languageCode, ++			      le_int32 typoFlags) : ++        IndicOpenTypeLayoutEngine(fontInstance, scriptCode, languageCode, typoFlags) ++ ++    {} ++ ++    virtual UClassID getDynamicClassID() const; ++    static UClassID getStaticClassID(); ++ ++protected: ++    virtual le_int32 glyphPostProcessing(LEGlyphStorage &tempGlyphStorage, LEGlyphStorage &glyphStorage, LEErrorCode &success); ++ ++    virtual le_int32 computeGlyphs(const LEUnicode chars[], le_int32 offset, le_int32 count, le_int32 max, le_bool rightToLeft, LEGlyphStorage &glyphStorage, 
LEErrorCode &success); ++}; ++ ++U_NAMESPACE_END ++#endif ++ diff --git a/icu.icuXXXX.rollbackabi.patch b/icu.icuXXXX.rollbackabi.patch new file mode 100644 index 0000000..038d4b6 --- /dev/null +++ b/icu.icuXXXX.rollbackabi.patch @@ -0,0 +1,131 @@ +diff -ru icu.5691/source/common/ucnv2022.c icu/source/common/ucnv2022.c +--- icu.5691/source/common/ucnv2022.c	2009-06-02 16:07:36.000000000 +0100 ++++ icu/source/common/ucnv2022.c	2009-06-02 16:21:56.000000000 +0100 +@@ -3566,7 +3566,7 @@ +             /* include ASCII for JP */ +             sa->addRange(sa->set, 0, 0x7f); +         } +-        if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { ++        if(cnvData->version==3 || cnvData->version==4) { +             /* +              * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0 +              * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8) +diff -ru icu.5691/source/common/ucnv_ext.c icu/source/common/ucnv_ext.c +--- icu.5691/source/common/ucnv_ext.c	2009-06-02 16:07:36.000000000 +0100 ++++ icu/source/common/ucnv_ext.c	2009-06-02 16:23:12.000000000 +0100 +@@ -1031,7 +1031,7 @@ +  +     stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]; +  +-    useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); ++    useFallback=(UBool)(FALSE); +  +     /* enumerate the from-Unicode trie table */ +     c=0; /* keep track of the current code point while enumerating */ +diff -ru icu.5691/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c +--- icu.5691/source/common/ucnvmbcs.c	2009-06-02 16:07:36.000000000 +0100 ++++ icu/source/common/ucnvmbcs.c	2009-06-02 16:23:50.000000000 +0100 +@@ -340,7 +340,7 @@ +  + /* Miscellaneous ------------------------------------------------------------ */ +  +-#if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. 
*/ ++/* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ +  + /* similar to ucnv_MBCSGetNextUChar() but recursive */ + static void +@@ -434,8 +434,6 @@ +         pErrorCode); + } +  +-#endif +- + U_CFUNC void + ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, +                                          const USetAdder *sa, +@@ -511,7 +509,7 @@ +  +         bytes=mbcsTable->fromUnicodeBytes; +  +-        useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); ++        useFallback=(UBool)(FALSE); +  +         switch(mbcsTable->outputType) { +         case MBCS_OUTPUT_3: +diff -ru icu.5691/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h +--- icu.5691/source/common/ucnvmbcs.h	2009-06-02 16:07:36.000000000 +0100 ++++ icu/source/common/ucnvmbcs.h	2009-06-02 16:23:50.000000000 +0100 +@@ -363,7 +363,8 @@ + ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, +                           UErrorCode *pErrorCode); +  +-#if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ ++/* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ ++ + /* +  * Internal function returning a UnicodeSet for toUnicode() conversion. +  * Currently only used for ISO-2022-CN, and only handles roundtrip mappings. +@@ -378,7 +379,6 @@ +                            UConverterUnicodeSet which, +                            uint8_t state, int32_t lowByte, int32_t highByte, +                            UErrorCode *pErrorCode); +-#endif +  + /* +  * Internal function returning a UnicodeSet for toUnicode() conversion. 
+diff -ru icu.5691/source/common/unicode/ucnv.h icu/source/common/unicode/ucnv.h +--- icu.5691/source/common/unicode/ucnv.h	2009-06-02 16:07:32.000000000 +0100 ++++ icu/source/common/unicode/ucnv.h	2009-06-02 16:20:18.000000000 +0100 +@@ -870,8 +870,6 @@ + typedef enum UConverterUnicodeSet { +     /** Select the set of roundtrippable Unicode code points. @stable ICU 2.6 */ +     UCNV_ROUNDTRIP_SET, +-    /** Select the set of Unicode code points with roundtrip or fallback mappings. @draft ICU 4.0 */ +-    UCNV_ROUNDTRIP_AND_FALLBACK_SET, +     /** Number of UConverterUnicodeSet selectors. @stable ICU 2.6 */ +     UCNV_SET_COUNT + } UConverterUnicodeSet; +@@ -880,16 +878,11 @@ + /** +  * Returns the set of Unicode code points that can be converted by an ICU converter. +  * +- * Returns one of several kinds of set: +- * +- * 1. UCNV_ROUNDTRIP_SET +- * ++ * The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET): +  * The set of all Unicode code points that can be roundtrip-converted +- * (converted without any data loss) with the converter (ucnv_fromUnicode()). ++ * (converted without any data loss) with the converter. +  * This set will not include code points that have fallback mappings +  * or are only the result of reverse fallback mappings. +- * This set will also not include PUA code points with fallbacks, although +- * ucnv_fromUnicode() will always uses those mappings despite ucnv_setFallback(). +  * See UTR #22 "Character Mapping Markup Language" +  * at http://www.unicode.org/reports/tr22/ +  * +@@ -900,12 +893,6 @@ +  *   by comparing its roundtrip set with the set of ExemplarCharacters from +  *   ICU's locale data or other sources +  * +- * 2. UCNV_ROUNDTRIP_AND_FALLBACK_SET +- * +- * The set of all Unicode code points that can be converted with the converter (ucnv_fromUnicode()) +- * when fallbacks are turned on (see ucnv_setFallback()). 
+- * This set includes all code points with roundtrips and fallbacks (but not reverse fallbacks). +- * +  * In the future, there may be more UConverterUnicodeSet choices to select +  * sets with different properties. +  * +diff -ru icu.5691/source/test/intltest/convtest.cpp icu/source/test/intltest/convtest.cpp +--- icu.5691/source/test/intltest/convtest.cpp	2009-06-02 16:07:21.000000000 +0100 ++++ icu/source/test/intltest/convtest.cpp	2009-06-02 16:24:08.000000000 +0100 +@@ -552,7 +552,7 @@ +         } +         UConverterUnicodeSet which; +         for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUnicodeSet)((int)which+1)) { +-            if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { ++            if(FALSE) { +                 ucnv_setFallback(cnv, TRUE); +             } +             expected.add(0, cpLimit-1); diff --git a/icu.icuXXXX.virama.prevnext.patch b/icu.icuXXXX.virama.prevnext.patch new file mode 100644 index 0000000..49393c2 --- /dev/null +++ b/icu.icuXXXX.virama.prevnext.patch @@ -0,0 +1,98 @@ +diff -ur icu.orig/source/common/rbbi.cpp icu/source/common/rbbi.cpp +--- icu.orig/source/common/rbbi.cpp	2006-10-05 11:54:13.000000000 +0100 ++++ icu/source/common/rbbi.cpp	2006-10-05 11:57:31.000000000 +0100 +@@ -879,6 +879,22 @@ +     RBBI_END        // state machine processing is after end of user text. 
+ }; +  ++#define VIRAMA_SCRIPT(wc)        ((wc) >= 0x0901 && (wc) <= 0x17FF) ++#define VIRAMA(wc) ((wc) == 0x094D || \ ++                    (wc) == 0x09CD || \ ++                    (wc) == 0x0A4D || \ ++                    (wc) == 0x0ACD || \ ++                    (wc) == 0x0B4D || \ ++                    (wc) == 0x0BCD || \ ++                    (wc) == 0x0C4D || \ ++                    (wc) == 0x0CCD || \ ++                    (wc) == 0x0D4D || \ ++                    (wc) == 0x0DCA || \ ++                    (wc) == 0x0E3A || \ ++                    (wc) == 0x0F84 || \ ++                    (wc) == 0x1039 || \ ++                    (wc) == 0x17D2 || \ ++                    (wc) == 0x200D) +  + //----------------------------------------------------------------------------------- + // +@@ -896,6 +911,7 @@ +     RBBIRunMode         mode; +      +     RBBIStateTableRow  *row; ++    UChar32             prevchar; +     UChar32             c; +     int32_t             lookaheadStatus = 0; +     int32_t             lookaheadTagIdx = 0; +@@ -919,6 +935,7 @@ +     // if we're already at the end of the text, return DONE. +     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);  +     result          = initialPosition; ++    prevchar        = 0; +     c               = UTEXT_NEXT32(fText); +     if (fData == NULL || c==U_SENTINEL) { +         return BreakIterator::DONE; +@@ -1001,6 +1018,11 @@ +  +         // State Transition - move machine to its next state +         // ++        if (VIRAMA_SCRIPT(c) && VIRAMA(prevchar)) ++        { ++                state = START_STATE; ++                row = (RBBIStateTableRow *) (tableData + tableRowLen * state); ++        } +         state = row->fNextState[category]; +         row = (RBBIStateTableRow *) +             // (statetable->fTableData + (statetable->fRowLen * state)); +@@ -1059,6 +1081,7 @@ +         //    the input position.  The next iteration will be processing the +         //    first real input character. 
+         if (mode == RBBI_RUN) { ++            prevchar = c; +             c = UTEXT_NEXT32(fText); +         } else { +             if (mode == RBBI_START) { +@@ -1107,6 +1130,7 @@ +     int16_t             category        = 0; +     RBBIRunMode         mode; +     RBBIStateTableRow  *row; ++    UChar32             prevchar; +     UChar32             c; +     int32_t             lookaheadStatus = 0; +     int32_t             result          = 0; +@@ -1135,6 +1159,7 @@ +     //  Set up the starting char. +     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); +     result          = initialPosition; ++    prevchar        = 0; +     c               = UTEXT_PREVIOUS32(fText); +  +     //  Set the initial state for the state machine +@@ -1218,6 +1243,11 @@ +  +         // State Transition - move machine to its next state +         // ++	if (VIRAMA_SCRIPT(prevchar) && VIRAMA(c)) ++        { ++                state = START_STATE; ++                row = (RBBIStateTableRow *) (statetable->fTableData + (statetable->fRowLen * state)); ++        } +         state = row->fNextState[category]; +         row = (RBBIStateTableRow *) +             (statetable->fTableData + (statetable->fRowLen * state)); +@@ -1269,6 +1299,7 @@ +         //    the input position.  The next iteration will be processing the +         //    first real input character. 
+         if (mode == RBBI_RUN) { ++            prevchar = c; +             c = UTEXT_PREVIOUS32(fText); +         } else {             +             if (mode == RBBI_START) { diff --git a/icu.rh429023.regexp.patch b/icu.rh429023.regexp.patch new file mode 100644 index 0000000..ef8eded --- /dev/null +++ b/icu.rh429023.regexp.patch @@ -0,0 +1,307 @@ +diff -ru icu.orig/source/common/uvectr32.cpp icu/source/common/uvectr32.cpp +--- icu.orig/source/common/uvectr32.cpp	2003-08-27 02:01:30.000000000 +0100 ++++ icu/source/common/uvectr32.cpp	2008-01-22 08:37:06.000000000 +0000 +@@ -1,6 +1,6 @@ + /* + ****************************************************************************** +-* Copyright (C) 1999-2003, International Business Machines Corporation and   * ++* Copyright (C) 1999-2008, International Business Machines Corporation and   * + * others. All Rights Reserved.                                               * + ****************************************************************************** + *   Date        Name        Description +@@ -26,6 +26,7 @@ + UVector32::UVector32(UErrorCode &status) : +     count(0), +     capacity(0), ++    maxCapacity(0), +     elements(NULL) + { +     _init(DEFUALT_CAPACITY, status); +@@ -34,6 +35,7 @@ + UVector32::UVector32(int32_t initialCapacity, UErrorCode &status) : +     count(0), +     capacity(0), ++    maxCapacity(0), +     elements(0) + { +     _init(initialCapacity, status); +@@ -46,6 +48,9 @@ +     if (initialCapacity < 1) { +         initialCapacity = DEFUALT_CAPACITY; +     } ++    if (maxCapacity>0 && maxCapacity<initialCapacity) { ++        initialCapacity = maxCapacity; ++    } +     elements = (int32_t *)uprv_malloc(sizeof(int32_t)*initialCapacity); +     if (elements == 0) { +         status = U_MEMORY_ALLOCATION_ERROR; +@@ -189,21 +194,35 @@ + UBool UVector32::expandCapacity(int32_t minimumCapacity, UErrorCode &status) { +     if (capacity >= minimumCapacity) { +         return TRUE; +-    } else { +-        int32_t 
newCap = capacity * 2; +-        if (newCap < minimumCapacity) { +-            newCap = minimumCapacity; +-        } +-        int32_t* newElems = (int32_t *)uprv_malloc(sizeof(int32_t)*newCap); +-        if (newElems == 0) { +-            status = U_MEMORY_ALLOCATION_ERROR; +-            return FALSE; +-        } +-        uprv_memcpy(newElems, elements, sizeof(elements[0]) * count); +-        uprv_free(elements); +-        elements = newElems; +-        capacity = newCap; +-        return TRUE; ++    } ++    if (maxCapacity>0 && minimumCapacity>maxCapacity) { ++        status = U_BUFFER_OVERFLOW_ERROR; ++        return FALSE; ++    } ++    int32_t newCap = capacity * 2; ++    if (newCap < minimumCapacity) { ++        newCap = minimumCapacity; ++    } ++    if (maxCapacity > 0 && newCap > maxCapacity) { ++        newCap = maxCapacity; ++    } ++    int32_t* newElems = (int32_t *)uprv_malloc(sizeof(int32_t)*newCap); ++    if (newElems == 0) { ++        status = U_MEMORY_ALLOCATION_ERROR; ++        return FALSE; ++    } ++    uprv_memcpy(newElems, elements, sizeof(elements[0]) * count); ++    uprv_free(elements); ++    elements = newElems; ++    capacity = newCap; ++    return TRUE; ++} ++ ++void UVector32::setMaxCapacity(int32_t limit) { ++    U_ASSERT(limit >= 0); ++    maxCapacity = limit; ++    if (maxCapacity < 0) { ++        maxCapacity = 0; +     } + } +  +diff -ru icu.orig/source/common/uvectr32.h icu/source/common/uvectr32.h +--- icu.orig/source/common/uvectr32.h	2006-01-18 03:52:04.000000000 +0000 ++++ icu/source/common/uvectr32.h	2008-01-22 08:37:07.000000000 +0000 +@@ -1,6 +1,6 @@ + /* + ********************************************************************** +-*   Copyright (C) 1999-2006, International Business Machines ++*   Copyright (C) 1999-2008, International Business Machines + *   Corporation and others.  All Rights Reserved. 
+ ********************************************************************** + */ +@@ -61,6 +61,8 @@ +     int32_t   count; +  +     int32_t   capacity; ++     ++    int32_t   maxCapacity;   // Limit beyond which capacity is not permitted to grow. +  +     int32_t*  elements; +  +@@ -162,6 +164,14 @@ +     int32_t *getBuffer() const; +  +     /** ++     * Set the maximum allowed buffer capacity for this vector/stack. ++     * Default with no limit set is unlimited, go until malloc() fails. ++     * A Limit of zero means unlimited capacity. ++     * Units are vector elements (32 bits each), not bytes. ++     */ ++    void setMaxCapacity(int32_t limit); ++ ++    /** +      * ICU "poor man's RTTI", returns a UClassID for this class. +      */ +     static UClassID U_EXPORT2 getStaticClassID(); +@@ -221,7 +231,9 @@ + } +  + inline int32_t *UVector32::reserveBlock(int32_t size, UErrorCode &status) { +-    ensureCapacity(count+size, status); ++    if (ensureCapacity(count+size, status) == FALSE) { ++        return NULL; ++    } +     int32_t  *rp = elements+count; +     count += size; +     return rp; +diff -ru icu.orig/source/i18n/regexcmp.cpp icu/source/i18n/regexcmp.cpp +--- icu.orig/source/i18n/regexcmp.cpp	2006-02-02 04:37:14.000000000 +0000 ++++ icu/source/i18n/regexcmp.cpp	2008-01-22 08:37:06.000000000 +0000 +@@ -1187,14 +1187,17 @@ +             // Because capture groups can be forward-referenced by back-references, +             //  we fill the operand with the capture group number.  At the end +             //  of compilation, it will be changed to the variable's location. 
+-            U_ASSERT(groupNum > 0); +-            int32_t  op; +-            if (fModeFlags & UREGEX_CASE_INSENSITIVE) { +-                op = URX_BUILD(URX_BACKREF_I, groupNum); ++            if (groupNum < 1) {  ++                error(U_REGEX_INVALID_BACK_REF); +             } else { +-                op = URX_BUILD(URX_BACKREF, groupNum); ++                int32_t  op; ++                if (fModeFlags & UREGEX_CASE_INSENSITIVE) { ++                    op = URX_BUILD(URX_BACKREF_I, groupNum); ++                } else { ++                    op = URX_BUILD(URX_BACKREF, groupNum); ++                } ++                fRXPat->fCompiledPat->addElement(op, *fStatus); +             } +-            fRXPat->fCompiledPat->addElement(op, *fStatus); +         } +         break; +  +diff -ru icu.orig/source/i18n/rematch.cpp icu/source/i18n/rematch.cpp +--- icu.orig/source/i18n/rematch.cpp	2005-08-25 19:02:20.000000000 +0100 ++++ icu/source/i18n/rematch.cpp	2008-01-22 08:37:44.000000000 +0000 +@@ -30,6 +30,15 @@ +  + U_NAMESPACE_BEGIN +  ++// Limit the size of the back track stack, to avoid system failures caused ++//   by heap exhaustion.  Units are in 32 bit words, not bytes. ++// This value puts ICU's limits higher than most other regexp implementations, ++//  which use recursion rather than the heap, and take more storage per ++//  backtrack point. ++// This constant is _temporary_.  Proper API to control the value will be added. 
++// ++static const int32_t BACKTRACK_STACK_CAPACITY = 8000000; ++ + //----------------------------------------------------------------------------- + // + //   Constructor and Destructor +@@ -53,6 +62,8 @@ +     } +     if (fStack == NULL || fData == NULL) { +         fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; ++    } else { ++        fStack->setMaxCapacity(BACKTRACK_STACK_CAPACITY); +     } +          +     reset(*RegexStaticSets::gStaticSets->fEmptyString); +@@ -78,6 +89,8 @@ +     } +     if (fStack == NULL || fData == NULL) { +         status = U_MEMORY_ALLOCATION_ERROR; ++    } else { ++        fStack->setMaxCapacity(BACKTRACK_STACK_CAPACITY); +     } +     reset(input); + } +@@ -102,6 +115,8 @@ +     } +     if (fStack == NULL || fData == NULL) { +         status = U_MEMORY_ALLOCATION_ERROR; ++    } else { ++        fStack->setMaxCapacity(BACKTRACK_STACK_CAPACITY); +     } +     reset(*RegexStaticSets::gStaticSets->fEmptyString); + } +@@ -1015,6 +1030,14 @@ + inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatIdx, int32_t frameSize, UErrorCode &status) { +     // push storage for a new frame.  +     int32_t *newFP = fStack->reserveBlock(frameSize, status); ++    if (newFP == NULL) { ++        // Heap allocation error on attempted stack expansion. ++        // We need to return a writable stack frame, so just return the ++        //    previous frame.  The match operation will stop quickly ++        //    because of the error status, after which the frame will never ++        //    be looked at again. ++        return fp; ++    } +     fp = (REStackFrame *)(newFP - frameSize);  // in case of realloc of stack. +      +     // New stack frame = copy of old top frame. +@@ -1030,8 +1053,8 @@ +     fp->fPatIdx = savePatIdx; +     return (REStackFrame *)newFP; + } +-     +-             ++ ++ + //-------------------------------------------------------------------------------- + // + //   MatchAt      This is the actual matching engine. 
+@@ -2262,6 +2285,7 @@ +         } +  +         if (U_FAILURE(status)) { ++            isMatch = FALSE; +             break; +         } +     } +diff -ru icu.orig/source/test/intltest/regextst.cpp icu/source/test/intltest/regextst.cpp +--- icu.orig/source/test/intltest/regextst.cpp	2005-07-05 19:39:00.000000000 +0100 ++++ icu/source/test/intltest/regextst.cpp	2008-01-22 08:38:21.000000000 +0000 +@@ -66,6 +66,10 @@ +         case 6: name = "PerlTests"; +             if (exec) PerlTests(); +             break; ++        case 7: name = "Bug 6149"; ++            if (exec) Bug6149(); ++            break; ++             +  +  +         default: name = ""; +@@ -1637,6 +1641,13 @@ +     // UnicodeSet containing a string +     REGEX_ERR("abc[{def}]xyz", 1, 10, U_REGEX_SET_CONTAINS_STRING); +  ++     ++    // Invalid Back Reference \0 ++    //    For ICU 3.8 and earlier ++    //    For ICU versions newer than 3.8, \0 introduces an octal escape. ++    // ++    REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_INVALID_BACK_REF); ++ + } +  +  +@@ -2119,6 +2130,26 @@ + } +  +  ++//-------------------------------------------------------------- ++// ++//  Bug6149   Verify limits to heap expansion for backtrack stack. ++//             Use this pattern, ++//                 "(a?){1,}" ++//             The zero-length match will repeat forever. 
++//                (That this goes into a loop is another bug) ++// ++//--------------------------------------------------------------- ++void RegexTest::Bug6149() { ++    UnicodeString pattern("(a?){1,}"); ++    UnicodeString s("xyz"); ++    uint32_t flags = 0; ++    UErrorCode status = U_ZERO_ERROR; ++     ++    RegexMatcher  matcher(pattern, s, flags, status); ++    UBool result = false; ++    REGEX_ASSERT_FAIL(result=matcher.matches(status), U_BUFFER_OVERFLOW_ERROR); ++    REGEX_ASSERT(result == FALSE); ++ } +  + #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */ +  +diff -ru icu.orig/source/test/intltest/regextst.h icu/source/test/intltest/regextst.h +--- icu.orig/source/test/intltest/regextst.h	2003-12-03 06:58:28.000000000 +0000 ++++ icu/source/test/intltest/regextst.h	2008-01-22 08:37:06.000000000 +0000 +@@ -30,6 +30,7 @@ +     virtual void Extended(); +     virtual void Errors(); +     virtual void PerlTests(); ++    virtual void Bug6149(); +  +     // The following functions are internal to the regexp tests. +     virtual UBool doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int line);  | 
