diff options
26 files changed, 5812 insertions, 0 deletions
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..1e65467 --- /dev/null +++ b/Makefile @@ -0,0 +1,4 @@ +SRCDIR := $(shell pwd) +NAME := $(shell basename $(SRCDIR)) +include ../common/Makefile + diff --git a/canonicalize.patch b/canonicalize.patch new file mode 100644 index 0000000..3ff9c33 --- /dev/null +++ b/canonicalize.patch @@ -0,0 +1,11 @@ +--- source/common/uloc.c 2011-12-12 04:50:00.601092000 -0500 ++++ source/common/uloc.c 2011-12-12 04:56:18.503570000 -0500 +@@ -1712,7 +1712,7 @@ + /* Check for EURO variants. */ + sawEuro = _deleteVariant(variant, variantSize, "EURO", 4); + len -= sawEuro; +- if (sawEuro > 0 && name[len-1] == '_') { /* delete trailing '_' */ ++ if (sawEuro > 0 && len > 0 && name[len-1] == '_') { /* delete trailing '_' */ + --len; + } + diff --git a/compat-icu36.spec b/compat-icu36.spec new file mode 100644 index 0000000..d8a820e --- /dev/null +++ b/compat-icu36.spec @@ -0,0 +1,189 @@ +Name: compat-icu36 +Version: 3.6 +Release: 5.16.1 +Summary: International Components for Unicode + +Group: System Environment/Libraries +License: X License +URL: http://www.ibm.com/software/globalization/icu/ +Source0: ftp://ftp.software.ibm.com/software/globalization/icu/icu4c-3_6-src.tgz +BuildRoot: %{_tmppath}/%{name}-%{version}-root + +BuildRequires: doxygen, autoconf +Patch1: icu-3.4-multiarchdevel.patch +Patch2: icu-config +Patch3: icu.icu5365.dependantvowels.patch +Patch4: icu.icu5418.malayam.patch +Patch5: icu.icu5431.malayam.patch +Patch6: icu.icu5433.oriya.patch +Patch7: icu.icuXXXX.virama.prevnext.patch +Patch8: icu.icu5465.telegu.patch +Patch9: icu.icu5488.assamese.patch +Patch10: icu.icu5500.devicetablecrash.patch +Patch11: icu.icu5501.sinhala.biggerexpand.patch +Patch12: icu.icu5557.safety.patch +Patch13: icu.icu5594.gujarati.patch +Patch14: icu.icu5506.multiplevowels.patch +Patch15: icu.icuXXXX.malayalam.bysyllable.patch +Patch16: icu.rh429023.regexp.patch +Patch17: icu.icu5483.backport.patch +Patch18: 
icu.icu5797.backport.patch +Patch19: icu.icu6001.backport.patch +Patch20: icu.icu6002.backport.patch +Patch21: icu.icu6175.emptysegments.patch +Patch22: icu.icu5691.backport.patch +Patch23: icu.icuXXXX.rollbackabi.patch +Patch24: canonicalize.patch +Conflicts: icu + +%description +The International Components for Unicode (ICU) libraries provide +robust and full-featured Unicode services on a wide variety of +platforms. ICU supports the most current version of the Unicode +standard, and it provides support for supplementary Unicode +characters (needed for GB 18030 repertoire support). +As computing environments become more heterogeneous, software +portability becomes more important. ICU lets you produce the same +results across all the various platforms you support, without +sacrificing performance. It offers great flexibility to extend and +customize the supplied services. + + +%package -n compat-libicu36 +Summary: International Components for Unicode - libraries +Group: System Environment/Libraries + +%description -n compat-libicu36 +%{summary}. + +This package provides the ICU libraries for packages built +against version %{version}. + +%package -n compat-libicu36-devel +Summary: Development files for International Components for Unicode +Group: Development/Libraries +Requires: compat-libicu36 = %{version}-%{release} +Requires: pkgconfig +Conflicts: libicu-devel + +%description -n compat-libicu36-devel +%{summary}. + +%package -n compat-libicu36-doc +Summary: Documentation for International Components for Unicode +Group: Documentation + +%description -n compat-libicu36-doc +%{summary}. 
+ + +%prep +%setup -q -n icu +%patch1 -p1 -b .multiarchdevel +%patch3 -p1 -b .dependantvowels +%patch4 -p1 -b .icu5418.malayam.patch +%patch5 -p1 -b .icu5431.malayam.patch +%patch6 -p1 -b .icu5433.oriya.patch +%patch7 -p1 -b .icuXXXX.virama.prevnext.patch +%patch8 -p1 -b .icu5465.telegu.patch +%patch9 -p1 -b .icu5488.assamese.patch +%patch10 -p1 -b .icu5500.devicetablecrash.patch +%patch11 -p1 -b .icu5501.sinhala.biggerexpand.patch +%patch12 -p1 -b .icu5557.safety.patch +%patch13 -p1 -b .icu5594.gujarati.patch +%patch14 -p1 -b .icu5506.multiplevowels.patch +%patch15 -p1 -b .icuXXXX.malayalam.bysyllable.patch +%patch16 -p1 -b .rh429023.regexp.patch +%patch17 -p1 -b .icu5483.backport.patch +%patch18 -p1 -b .icu5797.backport.patch +%patch19 -p1 -b .icu6001.backport.patch +%patch20 -p1 -b .icu6002.backport.patch +%patch21 -p1 -b .icu6175.emptysegments.patch +%patch22 -p1 -b .icu5691.backport.patch +%patch23 -p1 -b .icuXXXX.rollbackabi.patch +%patch24 -p0 -b .canonicalize.patch + +%build +cd source +export CFLAGS="$RPM_OPT_FLAGS -fno-strict-aliasing" +export CXXFLAGS="$RPM_OPT_FLAGS -fno-strict-aliasing" +autoconf +%configure --with-data-packaging=library --disable-samples +#rhbz#654590 +sed -i -- "s/-nodefaultlibs -nostdlib//" config/mh-linux +make # %{?_smp_mflags} # -j(X>1) may "break" man pages as of 3.2, b.f.u #2357 +make doc + +%install +rm -rf $RPM_BUILD_ROOT source/__docs +make -C source install DESTDIR=$RPM_BUILD_ROOT +make -C source install-doc docdir=__docs +chmod +x $RPM_BUILD_ROOT%{_libdir}/*.so.* +cp %{PATCH2} $RPM_BUILD_ROOT%{_bindir}/icu-config +chmod a+x $RPM_BUILD_ROOT%{_bindir}/icu-config +sed -i s/\\\$\(THREADSCXXFLAGS\)// $RPM_BUILD_ROOT/%{_libdir}/pkgconfig/icu.pc +sed -i s/\\\$\(THREADSCPPFLAGS\)/-D_REENTRANT/ $RPM_BUILD_ROOT/%{_libdir}/pkgconfig/icu.pc + +%check +make -C source check + + +%clean +rm -rf $RPM_BUILD_ROOT + + +%post -n compat-libicu36 -p /sbin/ldconfig + +%postun -n compat-libicu36 -p /sbin/ldconfig + + +%files 
+%defattr(-,root,root,-) +%doc license.html readme.html +%{_bindir}/derb +%{_bindir}/genbrk +%{_bindir}/gencnval +%{_bindir}/genctd +%{_bindir}/genrb +%{_bindir}/makeconv +%{_bindir}/pkgdata +%{_bindir}/uconv +%{_sbindir}/* +%{_mandir}/man1/derb.1* +%{_mandir}/man1/gencnval.1* +%{_mandir}/man1/genrb.1* +%{_mandir}/man1/genbrk.1* +%{_mandir}/man1/genctd.1* +%{_mandir}/man1/makeconv.1* +%{_mandir}/man1/pkgdata.1* +%{_mandir}/man1/uconv.1* +%{_mandir}/man8/*.8* + +%files -n compat-libicu36 +%defattr(-,root,root,-) +%{_libdir}/*.so.* + +%files -n compat-libicu36-devel +%defattr(-,root,root,-) +%{_bindir}/icu-config +%{_mandir}/man1/icu-config.1* +%{_includedir}/layout +%{_includedir}/unicode +%{_libdir}/*.so +%{_libdir}/icu +%{_libdir}/pkgconfig/icu.pc +%dir %{_datadir}/icu +%dir %{_datadir}/icu/3.6 +%{_datadir}/icu/3.6/mkinstalldirs +%{_datadir}/icu/3.6/config +%doc %{_datadir}/icu/3.6/license.html + +%files -n compat-libicu36-doc +%defattr(-,root,root,-) +%doc source/__docs/icu/html/* + + +%changelog +* Wed Mar 20 2013 Remi Collet <RPMS@famillecollet.com> - 3.6-5.16.1 +- new package from RHEL-5 spec of icu. 
+ diff --git a/icu-3.4-multiarchdevel.patch b/icu-3.4-multiarchdevel.patch new file mode 100644 index 0000000..a7839aa --- /dev/null +++ b/icu-3.4-multiarchdevel.patch @@ -0,0 +1,70 @@ +--- icu/source/configure.in.orig 2006-05-02 12:10:31.000000000 +0100 ++++ icu/source/configure.in 2006-05-02 15:06:07.000000000 +0100 +@@ -1011,6 +1011,7 @@ + Makefile \ + data/icupkg.inc \ + config/Makefile.inc \ ++ config/icu.pc \ + data/Makefile \ + stubdata/Makefile \ + common/Makefile \ +--- /dev/null 2006-04-29 13:38:37.035974750 +0100 ++++ icu/source/config/icu.pc.in 2006-05-02 15:03:14.000000000 +0100 +@@ -0,0 +1,46 @@ ++prefix = @prefix@ ++bindir = @bindir@ ++exec_prefix = @exec_prefix@ ++libdir = @libdir@ ++includedir = @includedir@ ++datadir = @datadir@ ++sbindir = @sbindir@ ++mandir = @mandir@ ++sysconfdir = @sysconfdir@ ++CFLAGS = @CFLAGS@ ++CXXFLAGS = @CXXFLAGS@ ++DEFS = @DEFS@ ++UNICODE_VERSION=@UNICODE_VERSION@ ++ICUPREFIX=icu ++ICULIBSUFFIX=@ICULIBSUFFIX@ ++LIBICU=lib${ICUPREFIX} ++LIBCPPFLAGS=-D_REENTRANT ++CPPFLAGS=@CPPFLAGS@ ${LIBCPPFLAGS} -I${prefix}/include ++SHAREDLIBCPPFLAGS=-DPIC ++SHAREDLIBCXXFLAGS=-fPIC ++SHAREDLIBCFLAGS=-fPIC ++pkglibdir=${libdir}/@PACKAGE@${ICULIBSUFFIX}/@VERSION@ ++pkgdatadir=${datadir}/@PACKAGE@${ICULIBSUFFIX}/@VERSION@ ++ICUDATA_NAME = icudt@LIB_VERSION_MAJOR@@ICUDATA_CHAR@ ++ICUPKGDATA_DIR=@libdir@ ++ICUDATA_DIR=${pkgdatadir} ++SO=so ++ICULIBS_COMMON_LIB_NAME=${LIBICU}uc${ICULIBSUFFIX}.${SO} ++SHLIB_cc=cxx ${DEFS} ${CPPFLAGS} ${CXXFLAGS} @LDFLAGS@ -shared ++SHLIB_c=cc ${DEFS} ${CPPFLAGS} ${CFLAGS} @LDFLAGS@ -shared ++ICULIBS_LAYOUT = -l${ICUPREFIX}le${ICULIBSUFFIX} -l${ICUPREFIX}lx${ICULIBSUFFIX} ++ICULIBS_TOOLUTIL = -l${ICUPREFIX}tu${ICULIBSUFFIX} ++ICULIBS_OBSOLETE = -l${ICUPREFIX}obsolete${ICULIBSUFFIX} ++ICULIBS_ICUIO = -l${ICUPREFIX}io${ICULIBSUFFIX} ++ICULIBS_I18N = -l${ICUPREFIX}i18n${ICULIBSUFFIX} ++ICULIBS_COMMON = -l${ICUPREFIX}uc${ICULIBSUFFIX} ++ICULIBS_DATA = -l${ICUPREFIX}data${ICULIBSUFFIX} ++ICULIBS_LIBSONLY = 
${ICULIBS_I18N} ${ICULIBS_COMMON} ${ICULIBS_DATA} ++ICULIBS_SYSTEMLIBS = @LIBS@ ++ICULIBS_BASE = @LIBS@ -L${libdir} ++ICULIBS = ${ICULIBS_BASE} ${ICULIBS_I18N} ${ICULIBS_COMMON} ${ICULIBS_DATA} ++ ++Name: @PACKAGE@ ++Description: International Components for Unicode ++Version: @VERSION@ ++Libs: @LDFLAGS@ ${ICULIBS} @LIBS@ +--- icu/source/Makefile.in.orig 2006-05-02 12:10:31.000000000 +0100 ++++ icu/source/Makefile.in 2006-05-02 15:18:15.000000000 +0100 +@@ -125,6 +125,8 @@ + @$(MKINSTALLDIRS) $(DESTDIR)$(sbindir) + $(INSTALL_DATA) @platform_make_fragment@ $(DESTDIR)$(pkgdatadir)/config/@platform_make_fragment_name@ + $(INSTALL_SCRIPT) $(top_srcdir)/mkinstalldirs $(DESTDIR)$(pkgdatadir)/mkinstalldirs ++ @$(MKINSTALLDIRS) $(DESTDIR)$(libdir)/pkgconfig ++ $(INSTALL_DATA) $(top_srcdir)/config/icu.pc $(DESTDIR)$(libdir)/pkgconfig/icu.pc + $(INSTALL_DATA) $(top_srcdir)/../license.html $(DESTDIR)$(pkgdatadir)/license.html + $(INSTALL_SCRIPT) $(top_builddir)/config/icu-config $(DESTDIR)$(bindir)/icu-config + $(INSTALL_DATA) $(top_builddir)/config/Makefile.inc $(DESTDIR)$(pkglibdir)/Makefile.inc diff --git a/icu-config b/icu-config new file mode 100755 index 0000000..08f9ce8 --- /dev/null +++ b/icu-config @@ -0,0 +1,387 @@ +#!/bin/sh +## -*-sh-*- +#set -x +# BEGIN of icu-config-top +#****************************************************************************** +# Copyright (C) 1999-2004, International Business Machines +# Corporation and others. All Rights Reserved. +#****************************************************************************** +# This script is designed to aid configuration of ICU. +# rpath links a library search path right into the binaries. + + +### END of icu-config-top + +## Zero out prefix. 
+exec_prefix=`pkg-config --variable=exec_prefix icu` +execprefix=$exec_prefix +prefix=`pkg-config --variable=prefix icu` + + +loaddefs() +{ +LDLIBRARYPATH_ENVVAR="LD_LIBRARY_PATH" +bindir=`pkg-config --variable=bindir icu` +sbindir=`pkg-config --variable=sbindir icu` +libdir=`pkg-config --variable=libdir icu` +sysconfdir=`pkg-config --variable=sysconfdir icu` +mandir=`pkg-config --variable=mandir icu` +datadir=`pkg-config --variable=datadir icu` +pkglibdir=`pkg-config --variable=pkglibdir icu` +ICULIBS_COMMON_LIB_NAME=`pkg-config --variable=ICULIBS_COMMON_LIB_NAME icu` +UNICODE_VERSION=`pkg-config --variable=UNICODE_VERSION icu` +VERSION=`pkg-config --modversion icu` +SO=`pkg-config --variable=SO icu` + +## -*-sh-*- +## BEGIN of icu-config-bottom. +## Copyright (c) 2002-2004, International Business Machines Corporation and +## others. All Rights Reserved. + +ICUUC_FILE=${libdir}/${ICULIBS_COMMON_LIB_NAME} + +# echo ENABLE RPATH $ENABLE_RPATH and RPATHLDFLAGS=${RPATH_LDFLAGS} +if [ "x$PKGDATA_MODE" = "x" ]; then + PKGDATA_MODE=dll +fi + +} + +## The actual code of icu-config goes here. + +ME=`basename $0` + +allflags() +{ + echo " --bindir Print binary directory path (bin)" + echo " --cc Print C compiler used [CC]" + echo " --cflags Print C compiler flags [CFLAGS]" + echo " --cflags-dynamic Print additional C flags for" + echo " building shared libraries." + echo " --cppflags Print C Preprocessor flags [CPPFLAGS]" + echo " --cppflags-dynamic Print additional C Preprocessor flags for" + echo " building shared libraries." + echo " --cppflags-searchpath Print only -I include directives (-Iinclude)" + echo " --cxx Print C++ compiler used [CXX]" + echo " --cxxflags Print C++ compiler flags [CXXFLAGS]" + echo " --cxxflags-dynamic Print additional C++ flags for" + echo " building shared libraries." 
+ echo " --detect-prefix Attempt to detect prefix based on PATH" + echo " --exec-prefix Print prefix for executables (/bin)" + echo " --exists Return with 0 status if ICU exists else fail" + echo " --help, -?, --usage Print this message" + echo " --icudata Print shortname of ICU data file (icudt21l)" + echo " --icudata-install-dir Print path to install data to - use as --install option to pkgdata(1)" + echo " --icudata-mode Print default ICU pkgdata mode (dll) - use as --mode option to pkgdata(1)." + echo " --icudatadir Print path to packaged archive data. Can set as [ICU_DATA]" + echo " --invoke Print commands to invoke an ICU program" + echo " --invoke=<prog> Print commands to invoke an ICU program named <prog> (ex: genrb)" + echo " --ldflags Print -L search path and -l libraries to link with ICU [LDFLAGS]. This is for the data, uc (common), and i18n libraries only. " + echo " --ldflags-layout Print ICU layout engine link directive. Use in addition to --ldflags" + echo " --ldflags-libsonly Same as --ldflags, but only the -l directives" + echo " --ldflags-searchpath Print only -L (search path) directive" + echo " --ldflags-system Print only system libs ICU links with (-lpthread, -lm)" + echo " --ldflags-icuio Print ICU icuio link directive. Use in addition to --ldflags " + echo " --ldflags-obsolete Print ICU obsolete link directive. Use in addition to --ldflags. (requires icuapps/obsolete to be built and installed.) " + echo " --mandir Print manpage (man) path" + echo " --prefix Print PREFIX to icu install (/usr/local)" + echo " --prefix=XXX Set prefix to XXX for remainder of command" + echo " --sbindir Print system binary path (sbin) " + echo " --shared-datadir Print shared data (share) path. This is NOT the ICU data dir." 
+ echo " --shlib-c Print the command to compile and build C shared libraries with ICU" + echo " --shlib-cc Print the command to compile and build C++ shared libraries with ICU" + echo " --sysconfdir Print system config (etc) path" + echo " --unicode-version Print version of Unicode data used in ICU ($UNICODE_VERSION)" + echo " --version Print ICU version ($VERSION)" + echo " --incfile Print path to Makefile.inc (for -O option of pkgdata)" +} + +## Print the normal usage message +shortusage() +{ + echo "usage: ${ME} " `allflags | cut -c-25 | sed -e 's%.*%[ & ]%'` +} + + +usage() +{ + echo "${ME}: icu-config: ICU configuration helper script" + echo + echo "The most commonly used options will be --cflags, --cxxflags, --cppflags, and --ldflags." + echo 'Example (in make): CPPFLAGS=$(shell icu-config --cppflags)' + echo ' LDFLAGS=$(shell icu-config --ldflags)' + echo " (etc).." + echo + echo "Usage:" + allflags + + echo + echo " [Brackets] show MAKE variable equivalents, (parenthesis) show example output" + echo + echo "Copyright (c) 2002, International Business Machines Corporation and others. All Rights Reserved." +} + +## Check the sanity of current variables: exit with status 2 if ${ICUUC_FILE} is not a regular file; every diagnostic goes to stderr so stdout carries only requested values +sanity() +{ + if [ ! -f "${ICUUC_FILE}" ]; + then + echo "### $ME: Can't find ${ICUUC_FILE} - ICU prefix is wrong." 1>&2 + echo "### Try the --prefix= or --exec-prefix= options " 1>&2 + echo "### or --detect-prefix" 1>&2 + echo "### $ME: Exitting." 1>&2 + exit 2 + fi +} + +## Main starts here. + +if [ $# -lt 1 ]; then + shortusage + exit 1 +fi + + +# Load our variables from autoconf +# ALWAYS load twice because of dependencies +loaddefs +loaddefs +sanity + +while [ $# -gt 0 ]; +do + arg="$1" + var=`echo $arg | sed -e 's/^[^=]*=//'` +# echo "### processing $arg" 1>&2 + case "$arg" in + + # undocumented. 
+ --debug) + set -x + ;; + + --so) + echo $SO + ;; + + --bindir) + echo $bindir + ;; + + --libdir) + echo $libdir + ;; + + --exists) + sanity + ;; + + --sbindir) + echo $sbindir + ;; + + --invoke=*) + QUOT="'" + CMD="${var}" + + # If it's not a locally executable command (1st choice) then + # search for it in the ICU directories. + if [ ! -x ${CMD} ]; then + if [ -x ${bindir}/${var} ]; then + CMD="${bindir}/${var}" + fi + if [ -x ${sbindir}/${var} ]; then + CMD="${sbindir}/${var}" + fi + fi + + echo "env ${QUOT}${LDLIBRARYPATH_ENVVAR}=${libdir}:"'${'"${LDLIBRARYPATH_ENVVAR}"'}'${QUOT} ${CMD} + ;; + + --invoke) + QUOT="'" + echo "env ${QUOT}${LDLIBRARYPATH_ENVVAR}=${libdir}:"'${'"${LDLIBRARYPATH_ENVVAR}"'}'${QUOT} + ;; + + --cflags) + pkg-config --variable=CFLAGS icu + ;; + + --cc) + echo cc + ;; + + --cxx) + echo c++ + ;; + + --cxxflags) + pkg-config --variable=CXXFLAGS icu + ;; + + --cppflags) + # Don't echo the -I. - it's unneeded. + CPPFLAGS=`pkg-config --variable=CPPFLAGS icu` + echo $CPPFLAGS | sed -e 's/-I. //' + ;; + + --cppflags-searchpath) + echo -I${prefix}/include + ;; + + --cppflags-dynamic) + pkg-config --variable=SHAREDLIBCPPFLAGS icu + ;; + + --cxxflags-dynamic) + pkg-config --variable=SHAREDLIBCXXFLAGS icu + ;; + + --cflags-dynamic) + pkg-config --variable=SHAREDLIBCFLAGS icu + ;; + + --ldflags-system) + pkg-config --variable=ICULIBS_SYSTEMLIBS icu + ;; + + --ldflags) + pkg-config --libs icu +# $RPATH_LDFLAGS + ;; + + --ldflags-libsonly) + pkg-config --variable=ICULIBS_LIBSONLY icu + ;; + + --ldflags-icuio) + pkg-config --variable=ICULIBS_ICUIO icu + ;; + + --ldflags-obsolete) + pkg-config --variable=ICULIBS_OBSOLETE icu + ;; + + --ldflags-toolutil) + pkg-config --variable=ICULIBS_TOOLUTIL icu + ;; + + --ldflags-layout) + pkg-config --variable=ICULIBS_LAYOUT icu + ;; + + --ldflags-searchpath) + echo -L${libdir} + ;; + + --detect-prefix) + HERE=`echo $0 | sed -e "s/$ME//g"` + if [ -f $HERE/../lib/${ICULIBS_COMMON_LIB_NAME} ]; then + prefix=$HERE/.. 
+ echo "## Using --prefix=${prefix}" 1>&2 + fi + loaddefs + loaddefs + sanity + ;; + + --exec-prefix) + echo $exec_prefix + ;; + + --prefix) + echo $prefix + ;; + + --prefix=*) + prefix=$var + loaddefs + loaddefs + sanity + ;; + + --sysconfdir) + echo $sysconfdir + ;; + + --mandir) + echo $mandir + ;; + + --shared-datadir) + echo $datadir + ;; + + --incfile) + echo $pkglibdir/Makefile.inc + ;; + + --icudata) + pkg-config --variable=ICUDATA_NAME icu + ;; + + --icudata-mode) + echo $PKGDATA_MODE + ;; + + --icudata-install-dir) + pkg-config --variable=ICUPKGDATA_DIR icu + ;; + + --icudatadir) + pkg-config --variable=ICUDATA_DIR icu + ;; + + --shlib-c) + pkg-config --variable=SHLIB_c icu + ;; + + --shlib-cc) + pkg-config --variable=SHLIB_cc icu + ;; + + --version) + echo $VERSION + ;; + + --unicode-version) + echo $UNICODE_VERSION + ;; + + --help) + usage + exit 0 + ;; + + --usage) + usage + exit 0 + ;; + +# --enable-rpath=*) +# ENABLE_RPATH=$var +# loaddefs +# ;; + + -?) + usage + exit 0 + ;; + + *) + echo ${ME}: ERROR Unknown Option $arg 1>&2 + echo 1>&2 + shortusage 1>&2 + echo "### $ME: Exitting." 
1>&2 + exit 1; + ;; + esac + shift +done + +# Check once before we quit (will check last used prefix) +sanity +## END of icu-config-bottom + +exit 0 + diff --git a/icu.icu5365.dependantvowels.patch b/icu.icu5365.dependantvowels.patch new file mode 100644 index 0000000..5708018 --- /dev/null +++ b/icu.icu5365.dependantvowels.patch @@ -0,0 +1,11 @@ +--- icu/source/layout/IndicReordering.cpp.orig 2006-09-05 17:01:15.000000000 +0100 ++++ icu/source/layout/IndicReordering.cpp 2006-09-05 17:01:19.000000000 +0100 +@@ -377,7 +377,7 @@ + {-1, 6, 1, -1, -1, -1, -1, -1, -1, 5, 9, 5, 5, 4, 12}, // 2 - consonant with nukta + {-1, 6, 1, -1, -1, -1, -1, -1, 2, 5, 9, 5, 5, 4, 12}, // 3 - consonant + {-1, -1, -1, -1, -1, -1, 3, 2, -1, -1, -1, -1, -1, -1, 7}, // 4 - consonant virama +- {-1, 6, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, // 5 - dependent vowels ++ {-1, 6, 1, -1, -1, -1, -1, -1, -1, 5, -1, -1, -1, -1, -1}, // 5 - dependent vowels + {-1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, // 6 - vowel mark + {-1, -1, -1, -1, -1, -1, 3, 2, -1, -1, -1, -1, -1, -1, -1}, // 7 - consonant virama ZWJ, consonant ZWJ virama + {-1, 6, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4, -1}, // 8 - independent vowels that can take a virama diff --git a/icu.icu5418.malayam.patch b/icu.icu5418.malayam.patch new file mode 100644 index 0000000..03fbe63 --- /dev/null +++ b/icu.icu5418.malayam.patch @@ -0,0 +1,39 @@ +--- icu/source/layout/IndicClassTables.cpp.orig 2006-08-23 01:12:40.000000000 +0100 ++++ icu/source/layout/IndicClassTables.cpp 2006-09-25 09:06:38.000000000 +0100 +@@ -173,6 +173,19 @@ + _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx // 0CE0 - 0CEF + }; + ++#if 1 ++//use the pango char class table here ++static const IndicClassTable::CharClass mlymCharClasses[] = ++{ ++ _xx, _xx, _mp, _mp, _xx, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _xx, _iv, _iv, /* 0D00 - 0D0F */ ++ _iv, _xx, _iv, _iv, _iv, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, 
_ct, _ct, _ct, /* 0D10 - 0D1F */ ++ _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _xx, _ct, _ct, _ct, _ct, _ct, _pb, /* 0D20 - 0D2F */ ++ _pb, _cn, _ct, _ct, _ct, _pb, _ct, _ct, _ct, _ct, _xx, _xx, _xx, _xx, _dr, _dr, /* 0D30 - 0D3F */ ++ _dr, _dr, _dr, _dr, _xx, _xx, _dl, _dl, _dl, _xx, _s1, _s2, _s3, _vr, _xx, _xx, /* 0D40 - 0D4F */ ++ _xx, _xx, _xx, _xx, _xx, _xx, _xx, _dr, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, /* 0D50 - 0D5F */ ++ _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx /* 0D60 - 0D6F */ ++}; ++#else + // FIXME: this is correct for old-style Malayalam (MAL) but not for reformed Malayalam (MLR) + // FIXME: should there be a REPH for old-style Malayalam? + static const IndicClassTable::CharClass mlymCharClasses[] = +@@ -185,6 +198,7 @@ + _xx, _xx, _xx, _xx, _xx, _xx, _xx, _m2, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 0D50 - 0D5F + _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx // 0D60 - 0D6F + }; ++#endif + + static const IndicClassTable::CharClass sinhCharClasses[] = + { +@@ -232,7 +246,7 @@ + #define TAML_SCRIPT_FLAGS (SF_MPRE_FIXUP | SF_NO_POST_BASE_LIMIT | SF_FILTER_ZERO_WIDTH) + #define TELU_SCRIPT_FLAGS (SF_MATRAS_AFTER_BASE | SF_FILTER_ZERO_WIDTH | 3) + #define KNDA_SCRIPT_FLAGS (SF_MATRAS_AFTER_BASE | SF_FILTER_ZERO_WIDTH | 3) +-#define MLYM_SCRIPT_FLAGS (SF_MPRE_FIXUP | SF_NO_POST_BASE_LIMIT | SF_FILTER_ZERO_WIDTH) ++#define MLYM_SCRIPT_FLAGS (SF_MPRE_FIXUP | SF_NO_POST_BASE_LIMIT) + #define SINH_SCRIPT_FLAGS (SF_MPRE_FIXUP | SF_NO_POST_BASE_LIMIT) + + // diff --git a/icu.icu5431.malayam.patch b/icu.icu5431.malayam.patch new file mode 100644 index 0000000..48a549d --- /dev/null +++ b/icu.icu5431.malayam.patch @@ -0,0 +1,107 @@ +--- icu.orig/source/layout/IndicReordering.cpp 2006-12-21 09:24:42.000000000 +0000 ++++ icu/source/layout/IndicReordering.cpp 2006-12-21 09:16:15.000000000 +0000 +@@ -50,6 +50,14 @@ + #define distFeatureMask 0x00010000UL + #define initFeatureMask 
0x00008000UL + ++// TODO: Find better names for these! ++#define tagArray4 (loclFeatureMask | nuktFeatureMask | akhnFeatureMask | vatuFeatureMask | presFeatureMask | blwsFeatureMask | abvsFeatureMask | pstsFeatureMask | halnFeatureMask | blwmFeatureMask | abvmFeatureMask | distFeatureMask) ++#define tagArray3 (pstfFeatureMask | tagArray4) ++#define tagArray2 (halfFeatureMask | tagArray3) ++#define tagArray1 (blwfFeatureMask | tagArray2) ++#define tagArray0 (rphfFeatureMask | tagArray1) ++ ++ + class IndicReorderingOutput : public UMemory { + private: + le_int32 fOutIndex; +@@ -154,6 +162,27 @@ + fSMabove = fSMbelow = 0; + } + ++ void swapChars(int a, int b) // exchanges the output character and char index stored at offsets a and b, both taken relative to fOutIndex ++ { ++ LEErrorCode success = LE_NO_ERROR; ++ LEUnicode temp_char; ++ le_uint32 temp_index; ++ FeatureMask temp_tag; ++ ++ temp_char = fOutChars[fOutIndex + b]; ++ temp_index = fGlyphStorage.getCharIndex(fOutIndex + b, success); ++ temp_tag = fGlyphStorage.getAuxData(fOutIndex + b, success); ++ ++ fOutChars[fOutIndex + b] = fOutChars[fOutIndex + a]; ++ le_uint32 toswap = fGlyphStorage.getCharIndex(fOutIndex + a, success); ++ fGlyphStorage.setCharIndex(fOutIndex + b, toswap, success); ++ fGlyphStorage.setAuxData(fOutIndex + b, tagArray3, success); // NOTE(review): not a pure swap - slot b gets tagArray3 rather than slot a's aux data; presumably intentional, TODO confirm ++ ++ fOutChars[fOutIndex + a] = temp_char; ++ fGlyphStorage.setCharIndex(fOutIndex + a, temp_index, success); ++ fGlyphStorage.setAuxData(fOutIndex + a, temp_tag, success); // slot a does receive slot b's saved aux data; success is never inspected in this function ++ } ++ + void writeChar(LEUnicode ch, le_uint32 charIndex, FeatureMask charFeatures) + { + LEErrorCode success = LE_NO_ERROR; +@@ -335,13 +364,6 @@ + C_DOTTED_CIRCLE = 0x25CC + }; + +-// TODO: Find better names for these! 
+-#define tagArray4 (loclFeatureMask | nuktFeatureMask | akhnFeatureMask | vatuFeatureMask | presFeatureMask | blwsFeatureMask | abvsFeatureMask | pstsFeatureMask | halnFeatureMask | blwmFeatureMask | abvmFeatureMask | distFeatureMask) +-#define tagArray3 (pstfFeatureMask | tagArray4) +-#define tagArray2 (halfFeatureMask | tagArray3) +-#define tagArray1 (blwfFeatureMask | tagArray2) +-#define tagArray0 (rphfFeatureMask | tagArray1) +- + static const FeatureMap featureMap[] = + { + {loclFeatureTag, loclFeatureMask}, +@@ -629,6 +651,21 @@ + output.writeChar(chars[i], i, tagArray4); + } + ++ /* for the special conjuction of Cons+0x0d4d+0x0d31 or Cons+0x0d4d+0x0d30 of Malayalam */ ++ if ((baseConsonant - 2 >= 0) && ++ (chars[baseConsonant - 1] == 0x0d4d) && ++ ((chars[baseConsonant] == 0x0d31) || ++ (chars[baseConsonant] == 0x0d30)) && ++ ((chars[baseConsonant - 2] >= 0x0d15) && ++ (chars[baseConsonant - 2] <= 0x0d39))) { ++ if (baseConsonant < 3 || chars[baseConsonant - 3] != 0x0d4d) { ++ output.swapChars(-1, -3); ++ ++ if (mpreFixups) ++ mpreFixups->reduce(); ++ } ++ } ++ + if ((classTable->scriptFlags & SF_MATRAS_AFTER_BASE) != 0) { + output.writeMbelow(); + output.writeSMbelow(); // FIXME: there are no SMs in these scripts... 
+--- icu.orig/source/layout/MPreFixups.h 2006-11-10 09:42:47.000000000 +0000 ++++ icu/source/layout/MPreFixups.h 2006-12-21 09:13:47.000000000 +0000 +@@ -31,6 +31,8 @@ + + void apply(LEGlyphStorage &glyphStorage); + ++ void reduce(); ++ + private: + FixupData *fFixupData; + le_int32 fFixupCount; +--- icu.orig/source/layout/MPreFixups.cpp 2006-11-10 09:42:47.000000000 +0000 ++++ icu/source/layout/MPreFixups.cpp 2006-12-21 09:16:33.000000000 +0000 +@@ -40,6 +40,12 @@ + } + } + ++void MPreFixups::reduce() ++{ ++ if (fFixupCount > 0) ++ fFixupCount--; ++} ++ + void MPreFixups::apply(LEGlyphStorage &glyphStorage) + { + for (le_int32 fixup = 0; fixup < fFixupCount; fixup += 1) { diff --git a/icu.icu5433.oriya.patch b/icu.icu5433.oriya.patch new file mode 100644 index 0000000..f35f5a2 --- /dev/null +++ b/icu.icu5433.oriya.patch @@ -0,0 +1,31 @@ +diff -ru icu.orig/source/layout/IndicClassTables.cpp icu/source/layout/IndicClassTables.cpp +--- icu.orig/source/layout/IndicClassTables.cpp 2006-10-03 14:27:47.000000000 +0100 ++++ icu/source/layout/IndicClassTables.cpp 2006-10-03 14:30:07.000000000 +0100 +@@ -120,6 +120,19 @@ + _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx // 0AE0 - 0AEF + }; + ++#if 1 ++static const IndicClassTable::CharClass oryaCharClasses[] = ++{ ++ _xx, _ma, _mp, _mp, _xx, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _xx, _xx, _iv, /* 0B00 - 0B0F */ ++ _iv, _xx, _xx, _iv, _iv, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _ct, _bb, /* 0B10 - 0B1F */ ++ _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _xx, _bb, _bb, _bb, _bb, _bb, _pb, /* 0B20 - 0B2F */ ++ _rb, _xx, _bb, _bb, _xx, _bb, _bb, _bb, _bb, _bb, _xx, _xx, _nu, _xx, _dr, _da, /* 0B30 - 0B3F */ ++ _dr, _db, _db, _db, _xx, _xx, _xx, _dl, _s1, _xx, _xx, _s2, _s3, _vr, _xx, _xx, /* 0B40 - 0B4F */ ++ _xx, _xx, _xx, _xx, _xx, _xx, _da, _dr, _xx, _xx, _xx, _xx, _cn, _cn, _xx, _pb, /* 0B50 - 0B5F */ ++ _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, /* 
0B60 - 0B6F */ ++ _xx, _bb /* 0B70 - 0B71 */ ++}; ++#else + static const IndicClassTable::CharClass oryaCharClasses[] = + { + _xx, _ma, _mp, _mp, _xx, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _xx, _xx, _iv, // 0B00 - 0B0F +@@ -131,6 +144,7 @@ + _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 0B60 - 0B6F + _xx, _ct // 0B70 - 0B71 + }; ++#endif + + static const IndicClassTable::CharClass tamlCharClasses[] = + { diff --git a/icu.icu5465.telegu.patch b/icu.icu5465.telegu.patch new file mode 100644 index 0000000..7e80103 --- /dev/null +++ b/icu.icu5465.telegu.patch @@ -0,0 +1,29 @@ +--- icu.orig/source/layout/IndicClassTables.cpp 2007-02-05 14:44:17.000000000 +0000 ++++ icu/source/layout/IndicClassTables.cpp 2007-02-05 14:47:49.000000000 +0000 +@@ -145,6 +145,7 @@ + }; + + // FIXME: Should some of the bb's be pb's? (KA, NA, MA, YA, VA, etc. (approx 13)) ++#if 0 + static const IndicClassTable::CharClass teluCharClasses[] = + { + _xx, _mp, _mp, _mp, _xx, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _xx, _iv, _iv, // 0C00 - 0C0F +@@ -155,6 +156,18 @@ + _xx, _xx, _xx, _xx, _xx, _da, _m2, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 0C50 - 0C5F + _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx // 0C60 - 0C6F + }; ++#else ++static const IndicClassTable::CharClass teluCharClasses[] = ++{ ++ _xx, _mp, _mp, _mp, _xx, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _xx, _iv, _iv, /* 0C00 - 0C0F */ ++ _iv, _xx, _iv, _iv, _iv, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, /* 0C10 - 0C1F */ ++ _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _bb, _xx, _bb, _bb, _bb, _bb, _bb, _bb, /* 0C20 - 0C2F */ ++ _bb, _bb, _bb, _bb, _xx, _bb, _bb, _bb, _bb, _bb, _xx, _xx, _xx, _xx, _da, _da, /* 0C30 - 0C3F */ ++ _da, _dr, _dr, _dr, _dr, _xx, _da, _da, _s1, _xx, _da, _da, _da, _vr, _xx, _xx, /* 0C40 - 0C4F */ ++ _xx, _xx, _xx, _xx, _xx, _da, _db, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, /* 0C50 - 0C5F */ ++ _iv, _iv, _xx, _xx, _xx, _xx, 
_xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx /* 0C60 - 0C6F */ ++}; ++#endif + + // U+CC3 and U+CC4 are _lm here not _dr since the Kannada rendering + // rules want them below and to the right of the entire cluster diff --git a/icu.icu5483.backport.patch b/icu.icu5483.backport.patch new file mode 100644 index 0000000..039dee2 --- /dev/null +++ b/icu.icu5483.backport.patch @@ -0,0 +1,874 @@ +diff -ru icu.orig/source/common/ucnv2022.c icu/source/common/ucnv2022.c +--- icu.orig/source/common/ucnv2022.c 2009-06-02 11:48:38.000000000 +0100 ++++ icu/source/common/ucnv2022.c 2009-06-02 12:30:29.000000000 +0100 +@@ -84,6 +84,26 @@ + #define V_TAB 0x0B + #define SPACE 0x20 + ++enum { ++ HWKANA_START=0xff61, ++ HWKANA_END=0xff9f ++}; ++ ++/* ++ * 94-character sets with native byte values A1..FE are encoded in ISO 2022 ++ * as bytes 21..7E. (Subtract 0x80.) ++ * 96-character sets with native byte values A0..FF are encoded in ISO 2022 ++ * as bytes 20..7F. (Subtract 0x80.) ++ * Do not encode C1 control codes with native bytes 80..9F ++ * as bytes 00..1F (C0 control codes). ++ */ ++enum { ++ GR94_START=0xa1, ++ GR94_END=0xfe, ++ GR96_START=0xa0, ++ GR96_END=0xff ++}; ++ + /* + * ISO 2022 control codes must not be converted from Unicode + * because they would mess up the byte stream. +@@ -981,22 +1001,27 @@ + + + /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c +- * any future change in _MBCSFromUChar32() function should be reflected in +- * this macro ++ * any future change in _MBCSFromUChar32() function should be reflected here. 
++ * @return number of bytes in *value; negative number if fallback; 0 if no mapping + */ +-static U_INLINE void ++static U_INLINE int32_t + MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData, + UChar32 c, + uint32_t* value, + UBool useFallback, +- int32_t *length, + int outputType) + { + const int32_t *cx; + const uint16_t *table; + uint32_t stage2Entry; + uint32_t myValue; ++ int32_t length; + const uint8_t *p; ++ /* ++ * TODO(markus): Use and require new, faster MBCS conversion table structures. ++ * Use internal version of ucnv_open() that verifies that the new structures are available, ++ * else U_INTERNAL_PROGRAM_ERROR. ++ */ + /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ + if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { + table=sharedData->mbcs.fromUnicodeTable; +@@ -1005,51 +1030,60 @@ + if(outputType==MBCS_OUTPUT_2){ + myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); + if(myValue<=0xff) { +- *length=1; ++ length=1; + } else { +- *length=2; ++ length=2; + } + } else /* outputType==MBCS_OUTPUT_3 */ { + p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); + myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; + if(myValue<=0xff) { +- *length=1; ++ length=1; + } else if(myValue<=0xffff) { +- *length=2; ++ length=2; + } else { +- *length=3; ++ length=3; + } + } ++ /* ++ * TODO(markus): Use Shift-JIS table for JIS X 0208, to save mapping table space. ++ * Pass in parameter for type of output bytes, for validation and shifting: ++ * - Direct: Pass bytes through, but forbid control codes 00-1F (except SI/SO/ESC) and space 20? ++ * (Need to allow some (TAB/LF/CR) or most of them for ASCII and maybe JIS X 0201.) ++ * - A1-FE: Subtract 80 after range check. ++ * - SJIS: Shift DBCS result to 21-7E x 21-7E. ++ */ + /* is this code point assigned, or do we use fallbacks? 
*/ +- if( (stage2Entry&(1<<(16+(c&0xf))))!=0 || +- (FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) +- ) { ++ if((stage2Entry&(1<<(16+(c&0xf))))!=0) { ++ /* assigned */ ++ *value=myValue; ++ return length; ++ } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) { + /* + * We allow a 0 byte output if the "assigned" bit is set for this entry. + * There is no way with this data structure for fallback output + * to be a zero byte. + */ +- /* assigned */ + *value=myValue; +- return; ++ return -length; + } + } + + cx=sharedData->mbcs.extIndexes; + if(cx!=NULL) { +- *length=ucnv_extSimpleMatchFromU(cx, c, value, useFallback); +- return; ++ return ucnv_extSimpleMatchFromU(cx, c, value, useFallback); + } + + /* unassigned */ +- *length=0; ++ return 0; + } + + /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c +- * any future change in _MBCSSingleFromUChar32() function should be reflected in +- * this macro ++ * any future change in _MBCSSingleFromUChar32() function should be reflected here. ++ * @param retval pointer to output byte ++ * @return 1 roundtrip byte 0 no mapping -1 fallback byte + */ +-static U_INLINE void ++static U_INLINE int32_t + MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, + UChar32 c, + uint32_t* retval, +@@ -1059,20 +1093,21 @@ + int32_t value; + /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ + if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { +- *retval=(uint16_t)-1; +- return; ++ return 0; + } + /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ + table=sharedData->mbcs.fromUnicodeTable; + /* get the byte for the output */ + value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); + /* is this code point assigned, or do we use fallbacks? */ +- if(useFallback ? 
value>=0x800 : value>=0xc00) { +- value &=0xff; ++ *retval=(uint32_t)(value&0xff); ++ if(value>=0xf00) { ++ return 1; /* roundtrip */ ++ } else if(useFallback ? value>=0x800 : value>=0xc00) { ++ return -1; /* fallback taken */ + } else { +- value= -1; ++ return 0; /* no mapping */ + } +- *retval=(uint16_t) value; + } + + #ifdef U_ENABLE_GENERIC_ISO_2022 +@@ -1316,6 +1351,7 @@ + + static void + UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { ++ UConverter *cnv = args->converter; + UConverterDataISO2022 *converterData; + ISO2022State *pFromU2022State; + uint8_t *target = (uint8_t *) args->target; +@@ -1335,14 +1371,13 @@ + int8_t cs, g; + + /* set up the state */ +- converterData = (UConverterDataISO2022*)args->converter->extraInfo; ++ converterData = (UConverterDataISO2022*)cnv->extraInfo; + pFromU2022State = &converterData->fromU2022State; +- useFallback = args->converter->useFallback; + + choiceCount = 0; + + /* check if the last codepoint of previous buffer was a lead surrogate*/ +- if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) { ++ if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { + goto getTrail; + } + +@@ -1361,26 +1396,26 @@ + if(UTF_IS_SECOND_SURROGATE(trail)) { + source++; + sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); +- args->converter->fromUChar32=0x00; ++ cnv->fromUChar32=0x00; + /* convert this supplementary code point */ + /* exit this condition tree */ + } else { + /* this is an unmatched lead code unit (1st surrogate) */ + /* callback(illegal) */ + *err=U_ILLEGAL_CHAR_FOUND; +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + } else { + /* no more input */ +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + } else { + /* this is an unmatched trail code unit (2nd surrogate) */ + /* callback(illegal) */ + *err=U_ILLEGAL_CHAR_FOUND; +- args->converter->fromUChar32=sourceChar; 
++ cnv->fromUChar32=sourceChar; + break; + } + } +@@ -1389,7 +1424,7 @@ + if(IS_2022_CONTROL(sourceChar)) { + /* callback(illegal) */ + *err=U_ILLEGAL_CHAR_FOUND; +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + +@@ -1407,9 +1442,10 @@ + + /* JIS7/8: try single-byte half-width Katakana before JISX208 */ + if(converterData->version == 3 || converterData->version == 4) { +- choices[choiceCount++] = cs = (int8_t)HWKANA_7BIT; +- csm &= ~CSM(cs); ++ choices[choiceCount++] = (int8_t)HWKANA_7BIT; + } ++ /* Do not try single-byte half-width Katakana for other versions. */ ++ csm &= ~CSM(HWKANA_7BIT); + + /* try the current G0 charset */ + choices[choiceCount++] = cs = pFromU2022State->cs[0]; +@@ -1432,86 +1468,134 @@ + } + + cs = g = 0; ++ /* ++ * len==0: no mapping found yet ++ * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks ++ * len>0: found a roundtrip result, done ++ */ + len = 0; ++ /* ++ * We will turn off useFallback after finding a fallback, ++ * but we still get fallbacks from PUA code points as usual. ++ * Therefore, we will also need to check that we don't overwrite ++ * an early fallback with a later one. 
++ */ ++ useFallback = cnv->useFallback; + +- for(i = 0; i < choiceCount && len == 0; ++i) { +- cs = choices[i]; +- switch(cs) { ++ for(i = 0; i < choiceCount && len <= 0; ++i) { ++ uint32_t value; ++ int32_t len2; ++ int8_t cs0 = choices[i]; ++ switch(cs0) { + case ASCII: + if(sourceChar <= 0x7f) { + targetValue = (uint32_t)sourceChar; + len = 1; ++ cs = cs0; ++ g = 0; + } + break; + case ISO8859_1: +- if(0x80 <= sourceChar && sourceChar <= 0xff) { ++ if(GR96_START <= sourceChar && sourceChar <= GR96_END) { + targetValue = (uint32_t)sourceChar - 0x80; + len = 1; ++ cs = cs0; + g = 2; + } + break; + case HWKANA_7BIT: +- if((uint32_t)(0xff9f-sourceChar)<=(0xff9f-0xff61)) { +- targetValue = (uint32_t)(sourceChar - (0xff61 - 0x21)); +- len = 1; +- ++ if((uint32_t)(HWKANA_END-sourceChar)<=(HWKANA_END-HWKANA_START)) { + if(converterData->version==3) { + /* JIS7: use G1 (SO) */ +- pFromU2022State->cs[1] = cs; /* do not output an escape sequence */ ++ /* Shift U+FF61..U+FF9F to bytes 21..5F. */ ++ targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21)); ++ len = 1; ++ pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */ + g = 1; + } else if(converterData->version==4) { + /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ +- int8_t cs0; +- +- targetValue += 0x80; ++ /* Shift U+FF61..U+FF9F to bytes A1..DF. 
*/ ++ targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1)); ++ len = 1; + +- cs0 = pFromU2022State->cs[0]; +- if(IS_JP_DBCS(cs0)) { ++ cs = pFromU2022State->cs[0]; ++ if(IS_JP_DBCS(cs)) { + /* switch from a DBCS charset to JISX201 */ + cs = (int8_t)JISX201; +- } else { +- /* stay in the current G0 charset */ +- cs = cs0; + } ++ /* else stay in the current G0 charset */ ++ g = 0; + } ++ /* else do not use HWKANA_7BIT with other versions */ + } + break; + case JISX201: + /* G0 SBCS */ +- MBCS_SINGLE_FROM_UCHAR32( +- converterData->myConverterArray[cs], +- sourceChar, &targetValue, +- useFallback); +- if(targetValue <= 0x7f) { +- len = 1; ++ len2 = MBCS_SINGLE_FROM_UCHAR32( ++ converterData->myConverterArray[cs0], ++ sourceChar, &value, ++ useFallback); ++ if(len2 != 0 && !(len2 < 0 && len != 0) && value <= 0x7f) { ++ targetValue = value; ++ len = len2; ++ cs = cs0; ++ g = 0; ++ useFallback = FALSE; + } + break; + case ISO8859_7: + /* G0 SBCS forced to 7-bit output */ +- MBCS_SINGLE_FROM_UCHAR32( +- converterData->myConverterArray[cs], +- sourceChar, &targetValue, +- useFallback); +- if(0x80 <= targetValue && targetValue <= 0xff) { +- targetValue -= 0x80; +- len = 1; ++ len2 = MBCS_SINGLE_FROM_UCHAR32( ++ converterData->myConverterArray[cs0], ++ sourceChar, &value, ++ useFallback); ++ if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) { ++ targetValue = value - 0x80; ++ len = len2; ++ cs = cs0; + g = 2; ++ useFallback = FALSE; + } + break; + default: + /* G0 DBCS */ +- MBCS_FROM_UCHAR32_ISO2022( +- converterData->myConverterArray[cs], +- sourceChar, &targetValue, +- useFallback, &len, MBCS_OUTPUT_2); +- if(len != 2) { +- len = 0; ++ len2 = MBCS_FROM_UCHAR32_ISO2022( ++ converterData->myConverterArray[cs0], ++ sourceChar, &value, ++ useFallback, MBCS_OUTPUT_2); ++ if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ ++ if(cs0 == KSC5601) { ++ /* ++ * Check for valid bytes for the encoding scheme. 
++ * This is necessary because the sub-converter (windows-949) ++ * has a broader encoding scheme than is valid for 2022. ++ * ++ * Check that the result is a 2-byte value with each byte in the range A1..FE ++ * (strict EUC-KR DBCS) before accepting it and subtracting 0x80 from each byte ++ * to move it to the ISO 2022 range 21..7E. ++ */ ++ if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && ++ (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) ++ ) { ++ value -= 0x8080; /* shift down to 21..7e byte range */ ++ } else { ++ break; /* not valid for ISO 2022 */ ++ } ++ } ++ targetValue = value; ++ len = len2; ++ cs = cs0; ++ g = 0; ++ useFallback = FALSE; + } + break; + } + } + +- if(len > 0) { ++ if(len != 0) { ++ if(len < 0) { ++ len = -len; /* fallback */ ++ } + outLen = 0; /* count output bytes */ + + /* write SI if necessary (only for JIS7) */ +@@ -1560,7 +1644,7 @@ + * then this is an error + */ + *err = U_INVALID_CHAR_FOUND; +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + +@@ -1586,7 +1670,7 @@ + } + } else { + fromUWriteUInt8( +- args->converter, ++ cnv, + buffer, outLen, + &target, (const char *)targetLimit, + &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), +@@ -1615,7 +1699,7 @@ + */ + if( U_SUCCESS(*err) && + (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) && +- args->flush && source>=sourceLimit && args->converter->fromUChar32==0 ++ args->flush && source>=sourceLimit && cnv->fromUChar32==0 + ) { + int32_t sourceIndex; + +@@ -1654,7 +1738,7 @@ + } + + fromUWriteUInt8( +- args->converter, ++ cnv, + buffer, outLen, + &target, (const char *)targetLimit, + &offsets, sourceIndex, +@@ -1777,7 +1861,7 @@ + !IS_JP_DBCS(cs) + ) { + /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */ +- targetUniChar = mySourceChar + (0xff61 - 0xa1); ++ targetUniChar = mySourceChar + (HWKANA_START - 0xa1); + + /* return from a single-shift state to the previous one */ + if(pToU2022State->g >= 2) { +@@ 
-1818,7 +1902,7 @@ + case HWKANA_7BIT: + if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) { + /* 7-bit halfwidth Katakana */ +- targetUniChar = mySourceChar + (0xff61 - 0x21); ++ targetUniChar = mySourceChar + (HWKANA_START - 0x21); + } + break; + default: +@@ -1965,9 +2049,10 @@ + break; + } + +- /* length= ucnv_MBCSFromUChar32(converterData->currentConverter->sharedData, +- sourceChar,&targetByteUnit,args->converter->useFallback);*/ +- MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,&length,MBCS_OUTPUT_2); ++ length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2); ++ if(length < 0) { ++ length = -length; /* fallback */ ++ } + /* only DBCS or SBCS characters are expected*/ + /* DB characters with high bit set to 1 are expected */ + if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){ +@@ -2449,7 +2534,7 @@ + + static void + UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ +- ++ UConverter *cnv = args->converter; + UConverterDataISO2022 *converterData; + ISO2022State *pFromU2022State; + uint8_t *target = (uint8_t *) args->target; +@@ -2466,14 +2551,13 @@ + UBool useFallback; + + /* set up the state */ +- converterData = (UConverterDataISO2022*)args->converter->extraInfo; ++ converterData = (UConverterDataISO2022*)cnv->extraInfo; + pFromU2022State = &converterData->fromU2022State; +- useFallback = args->converter->useFallback; + + choiceCount = 0; + + /* check if the last codepoint of previous buffer was a lead surrogate*/ +- if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) { ++ if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { + goto getTrail; + } + +@@ -2492,26 +2576,26 @@ + if(UTF_IS_SECOND_SURROGATE(trail)) { + source++; + sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); +- args->converter->fromUChar32=0x00; ++ cnv->fromUChar32=0x00; + /* convert this supplementary 
code point */ + /* exit this condition tree */ + } else { + /* this is an unmatched lead code unit (1st surrogate) */ + /* callback(illegal) */ + *err=U_ILLEGAL_CHAR_FOUND; +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + } else { + /* no more input */ +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + } else { + /* this is an unmatched trail code unit (2nd surrogate) */ + /* callback(illegal) */ + *err=U_ILLEGAL_CHAR_FOUND; +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + } +@@ -2522,7 +2606,7 @@ + if(IS_2022_CONTROL(sourceChar)) { + /* callback(illegal) */ + *err=U_ILLEGAL_CHAR_FOUND; +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + +@@ -2545,7 +2629,6 @@ + } + else{ + /* convert U+0080..U+10ffff */ +- UConverterSharedData *cnv; + int32_t i; + int8_t cs, g; + +@@ -2593,17 +2676,41 @@ + } + + cs = g = 0; ++ /* ++ * len==0: no mapping found yet ++ * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks ++ * len>0: found a roundtrip result, done ++ */ + len = 0; ++ /* ++ * We will turn off useFallback after finding a fallback, ++ * but we still get fallbacks from PUA code points as usual. ++ * Therefore, we will also need to check that we don't overwrite ++ * an early fallback with a later one. 
++ */ ++ useFallback = cnv->useFallback; + +- for(i = 0; i < choiceCount && len == 0; ++i) { +- cs = choices[i]; +- if(cs > 0) { +- if(cs > CNS_11643_0) { +- cnv = converterData->myConverterArray[CNS_11643]; +- MBCS_FROM_UCHAR32_ISO2022(cnv,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_3); +- if(len==3) { +- cs = (int8_t)(CNS_11643_0 + (targetValue >> 16) - 0x80); +- len = 2; ++ for(i = 0; i < choiceCount && len <= 0; ++i) { ++ int8_t cs0 = choices[i]; ++ if(cs0 > 0) { ++ uint32_t value; ++ int32_t len2; ++ if(cs0 > CNS_11643_0) { ++ len2 = MBCS_FROM_UCHAR32_ISO2022( ++ converterData->myConverterArray[CNS_11643], ++ sourceChar, ++ &value, ++ useFallback, ++ MBCS_OUTPUT_3); ++ if(len2 == 3 || (len2 == -3 && len == 0)) { ++ targetValue = value; ++ cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80); ++ if(len2 >= 0) { ++ len = 2; ++ } else { ++ len = -2; ++ useFallback = FALSE; ++ } + if(cs == CNS_11643_1) { + g = 1; + } else if(cs == CNS_11643_2) { +@@ -2617,15 +2724,25 @@ + } + } else { + /* GB2312_1 or ISO-IR-165 */ +- cnv = converterData->myConverterArray[cs]; +- MBCS_FROM_UCHAR32_ISO2022(cnv,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_2); +- g = 1; /* used if len == 2 */ ++ len2 = MBCS_FROM_UCHAR32_ISO2022( ++ converterData->myConverterArray[cs0], ++ sourceChar, ++ &value, ++ useFallback, ++ MBCS_OUTPUT_2); ++ if(len2 == 2 || (len2 == -2 && len == 0)) { ++ targetValue = value; ++ len = len2; ++ cs = cs0; ++ g = 1; ++ useFallback = FALSE; ++ } + } + } + } + +- if(len > 0) { +- len = 0; /* count output bytes; it must have been len == 2 */ ++ if(len != 0) { ++ len = 0; /* count output bytes; it must have been abs(len) == 2 */ + + /* write the designation sequence if necessary */ + if(cs != pFromU2022State->cs[g]) { +@@ -2670,7 +2787,7 @@ + * then this is an error + */ + *err = U_INVALID_CHAR_FOUND; +- args->converter->fromUChar32=sourceChar; ++ cnv->fromUChar32=sourceChar; + break; + } + } +@@ -2691,7 +2808,7 @@ + } + } else { + fromUWriteUInt8( +- 
args->converter, ++ cnv, + buffer, len, + &target, (const char *)targetLimit, + &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), +@@ -2720,7 +2837,7 @@ + */ + if( U_SUCCESS(*err) && + pFromU2022State->g!=0 && +- args->flush && source>=sourceLimit && args->converter->fromUChar32==0 ++ args->flush && source>=sourceLimit && cnv->fromUChar32==0 + ) { + int32_t sourceIndex; + +@@ -2748,7 +2865,7 @@ + } + + fromUWriteUInt8( +- args->converter, ++ cnv, + SHIFT_IN_STR, 1, + &target, (const char *)targetLimit, + &offsets, sourceIndex, +@@ -3146,7 +3263,7 @@ + } + if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) { + /* include half-width Katakana for JP */ +- sa->addRange(sa->set, 0xff61, 0xff9f); ++ sa->addRange(sa->set, HWKANA_START, HWKANA_END); + } + break; + case 'c': +diff -ru icu.orig/source/common/ucnv_ext.c icu/source/common/ucnv_ext.c +--- icu.orig/source/common/ucnv_ext.c 2009-06-02 11:48:38.000000000 +0100 ++++ icu/source/common/ucnv_ext.c 2009-06-02 12:14:20.000000000 +0100 +@@ -551,6 +551,12 @@ + return 0; + } + ++ /* ++ * Tests for (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0: ++ * Do not interpret values with reserved bits used, for forward compatibility, ++ * and do not even remember intermediate results with reserved bits used. 
++ */ ++ + if(UCNV_EXT_TO_U_IS_PARTIAL(value)) { + /* partial match, enter the loop below */ + index=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); +@@ -575,7 +581,8 @@ + value=*fromUSectionValues++; + if( value!=0 && + (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || +- FROM_U_USE_FALLBACK(useFallback, firstCP)) ++ FROM_U_USE_FALLBACK(useFallback, firstCP)) && ++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 + ) { + /* remember longest match so far */ + matchValue=value; +@@ -613,8 +620,9 @@ + /* partial match, continue */ + index=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); + } else { +- if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || +- FROM_U_USE_FALLBACK(useFallback, firstCP) ++ if( (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || ++ FROM_U_USE_FALLBACK(useFallback, firstCP)) && ++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 + ) { + /* full match, stop with result */ + matchValue=value; +@@ -632,8 +640,9 @@ + return 0; + } + } else /* result from firstCP trie lookup */ { +- if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || +- FROM_U_USE_FALLBACK(useFallback, firstCP) ++ if( (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || ++ FROM_U_USE_FALLBACK(useFallback, firstCP)) && ++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 + ) { + /* full match, stop with result */ + matchValue=value; +@@ -644,20 +653,18 @@ + } + } + +- if(matchValue&UCNV_EXT_FROM_U_RESERVED_MASK) { +- /* do not interpret values with reserved bits used, for forward compatibility */ +- return 0; +- } +- + /* return result */ + if(matchValue==UCNV_EXT_FROM_U_SUBCHAR1) { + return 1; /* assert matchLength==2 */ + } + +- *pMatchValue=UCNV_EXT_FROM_U_MASK_ROUNDTRIP(matchValue); ++ *pMatchValue=matchValue; + return matchLength; + } + ++/* ++ * @param value fromUnicode mapping table value; ignores roundtrip and reserved bits ++ */ + static U_INLINE void + ucnv_extWriteFromU(UConverter *cnv, const int32_t *cx, + uint32_t value, +@@ -792,6 +799,10 @@ + } + } + ++/* ++ * Used by ISO 2022 implementation. 
++ * @return number of bytes in *pValue; negative number if fallback; 0 for no mapping ++ */ + U_CFUNC int32_t + ucnv_extSimpleMatchFromU(const int32_t *cx, + UChar32 cp, uint32_t *pValue, +@@ -809,13 +820,15 @@ + if(match>=2) { + /* write result for simple, single-character conversion */ + int32_t length; +- ++ int isRoundtrip; ++ ++ isRoundtrip=UCNV_EXT_FROM_U_IS_ROUNDTRIP(value); + length=UCNV_EXT_FROM_U_GET_LENGTH(value); + value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value); + + if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) { + *pValue=value; +- return length; ++ return isRoundtrip ? length : -length; + #if 0 /* not currently used */ + } else if(length==4) { + /* de-serialize a 4-byte result */ +@@ -825,7 +838,7 @@ + ((uint32_t)result[1]<<16)| + ((uint32_t)result[2]<<8)| + result[3]; +- return 4; ++ return isRoundtrip ? 4 : -4; + #endif + } + } +diff -ru icu.orig/source/common/ucnv_ext.h icu/source/common/ucnv_ext.h +--- icu.orig/source/common/ucnv_ext.h 2009-06-02 11:48:38.000000000 +0100 ++++ icu/source/common/ucnv_ext.h 2009-06-02 12:14:20.000000000 +0100 +@@ -452,7 +452,7 @@ + #define UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) (((value)&UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)!=0) + #define UCNV_EXT_FROM_U_MASK_ROUNDTRIP(value) ((value)&~UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) + +-/* use after masking off the roundtrip flag */ ++/* get length; masks away all other bits */ + #define UCNV_EXT_FROM_U_GET_LENGTH(value) (int32_t)(((value)>>UCNV_EXT_FROM_U_LENGTH_SHIFT)&UCNV_EXT_MAX_BYTES) + + /* get bytes or bytes index */ +diff -ru icu.orig/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c +--- icu.orig/source/common/ucnvmbcs.c 2009-06-02 11:48:38.000000000 +0100 ++++ icu/source/common/ucnvmbcs.c 2009-06-02 12:14:20.000000000 +0100 +@@ -3785,7 +3785,8 @@ + + cx=sharedData->mbcs.extIndexes; + if(cx!=NULL) { +- return ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback); ++ length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback); ++ return length>=0 ? 
length : -length; /* return abs(length); */ + } + + /* unassigned */ +diff -ru icu.orig/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt +--- icu.orig/source/test/testdata/conversion.txt 2009-06-02 11:48:26.000000000 +0100 ++++ icu/source/test/testdata/conversion.txt 2009-06-02 12:14:20.000000000 +0100 +@@ -495,6 +495,46 @@ + } + { "UTF-16BE", :bin{ 00 }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ 00 } } + { "UTF-16BE", :bin{ d800dc }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ d800dc } } ++ // Verify that mappings that would result in byte values outside 20..7F (for SBCS) ++ // or 21..7E (for DBCS) are not used. ++ // ibm-9005_X110-2007.ucm (ISO 8859-7, <ESC>.F=1b2e46): ++ // <U009F> \x9F |0 (also in ISO 8859-1) ++ // <U0387> \xB7 |1 ++ // windows-949-2000 (KSC_5601, <ESC>$(C=1b242843): ++ // <UC829> \xA0\xA1 |0 ++ // <UD4FE> \xC0\x41 |0 ++ // <UD79D> \xC8\xFE |0 ++ { ++ "JIS8", // =ISO_2022,locale=ja,version=4 ++ "\u009f\u0387\uc829\ud4fe\ud79d", ++ :bin{ 1a1b2e461b4e371a1a1b242843487e1b2842 }, ++ :intvector{ 0,1,1,1,1,1,1,2,3,4,4,4,4,4,4,4,4,4 }, ++ :int{1}, :int{1}, "", "?", "" ++ } ++ // Ticket 5483: ISO 2022 converter incorrectly using fallback mapping ++ // Verify that a roundtrip mapping is used even when a fallback mapping is ++ // available in the current state. 
++ // U+FF61 is handled in code ++ // jisx-208.ucm (<ESC>$B=1b2442): ++ // <U30FE> \x21\x34 |0 ++ // <UFF5D> \x21\x51 |0 and ++ // ibm-897_P100-1995.ucm (JIS X 0201, <ESC>(J=1b284a): ++ // <UFF5D> \x7D |1 ++ // ibm-9005_X110-2007.ucm (ISO 8859-7, <ESC>.F=1b2e46): ++ // <U03D5> \xF6 |1 ++ // <U2015> \xAF |0 ++ // <UFF5D> \x7D |1 (not legal for ISO 2022) ++ // windows-949-2000 (KSC_5601, <ESC>$(C=1b242843): ++ // <UAC00> \xB0\xA1 |0 ++ // <UFF5D> \xA3\xFD |0 ++ // <U223C> \xA1\xAD |0 (in extension table) ++ { ++ "JIS8", // =ISO_2022,locale=ja,version=4 ++ "a\uff61\u03d5\uff5d\uac00\u223c\uff5d\u30fe\uff5d", // Make it switch to ISO-8859-7, KSC 5601 and JIS X 0208. ++ :bin{ 61a11b2e461b4e761b244221511b2428433021212d237d1b2442213421511b2842 }, ++ :intvector{ 0,1,2,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,4,5,5,6,6,7,7,7,7,7,8,8,8,8,8 }, ++ :int{1}, :int{1}, "", "?", "" ++ } + + // e4b8 is a partial sequence + { "UTF-8", :bin{ 31e4ba8ce4b8 }, "1\u4e8c", :intvector{ 0, 1 }, :int{1}, :int{0}, "truncated", ".", :bin{ e4b8 } } diff --git a/icu.icu5488.assamese.patch b/icu.icu5488.assamese.patch new file mode 100644 index 0000000..8b5d773 --- /dev/null +++ b/icu.icu5488.assamese.patch @@ -0,0 +1,11 @@ +--- icu.orig/source/layout/IndicClassTables.cpp 2006-10-18 09:05:20.000000000 +0100 ++++ icu/source/layout/IndicClassTables.cpp 2006-11-01 09:26:58.000000000 +0000 +@@ -94,7 +94,7 @@ + _dr, _db, _db, _db, _db, _xx, _xx, _l1, _dl, _xx, _xx, _s1, _s2, _vr, _xx, _xx, // 09C0 - 09CF + _xx, _xx, _xx, _xx, _xx, _xx, _xx, _m2, _xx, _xx, _xx, _xx, _cn, _cn, _xx, _cn, // 09D0 - 09DF + _iv, _iv, _dv, _dv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 09E0 - 09EF +- _ct, _ct, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx // 09F0 - 09FA ++ _rv, _ct, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx /* 09F0 - 09FA */ + }; + + static const IndicClassTable::CharClass punjCharClasses[] = diff --git a/icu.icu5500.devicetablecrash.patch b/icu.icu5500.devicetablecrash.patch new file mode 100644 
index 0000000..16ea5b7 --- /dev/null +++ b/icu.icu5500.devicetablecrash.patch @@ -0,0 +1,11 @@ +--- icu.orig/source/layout/DeviceTables.cpp 2006-10-18 09:05:20.000000000 +0100 ++++ icu/source/layout/DeviceTables.cpp 2006-11-08 09:08:09.000000000 +0000 +@@ -22,7 +22,7 @@ + le_uint16 format = SWAPW(deltaFormat) - 1; + le_int16 result = 0; + +- if (ppem >= start && ppem <= SWAPW(endSize)) { ++ if (ppem >= start && ppem <= SWAPW(endSize) && format < sizeof(fieldBits)/sizeof(fieldBits[0])) { + le_uint16 sizeIndex = ppem - start; + le_uint16 bits = fieldBits[format]; + le_uint16 count = 16 / bits; diff --git a/icu.icu5501.sinhala.biggerexpand.patch b/icu.icu5501.sinhala.biggerexpand.patch new file mode 100644 index 0000000..6013780 --- /dev/null +++ b/icu.icu5501.sinhala.biggerexpand.patch @@ -0,0 +1,11 @@ +--- icu.orig/source/layout/IndicClassTables.cpp 2006-10-18 09:05:20.000000000 +0100 ++++ icu/source/layout/IndicClassTables.cpp 2006-11-08 11:20:55.000000000 +0000 +@@ -284,7 +284,7 @@ + + static const IndicClassTable mlymClassTable = {0x0D00, 0x0D6F, 3, MLYM_SCRIPT_FLAGS, mlymCharClasses, mlymSplitTable}; + +-static const IndicClassTable sinhClassTable = {0x0D80, 0x0DF4, 3, SINH_SCRIPT_FLAGS, sinhCharClasses, sinhSplitTable}; ++static const IndicClassTable sinhClassTable = {0x0D80, 0x0DF4, 4, SINH_SCRIPT_FLAGS, sinhCharClasses, sinhSplitTable}; + + // + // IndicClassTable addresses diff --git a/icu.icu5506.multiplevowels.patch b/icu.icu5506.multiplevowels.patch new file mode 100644 index 0000000..a58ec64 --- /dev/null +++ b/icu.icu5506.multiplevowels.patch @@ -0,0 +1,61 @@ +diff -ur icu.orig/source/layout/IndicReordering.cpp icu/source/layout/IndicReordering.cpp +--- icu.orig/source/layout/IndicReordering.cpp 2006-11-10 09:42:44.000000000 +0000 ++++ icu/source/layout/IndicReordering.cpp 2006-11-10 09:47:05.000000000 +0000 +@@ -395,7 +395,7 @@ + {-1, 6, 1, -1, -1, -1, -1, -1, -1, 5, 9, 5, 5, 4, 12}, // 2 - consonant with nukta + {-1, 6, 1, -1, -1, -1, -1, -1, 2, 5, 9, 
5, 5, 4, 12}, // 3 - consonant + {-1, -1, -1, -1, -1, -1, 3, 2, -1, -1, -1, -1, -1, -1, 7}, // 4 - consonant virama +- {-1, 6, 1, -1, -1, -1, -1, -1, -1, 5, -1, -1, -1, -1, -1}, // 5 - dependent vowels ++ {-1, 6, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, // 5 - dependent vowels + {-1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, // 6 - vowel mark + {-1, -1, -1, -1, -1, -1, 3, 2, -1, -1, -1, -1, -1, -1, -1}, // 7 - consonant virama ZWJ, consonant ZWJ virama + {-1, 6, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4, -1}, // 8 - independent vowels that can take a virama +@@ -423,6 +423,48 @@ + + state = stateTable[state][charClass & CF_CLASS_MASK]; + ++ /*for the components of split matra*/ ++ if ((charCount >= cursor + 3) && ++ (chars[cursor] == 0x0DD9 && chars[cursor + 1] == 0x0DCF && chars[cursor + 2] == 0x0DCA)) { /*for 3 split matra of Sinhala*/ ++ return cursor + 3; ++ } ++ else if ((charCount >= cursor + 3) && ++ (chars[cursor] == 0x0CC6 && chars[cursor + 1] == 0x0CC2 && chars[cursor + 2] == 0x0CD5)) { /*for 3 split matra of Kannada*/ ++ return cursor + 3; ++ } ++ /*for 2 split matra*/ ++ else if (charCount >= cursor + 2) { ++ /*for Bengali*/ ++ if ((chars[cursor] == 0x09C7 && chars[cursor + 1] == 0x09BE) || ++ (chars[cursor] == 0x09C7 && chars[cursor + 1] == 0x09D7) || ++ /*for Oriya*/ ++ (chars[cursor] == 0x0B47 && chars[cursor + 1] == 0x0B3E) || ++ (chars[cursor] == 0x0B47 && chars[cursor + 1] == 0x0B56) || ++ (chars[cursor] == 0x0B47 && chars[cursor + 1] == 0x0B57) || ++ /*for Tamil*/ ++ (chars[cursor] == 0x0BC6 && chars[cursor + 1] == 0x0BBE) || ++ (chars[cursor] == 0x0BC6 && chars[cursor + 1] == 0x0BD7) || ++ (chars[cursor] == 0x0BC7 && chars[cursor + 1] == 0x0BBE) || ++ /*for Malayalam*/ ++ (chars[cursor] == 0x0D46 && chars[cursor + 1] == 0x0D3E) || ++ (chars[cursor] == 0x0D46 && chars[cursor + 1] == 0x0D57) || ++ (chars[cursor] == 0x0D47 && chars[cursor + 1] == 0x0D3E) || ++ /*for Sinhala*/ ++ (chars[cursor] == 0x0DD9 && chars[cursor + 
1] == 0x0DCA) || ++ (chars[cursor] == 0x0DD9 && chars[cursor + 1] == 0x0DCF) || ++ (chars[cursor] == 0x0DD9 && chars[cursor + 1] == 0x0DDF) || ++ (chars[cursor] == 0x0DDC && chars[cursor + 1] == 0x0DCA) || ++ /*for Telugu*/ ++ (chars[cursor] == 0x0C46 && chars[cursor + 1] == 0x0C56) || ++ /*for Kannada*/ ++ (chars[cursor] == 0x0CBF && chars[cursor + 1] == 0x0CD5) || ++ (chars[cursor] == 0x0CC6 && chars[cursor + 1] == 0x0CD5) || ++ (chars[cursor] == 0x0CC6 && chars[cursor + 1] == 0x0CD6) || ++ (chars[cursor] == 0x0CC6 && chars[cursor + 1] == 0x0CC2) || ++ (chars[cursor] == 0x0CCA && chars[cursor + 1] == 0x0CD5)) ++ return cursor + 2; ++ } ++ + if (state < 0) { + break; + } diff --git a/icu.icu5557.safety.patch b/icu.icu5557.safety.patch new file mode 100644 index 0000000..682caa1 --- /dev/null +++ b/icu.icu5557.safety.patch @@ -0,0 +1,14 @@ +--- icu.orig/source/layout/CoverageTables.cpp 2007-01-09 12:57:41.000000000 +0000 ++++ icu/source/layout/CoverageTables.cpp 2007-01-09 12:59:09.000000000 +0000 +@@ -44,6 +44,11 @@ + le_uint16 count = SWAPW(glyphCount); + le_uint8 bit = OpenTypeUtilities::highBit(count); + le_uint16 power = 1 << bit; ++ ++ if (count == 0) { ++ return -1; ++ } ++ + le_uint16 extra = count - power; + le_uint16 probe = power; + le_uint16 index = 0; diff --git a/icu.icu5594.gujarati.patch b/icu.icu5594.gujarati.patch new file mode 100644 index 0000000..b21418d --- /dev/null +++ b/icu.icu5594.gujarati.patch @@ -0,0 +1,14 @@ +--- icu.orig/source/layout/IndicClassTables.cpp 2007-02-09 14:26:04.000000000 +0000 ++++ icu/source/layout/IndicClassTables.cpp 2007-02-13 15:41:52.000000000 +0000 +@@ -117,7 +117,11 @@ + _rv, _xx, _ct, _ct, _xx, _ct, _ct, _ct, _ct, _ct, _xx, _xx, _nu, _xx, _dr, _dl, // 0AB0 - 0ABF + _dr, _db, _db, _db, _db, _da, _xx, _da, _da, _dr, _xx, _dr, _dr, _vr, _xx, _xx, // 0AC0 - 0ACF + _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 0AD0 - 0ADF ++#if 1 ++ _iv, _xx, _db, _db, _xx, _xx, _xx, _xx, _xx, 
_xx, _xx, _xx, _xx, _xx, _xx, _xx // 0AE0 - 0AEF ++#else + _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx // 0AE0 - 0AEF ++#endif + }; + + #if 1 diff --git a/icu.icu5691.backport.patch b/icu.icu5691.backport.patch new file mode 100644 index 0000000..906ecd3 --- /dev/null +++ b/icu.icu5691.backport.patch @@ -0,0 +1,730 @@ +diff -ru icu.6175/source/common/ucnv2022.c icu/source/common/ucnv2022.c +--- icu.6175/source/common/ucnv2022.c 2009-06-02 15:47:31.000000000 +0100 ++++ icu/source/common/ucnv2022.c 2009-06-02 16:03:15.000000000 +0100 +@@ -754,6 +754,7 @@ + UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo); + uint32_t key = myData2022->key; + int32_t offset = 0; ++ int8_t initialToULength = _this->toULength; + char c; + + value = VALID_NON_TERMINAL_2022; +@@ -806,7 +807,6 @@ + return; + } else if (value == INVALID_2022 ) { + *err = U_ILLEGAL_ESCAPE_SEQUENCE; +- return; + } else /* value == VALID_TERMINAL_2022 */ { + switch(var){ + #ifdef U_ENABLE_GENERIC_ISO_2022 +@@ -938,6 +938,35 @@ + } + if(U_SUCCESS(*err)) { + _this->toULength = 0; ++ } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { ++ if(_this->toULength>1) { ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte (ESC) in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ * In escape sequences, all following bytes are "printable", that is, ++ * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS), ++ * they are valid single/lead bytes. ++ * For simplicity, we always only report the initial ESC byte as the ++ * illegal sequence and back out all other bytes we looked at. ++ */ ++ /* Back out some bytes. 
*/ ++ int8_t backOutDistance=_this->toULength-1; ++ int8_t bytesFromThisBuffer=_this->toULength-initialToULength; ++ if(backOutDistance<=bytesFromThisBuffer) { ++ /* same as initialToULength<=1 */ ++ *source-=backOutDistance; ++ } else { ++ /* Back out bytes from the previous buffer: Need to replay them. */ ++ _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); ++ /* same as -(initialToULength-1) */ ++ /* preToULength is negative! */ ++ uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength); ++ *source-=bytesFromThisBuffer; ++ } ++ _this->toULength=1; ++ } + } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) { + _this->toUCallbackReason = UCNV_UNASSIGNED; + } +@@ -1973,6 +2002,7 @@ + mySourceChar = args->converter->toUBytes[0]; + args->converter->toULength = 0; + cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; ++ targetUniChar = missingCharMarker; + goto getTrailByte; + } + +@@ -2102,17 +2132,44 @@ + default: + /* G0 DBCS */ + if(mySource < mySourceLimit) { +- char trailByte; ++ int leadIsOk, trailIsOk; ++ uint8_t trailByte; + getTrailByte: +- trailByte = *mySource++; +- if(cs == JISX208) { +- _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf); +- } else { +- tempBuf[0] = (char)mySourceChar; +- tempBuf[1] = trailByte; ++ trailByte = (uint8_t)*mySource; ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ * ++ * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is ++ * an ESC/SO/SI, we report only the first byte as the illegal sequence. ++ * Otherwise we convert or report the pair of bytes. 
++ */ ++ leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); ++ trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); ++ if (leadIsOk && trailIsOk) { ++ ++mySource; ++ uint32_t tmpSourceChar = (mySourceChar << 8) | trailByte; ++ if(cs == JISX208) { ++ _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf); ++ mySourceChar = tmpSourceChar; ++ } else { ++ /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */ ++ mySourceChar = tmpSourceChar; ++ if (cs == KSC5601) { ++ tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */ ++ } ++ tempBuf[0] = (char)(tmpSourceChar >> 8); ++ tempBuf[1] = (char)(tmpSourceChar); ++ } ++ targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); ++ } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { ++ /* report a pair of illegal bytes if the second byte is not a DBCS starter */ ++ ++mySource; ++ /* add another bit so that the code below writes 2 bytes in case of error */ ++ mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; + } +- mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); +- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); + } else { + args->converter->toUBytes[0] = (uint8_t)mySourceChar; + args->converter->toULength = 1; +@@ -2254,7 +2311,12 @@ + } + /* only DBCS or SBCS characters are expected*/ + /* DB characters with high bit set to 1 are expected */ +- if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){ ++ if( length > 2 || length==0 || ++ (length == 1 && targetByteUnit > 0x7f) || ++ (length == 2 && ++ ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) || ++ (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1))) ++ ) { + targetByteUnit=missingCharMarker; + } + if (targetByteUnit != missingCharMarker){ +@@ -2583,17 +2645,34 @@ + myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */ + 
if(myData->toU2022State.g == 1) { + if(mySource < mySourceLimit) { +- char trailByte; ++ int leadIsOk, trailIsOk; ++ uint8_t trailByte; + getTrailByte: +- trailByte = *mySource++; +- tempBuf[0] = (char)(mySourceChar + 0x80); +- tempBuf[1] = (char)(trailByte + 0x80); +- mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); +- if((mySourceChar & 0x8080) == 0) { ++ targetUniChar = missingCharMarker; ++ trailByte = (uint8_t)*mySource; ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ * ++ * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is ++ * an ESC/SO/SI, we report only the first byte as the illegal sequence. ++ * Otherwise we convert or report the pair of bytes. ++ */ ++ leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); ++ trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); ++ if (leadIsOk && trailIsOk) { ++ ++mySource; ++ tempBuf[0] = (char)(mySourceChar + 0x80); ++ tempBuf[1] = (char)(trailByte + 0x80); + targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); +- } else { +- /* illegal bytes > 0x7f */ +- targetUniChar = missingCharMarker; ++ mySourceChar = (mySourceChar << 8) | trailByte; ++ } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { ++ /* report a pair of illegal bytes if the second byte is not a DBCS starter */ ++ ++mySource; ++ /* add another bit so that the code below writes 2 bytes in case of error */ ++ mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; + } + } else { + args->converter->toUBytes[0] = (uint8_t)mySourceChar; +@@ -2601,8 +2680,10 @@ + break; + } + } +- else{ ++ else if(mySourceChar <= 0x7f) { + targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); ++ } else { ++ targetUniChar = 0xffff; + } + if(targetUniChar < 0xfffe){ 
+ if(args->offsets) { +@@ -3099,6 +3180,7 @@ + /* continue with a partial double-byte character */ + mySourceChar = args->converter->toUBytes[0]; + args->converter->toULength = 0; ++ targetUniChar = missingCharMarker; + goto getTrailByte; + } + +@@ -3178,29 +3260,50 @@ + UConverterSharedData *cnv; + StateEnum tempState; + int32_t tempBufLen; +- char trailByte; ++ int leadIsOk, trailIsOk; ++ uint8_t trailByte; + getTrailByte: +- trailByte = *mySource++; +- tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; +- if(tempState > CNS_11643_0) { +- cnv = myData->myConverterArray[CNS_11643]; +- tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); +- tempBuf[1] = (char) (mySourceChar); +- tempBuf[2] = trailByte; +- tempBufLen = 3; +- +- }else{ +- cnv = myData->myConverterArray[tempState]; +- tempBuf[0] = (char) (mySourceChar); +- tempBuf[1] = trailByte; +- tempBufLen = 2; ++ trailByte = (uint8_t)*mySource; ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ * ++ * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is ++ * an ESC/SO/SI, we report only the first byte as the illegal sequence. ++ * Otherwise we convert or report the pair of bytes. 
++ */ ++ leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); ++ trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); ++ if (leadIsOk && trailIsOk) { ++ ++mySource; ++ tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; ++ if(tempState >= CNS_11643_0) { ++ cnv = myData->myConverterArray[CNS_11643]; ++ tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); ++ tempBuf[1] = (char) (mySourceChar); ++ tempBuf[2] = (char) trailByte; ++ tempBufLen = 3; ++ ++ }else{ ++ cnv = myData->myConverterArray[tempState]; ++ tempBuf[0] = (char) (mySourceChar); ++ tempBuf[1] = (char) trailByte; ++ tempBufLen = 2; ++ } ++ targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); ++ mySourceChar = (mySourceChar << 8) | trailByte; ++ } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { ++ /* report a pair of illegal bytes if the second byte is not a DBCS starter */ ++ ++mySource; ++ /* add another bit so that the code below writes 2 bytes in case of error */ ++ mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; + } +- mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); + if(pToU2022State->g>=2) { + /* return from a single-shift state to the previous one */ + pToU2022State->g=pToU2022State->prevG; + } +- targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); + } else { + args->converter->toUBytes[0] = (uint8_t)mySourceChar; + args->converter->toULength = 1; +diff -ru icu.6175/source/common/ucnvhz.c icu/source/common/ucnvhz.c +--- icu.6175/source/common/ucnvhz.c 2009-06-02 15:47:31.000000000 +0100 ++++ icu/source/common/ucnvhz.c 2009-06-02 15:57:18.000000000 +0100 +@@ -196,10 +196,30 @@ + /* if the first byte is equal to TILDE and the trail byte + * is not a valid byte then it is an error condition + */ +- mySourceChar = 0x7e00 | mySourceChar; +- targetUniChar = 0xffff; ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte in the illegal sequence. 
++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ */ + myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */ +- break; ++ *err = U_ILLEGAL_ESCAPE_SEQUENCE; ++ args->converter->toUBytes[0] = UCNV_TILDE; ++ if( myData->isStateDBCS ? ++ (0x21 <= mySourceChar && mySourceChar <= 0x7e) : ++ mySourceChar <= 0x7f ++ ) { ++ /* The current byte could be the start of a character: Back it out. */ ++ args->converter->toULength = 1; ++ --mySource; ++ } else { ++ /* Include the current byte in the illegal sequence. */ ++ args->converter->toUBytes[1] = mySourceChar; ++ args->converter->toULength = 2; ++ } ++ args->target = myTarget; ++ args->source = mySource; ++ return; + } + } else if(myData->isStateDBCS) { + if(args->converter->toUnicodeStatus == 0x00){ +@@ -215,19 +235,36 @@ + } + else{ + /* trail byte */ ++ int leadIsOk, trailIsOk; + uint32_t leadByte = args->converter->toUnicodeStatus & 0xff; +- if( (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21) && +- (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21) +- ) { ++ targetUniChar = 0xffff; ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ * ++ * In HZ DBCS, if the second byte is in the 21..7e range, ++ * we report only the first byte as the illegal sequence. ++ * Otherwise we convert or report the pair of bytes. 
++ */ ++ leadIsOk = (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21); ++ trailIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); ++ if (leadIsOk && trailIsOk) { + tempBuf[0] = (char) (leadByte+0x80) ; + tempBuf[1] = (char) (mySourceChar+0x80); + targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData, + tempBuf, 2, args->converter->useFallback); ++ mySourceChar= (leadByte << 8) | mySourceChar; ++ } else if (trailIsOk) { ++ /* report a single illegal byte and continue with the following DBCS starter byte */ ++ --mySource; ++ mySourceChar = (int32_t)leadByte; + } else { +- targetUniChar = 0xffff; ++ /* report a pair of illegal bytes if the second byte is not a DBCS starter */ ++ /* add another bit so that the code below writes 2 bytes in case of error */ ++ mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar; + } +- /* add another bit so that the code below writes 2 bytes in case of error */ +- mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar; + args->converter->toUnicodeStatus =0x00; + } + } +diff -ru icu.6175/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c +--- icu.6175/source/common/ucnvmbcs.c 2009-06-02 15:47:31.000000000 +0100 ++++ icu/source/common/ucnvmbcs.c 2009-06-02 15:56:07.000000000 +0100 +@@ -1697,6 +1697,65 @@ + pArgs->offsets=offsets; + } + ++static UBool ++hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) { ++ const int32_t *row=stateTable[state]; ++ int32_t b, entry; ++ /* First test for final entries in this state for some commonly valid byte values. */ ++ entry=row[0xa1]; ++ if( !MBCS_ENTRY_IS_TRANSITION(entry) && ++ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL ++ ) { ++ return TRUE; ++ } ++ entry=row[0x41]; ++ if( !MBCS_ENTRY_IS_TRANSITION(entry) && ++ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL ++ ) { ++ return TRUE; ++ } ++ /* Then test for final entries in this state. 
*/ ++ for(b=0; b<=0xff; ++b) { ++ entry=row[b]; ++ if( !MBCS_ENTRY_IS_TRANSITION(entry) && ++ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL ++ ) { ++ return TRUE; ++ } ++ } ++ /* Then recurse for transition entries. */ ++ for(b=0; b<=0xff; ++b) { ++ entry=row[b]; ++ if( MBCS_ENTRY_IS_TRANSITION(entry) && ++ hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry)) ++ ) { ++ return TRUE; ++ } ++ } ++ return FALSE; ++} ++ ++/* ++ * Is byte b a single/lead byte in this state? ++ * Recurse for transition states, because here we don't want to say that ++ * b is a lead byte if all byte sequences that start with b are illegal. ++ */ ++static UBool ++isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) { ++ const int32_t *row=stateTable[state]; ++ int32_t entry=row[b]; ++ if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */ ++ return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry)); ++ } else { ++ uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); ++ if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) { ++ return FALSE; /* SI/SO are illegal for DBCS-only conversion */ ++ } else { ++ return action!=MBCS_STATE_ILLEGAL; ++ } ++ } ++} ++ + U_CFUNC void + ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, + UErrorCode *pErrorCode) { +@@ -2052,6 +2111,34 @@ + sourceIndex=nextSourceIndex; + } else if(U_FAILURE(*pErrorCode)) { + /* callback(illegal) */ ++ if(byteIndex>1) { ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ */ ++ UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0); ++ int8_t i; ++ for(i=1; ++ i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]); ++ ++i) {} ++ if(i<byteIndex) { ++ /* Back out some bytes. 
*/ ++ int8_t backOutDistance=byteIndex-i; ++ int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source); ++ byteIndex=i; /* length of reported illegal byte sequence */ ++ if(backOutDistance<=bytesFromThisBuffer) { ++ source-=backOutDistance; ++ } else { ++ /* Back out bytes from the previous buffer: Need to replay them. */ ++ cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); ++ /* preToULength is negative! */ ++ uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength); ++ source=(const uint8_t *)pArgs->source; ++ } ++ } ++ } + break; + } else /* unassigned sequences indicated with byteIndex>0 */ { + /* try an extension mapping */ +@@ -2062,7 +2149,7 @@ + &offsets, sourceIndex, + pArgs->flush, + pErrorCode); +- sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs->source); ++ sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source); + + if(U_FAILURE(*pErrorCode)) { + /* not mappable or buffer overflow */ +@@ -2353,15 +2440,37 @@ + + if(c<0) { + if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) { +- *pErrorCode=U_TRUNCATED_CHAR_FOUND; +- } +- if(U_FAILURE(*pErrorCode)) { + /* incomplete character byte sequence */ + uint8_t *bytes=cnv->toUBytes; + cnv->toULength=(int8_t)(source-lastSource); + do { + *bytes++=*lastSource++; + } while(lastSource<source); ++ *pErrorCode=U_TRUNCATED_CHAR_FOUND; ++ } else if(U_FAILURE(*pErrorCode)) { ++ /* callback(illegal) */ ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. 
++ */ ++ UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0); ++ uint8_t *bytes=cnv->toUBytes; ++ *bytes++=*lastSource++; /* first byte */ ++ if(lastSource==source) { ++ cnv->toULength=1; ++ } else /* lastSource<source: multi-byte character */ { ++ int8_t i; ++ for(i=1; ++ lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource); ++ ++i ++ ) { ++ *bytes++=*lastSource++; ++ } ++ cnv->toULength=i; ++ source=lastSource; ++ } + } else { + /* no output because of empty input or only state changes */ + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; +diff -ru icu.6175/source/test/cintltst/nccbtst.c icu/source/test/cintltst/nccbtst.c +--- icu.6175/source/test/cintltst/nccbtst.c 2009-06-02 15:47:18.000000000 +0100 ++++ icu/source/test/cintltst/nccbtst.c 2009-06-02 15:47:38.000000000 +0100 +@@ -2497,13 +2497,13 @@ + + + static const uint8_t text943[] = { +- 0x82, 0xa9, 0x82, 0x20, /*0xc8,*/ 0x61, 0x8a, 0xbf, 0x8e, 0x9a }; +- static const UChar toUnicode943sub[] = { 0x304b, 0xfffd, /*0xff88,*/ 0x0061, 0x6f22, 0x5b57}; +- static const UChar toUnicode943skip[]= { 0x304b, /*0xff88,*/ 0x0061, 0x6f22, 0x5b57}; ++ 0x82, 0xa9, 0x82, 0x20, 0x61, 0x8a, 0xbf, 0x8e, 0x9a }; ++ static const UChar toUnicode943sub[] = { 0x304b, 0x1a, 0x20, 0x0061, 0x6f22, 0x5b57 }; ++ static const UChar toUnicode943skip[]= { 0x304b, 0x20, 0x0061, 0x6f22, 0x5b57 }; + static const UChar toUnicode943stop[]= { 0x304b}; + +- static const int32_t fromIBM943Offssub[] = {0, 2, 4, 5, 7}; +- static const int32_t fromIBM943Offsskip[] = { 0, 4, 5, 7}; ++ static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 7 }; ++ static const int32_t fromIBM943Offsskip[] = { 0, 3, 4, 5, 7 }; + static const int32_t fromIBM943Offsstop[] = { 0}; + + gInBufferSize = inputsize; +@@ -2537,9 +2537,9 @@ + { + static const uint8_t sampleText[] = { + 0x82, 0xa9, 0x61, 0x62, 0x63 , 0x82, +- 0xff, /*0x82, 0xa9,*/ 0x32, 0x33}; +- static const UChar toUnicode943sub[] = {0x304b, 0x0061, 0x0062, 0x0063, 
0xfffd,/*0x304b,*/ 0x0032, 0x0033}; +- static const int32_t fromIBM943Offssub[] = {0, 2, 3, 4, 5, 7, 8}; ++ 0xff, 0x32, 0x33}; ++ static const UChar toUnicode943sub[] = { 0x304b, 0x0061, 0x0062, 0x0063, 0x1a, 0x1a, 0x0032, 0x0033 }; ++ static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 6, 7, 8 }; + /*checking illegal value for ibm-943 with substitute*/ + gInBufferSize = inputsize; + gOutBufferSize = outputsize; +diff -ru icu.6175/source/test/cintltst/nucnvtst.c icu/source/test/cintltst/nucnvtst.c +--- icu.6175/source/test/cintltst/nucnvtst.c 2009-06-02 15:47:18.000000000 +0100 ++++ icu/source/test/cintltst/nucnvtst.c 2009-06-02 15:47:38.000000000 +0100 +@@ -2606,7 +2606,7 @@ + TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source"); + /*Test for the condition where there is an invalid character*/ + { +- static const uint8_t source2[]={0xa1, 0x01}; ++ static const uint8_t source2[]={0xa1, 0x80}; + TestNextUCharError(cnv, (const char*)source2, (const char*)source2+sizeof(source2), U_ZERO_ERROR, "an invalid character"); + } + /*Test for the condition where we have a truncated char*/ +@@ -3899,11 +3899,11 @@ + TestISO_2022_KR() { + /* test input */ + static const uint16_t in[]={ +- 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F66,0x9F67,0x9F6A,0x000A,0x000D +- ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC02,0xAC04 ++ 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F67,0x9F6A,0x000A,0x000D ++ ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC04 + ,0xAC07,0xAC08,0xAC09,0x0025,0x0026,0x0027,0x000A,0x000D,0x0028,0x0029 + ,0x002A,0x002B,0x002C,0x002D,0x002E,0x53C3,0x53C8,0x53C9,0x53CA,0x53CB +- ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53DF,0x53E1,0x53E2 ++ ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53E1,0x53E2 + ,0x53E3,0x53E4,0x000A,0x000D}; + const UChar* uSource; + const UChar* uSourceLimit; +diff -ru icu.6175/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt +--- 
icu.6175/source/test/testdata/conversion.txt 2009-06-02 15:47:18.000000000 +0100 ++++ icu/source/test/testdata/conversion.txt 2009-06-02 15:57:41.000000000 +0100 +@@ -48,12 +48,144 @@ + toUnicode { + Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" } + Cases { ++ // Test ticket 5691: consistent illegal sequences ++ // The following test cases are for illegal character byte sequences. ++ // ++ // Unfortunately, we cannot use the Shift-JIS examples from the ticket ++ // comments because our Shift-JIS table is Windows-compatible and ++ // therefore has no illegal single bytes. Same for GBK. ++ // Instead, we use the stricter GB 18030 also for 2-byte examples. ++ // The byte sequences are generally slightly different from the ticket ++ // comment, simply using assigned characters rather than just ++ // theoretically valid sequences. ++ { ++ "gb18030", ++ :bin{ 618140813c81ff7a }, ++ "a\u4e02\\x81<\\x81\\xFFz", ++ :intvector{ 0,1,3,3,3,3,4,5,5,5,5,5,5,5,5,7 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "EUC-JP", ++ :bin{ 618fb0a98fb03c8f3cb0a97a }, ++ "a\u4e28\\x8F\\xB0<\\x8F<\u9022z", ++ :intvector{ 0,1,4,4,4,4,5,5,5,5,6,7,7,7,7,8,9,11 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "gb18030", ++ :bin{ 618130fc318130fc8181303c3e813cfc817a }, ++ "a\u05ed\\x810\u9f07\\x810<>\\x81<\u9f07z", ++ :intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "UTF-8", ++ :bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a }, ++ "a\U00040042\\xF1\\x80\\x81<\\xF1\\x80\\x81\\xFF\\xF1\\x80\\xFF<\\xF1\\xFF<>z", ++ :intvector{ 0,1,1,5,5,5,5,5,5,5,5,5,5,5,5,8,9,9,9,9,9,9,9,9,9,9,9,9,12,12,12,12,13,13,13,13,13,13,13,13,15,15,15,15,16,17,17,17,17,18,18,18,18,19,20,21 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ISO-2022-JP", ++ :bin{ 1b24424141af4142affe41431b2842 }, ++ "\u758f\\xAF\u758e\\xAF\\xFE\u790e", ++ :intvector{ 
3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ibm-25546", ++ :bin{ 411b242943420e4141af4142affe41430f5a }, ++ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ", ++ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ISO-2022-KR", ++ :bin{ 411b242943420e4141af4142affe41430f5a }, ++ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ", ++ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ISO-2022-CN", ++ :bin{ 411b242941420e4141af4142affe41430f5a }, ++ "AB\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z", ++ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "HZ", ++ :bin{ 417e7b4141af4142affe41437e7d5a }, ++ "A\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z", ++ :intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ // Test ticket 5691: consistent illegal sequences ++ // The following test cases are for illegal escape/designator/shift sequences. ++ // ++ // ISO-2022-JP and -CN with illegal escape sequences. ++ { ++ "ISO-2022-JP", ++ :bin{ 611b24201b244241411b283f1b28427a }, ++ "a\\x1B$ \u758f\\x1B\u2538z", ++ :intvector{ 0,1,1,1,1,2,3,7,9,9,9,9,10,15 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ISO-2022-CN", ++ :bin{ 611b2429201b2429410e41410f7a }, ++ "a\\x1B$) \u4eaez", ++ :intvector{ 0,1,1,1,1,2,3,4,10,13 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ // Test ticket 5691: ISO-2022-JP-2 with illegal single-shift SS2 and SS3 sequences. ++ // The first ESC N comes before its designator sequence, the last sequence is ESC+space. 
++ { ++ "ISO-2022-JP-2", ++ :bin{ 4e1b4e4e1b2e414e1b4e4e4e1b204e }, ++ "N\\x1BNNN\xceN\\x1B N", ++ :intvector{ 0,1,1,1,1,2,3,7,10,11,12,12,12,12,13,14 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ISO-2022-CN-EXT", ++ :bin{ 4e1b4e4e1b242a484e1b4e4e4e4e1b204e }, ++ "N\\x1BNNN\u8f0eN\\x1B N", ++ :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ISO-2022-CN-EXT", ++ :bin{ 4f1b4f4f1b242b494f1b4f4f4f4f1b204f }, ++ "O\\x1BOOO\u492bO\\x1B O", ++ :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ // Test ticket 5691: HZ with illegal tilde sequences. ++ { ++ "HZ", ++ :bin{ 417e20427e21437e80447e7b41417e207e41427e7f41437e7d5a }, ++ "A\\x7E B\\x7E!C\\x7E\\x80D\u4eae\\x7E\\x20\\x7E\u8c05\\x7E\\x7F\u64a9Z", ++ :intvector{ 0,1,1,1,1,2,3,4,4,4,4,5,6,7,7,7,7,7,7,7,7,9, // SBCS ++ 12,14,14,14,14,14,14,14,14,16,16,16,16,17,19,19,19,19,19,19,19,19,21, // DBCS ++ 25 }, // SBCS ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ // Test ticket 5691: Example from Peter Edberg. 
++ { ++ "ISO-2022-JP", ++ :bin{ 1b244230212f7e742630801b284a621b2458631b2842648061 }, ++ "\u4e9c\ufffd\u7199\ufffdb\ufffd$Xcd\ufffda", ++ :intvector{ 3,5,7,9,14,15,16,17,18,22,23,24 }, ++ :int{1}, :int{0}, "", "?", :bin{""} ++ } + // test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e + { + "HZ", +- :bin{ 7e7b21212120217e217f772100007e217e7d207e7e807e0a2b }, +- "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd ~\ufffd+", +- :intvector{ 2,4,6,8,10,12,14,18,19,21,24 }, ++ :bin{ 7e7b21212120217e217f772100007e217e7e7d207e7e807e0a2b }, ++ "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd\u3013 ~\ufffd+", ++ :intvector{ 2,4,6,8,10,12,14,15,19,20,22,25 }, + :int{1}, :int{1}, "", "?", :bin{""} + } + // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and +@@ -61,8 +193,8 @@ + { + "ISO-2022-JP", + :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 }, +- "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e", +- :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 }, ++ "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e", ++ :intvector{ 3,4,5,9,11,12,14,16,17,19,21,23,25,27 }, + :int{1}, :int{1}, "", "?", :bin{""} + } + // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets() +@@ -341,7 +473,7 @@ + { + "ISO-2022-CN-EXT", + :bin{ 411b4e2121 }, "\x41", :intvector{ 0 }, +- :int{1}, :int{1}, "illesc", ".", :bin{ 1b4e } ++ :int{1}, :int{1}, "illesc", ".", :bin{ 1b } + } + // G3 designator: recognized, but not supported for -CN (only for -CN-EXT) + { diff --git a/icu.icu5797.backport.patch b/icu.icu5797.backport.patch new file mode 100644 index 0000000..39e3f77 --- /dev/null +++ b/icu.icu5797.backport.patch @@ -0,0 +1,749 @@ +diff -ru icu.5483/source/common/ucnv2022.c icu/source/common/ucnv2022.c +--- icu.5483/source/common/ucnv2022.c 2009-06-02 12:47:41.000000000 +0100 ++++ icu/source/common/ucnv2022.c 2009-06-02 13:18:23.000000000 +0100 +@@ -473,8 +473,7 
@@ + if(jpCharsetMasks[version]&CSM(ISO8859_7)) { + myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode); + } +- myConverterData->myConverterArray[JISX201] = ucnv_loadSharedData("JISX0201", NULL, errorCode); +- myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("jisx-208", NULL, errorCode); ++ myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("Shift-JIS", NULL, errorCode); + if(jpCharsetMasks[version]&CSM(JISX212)) { + myConverterData->myConverterArray[JISX212] = ucnv_loadSharedData("jisx-212", NULL, errorCode); + } +@@ -1045,14 +1044,6 @@ + length=3; + } + } +- /* +- * TODO(markus): Use Shift-JIS table for JIS X 0208, to save mapping table space. +- * Pass in parameter for type of output bytes, for validation and shifting: +- * - Direct: Pass bytes through, but forbid control codes 00-1F (except SI/SO/ESC) and space 20? +- * (Need to allow some (TAB/LF/CR) or most of them for ASCII and maybe JIS X 0201.) +- * - A1-FE: Subtract 80 after range check. +- * - SJIS: Shift DBCS result to 21-7E x 21-7E. +- */ + /* is this code point assigned, or do we use fallbacks? */ + if((stage2Entry&(1<<(16+(c&0xf))))!=0) { + /* assigned */ +@@ -1110,6 +1101,23 @@ + } + } + ++/* ++ * Check that the result is a 2-byte value with each byte in the range A1..FE ++ * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte ++ * to move it to the ISO 2022 range 21..7E. ++ * Return 0 if out of range. 
++ */ ++static U_INLINE uint32_t ++_2022FromGR94DBCS(uint32_t value) { ++ if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && ++ (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) ++ ) { ++ return value - 0x8080; /* shift down to 21..7e byte range */ ++ } else { ++ return 0; /* not valid for ISO 2022 */ ++ } ++} ++ + #ifdef U_ENABLE_GENERIC_ISO_2022 + + /********************************************************************************** +@@ -1238,7 +1246,7 @@ + } + else{ + cnv->toUBytes[0] =(char) sourceChar; +- cnv->toULength = 2; ++ cnv->toULength = 1; + } + + if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){ +@@ -1332,6 +1340,181 @@ + 3 /* length of <ESC>(I HWKANA_7BIT */ + }; + ++/* Map 00..7F to Unicode according to JIS X 0201. */ ++static U_INLINE uint32_t ++jisx201ToU(uint32_t value) { ++ if(value < 0x5c) { ++ return value; ++ } else if(value == 0x5c) { ++ return 0xa5; ++ } else if(value == 0x7e) { ++ return 0x203e; ++ } else /* value <= 0x7f */ { ++ return value; ++ } ++} ++ ++/* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */ ++static U_INLINE uint32_t ++jisx201FromU(uint32_t value) { ++ if(value<=0x7f) { ++ if(value!=0x5c && value!=0x7e) { ++ return value; ++ } ++ } else if(value==0xa5) { ++ return 0x5c; ++ } else if(value==0x203e) { ++ return 0x7e; ++ } ++ return 0xfffe; ++} ++ ++/* ++ * Take a valid Shift-JIS byte pair, check that it is in the range corresponding ++ * to JIS X 0208, and convert it to a pair of 21..7E bytes. ++ * Return 0 if the byte pair is out of range. 
++ */ ++static U_INLINE uint32_t ++_2022FromSJIS(uint32_t value) { ++ uint8_t trail; ++ ++ if(value > 0xEFFC) { ++ return 0; /* beyond JIS X 0208 */ ++ } ++ ++ trail = (uint8_t)value; ++ ++ value &= 0xff00; /* lead byte */ ++ if(value <= 0x9f00) { ++ value -= 0x7000; ++ } else /* 0xe000 <= value <= 0xef00 */ { ++ value -= 0xb000; ++ } ++ value <<= 1; ++ ++ if(trail <= 0x9e) { ++ value -= 0x100; ++ if(trail <= 0x7e) { ++ value |= trail - 0x1f; ++ } else { ++ value |= trail - 0x20; ++ } ++ } else /* trail <= 0xfc */ { ++ value |= trail - 0x7e; ++ } ++ return value; ++} ++ ++/* ++ * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS. ++ * If either byte is outside 21..7E make sure that the result is not valid ++ * for Shift-JIS so that the converter catches it. ++ * Some invalid byte values already turn into equally invalid Shift-JIS ++ * byte values and need not be tested explicitly. ++ */ ++static U_INLINE void ++_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) { ++ if(c1&1) { ++ ++c1; ++ if(c2 <= 0x5f) { ++ c2 += 0x1f; ++ } else if(c2 <= 0x7e) { ++ c2 += 0x20; ++ } else { ++ c2 = 0; /* invalid */ ++ } ++ } else { ++ if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) { ++ c2 += 0x7e; ++ } else { ++ c2 = 0; /* invalid */ ++ } ++ } ++ c1 >>= 1; ++ if(c1 <= 0x2f) { ++ c1 += 0x70; ++ } else if(c1 <= 0x3f) { ++ c1 += 0xb0; ++ } else { ++ c1 = 0; /* invalid */ ++ } ++ bytes[0] = (char)c1; ++ bytes[1] = (char)c2; ++} ++ ++/* ++ * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS) ++ * Katakana. ++ * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks ++ * because Shift-JIS roundtrips half-width Katakana to single bytes. ++ * These were the only fallbacks in ICU's jisx-208.ucm file. 
++ */ ++static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = { ++ 0x2123, /* U+FF61 */ ++ 0x2156, ++ 0x2157, ++ 0x2122, ++ 0x2126, ++ 0x2572, ++ 0x2521, ++ 0x2523, ++ 0x2525, ++ 0x2527, ++ 0x2529, ++ 0x2563, ++ 0x2565, ++ 0x2567, ++ 0x2543, ++ 0x213C, /* U+FF70 */ ++ 0x2522, ++ 0x2524, ++ 0x2526, ++ 0x2528, ++ 0x252A, ++ 0x252B, ++ 0x252D, ++ 0x252F, ++ 0x2531, ++ 0x2533, ++ 0x2535, ++ 0x2537, ++ 0x2539, ++ 0x253B, ++ 0x253D, ++ 0x253F, /* U+FF80 */ ++ 0x2541, ++ 0x2544, ++ 0x2546, ++ 0x2548, ++ 0x254A, ++ 0x254B, ++ 0x254C, ++ 0x254D, ++ 0x254E, ++ 0x254F, ++ 0x2552, ++ 0x2555, ++ 0x2558, ++ 0x255B, ++ 0x255E, ++ 0x255F, /* U+FF90 */ ++ 0x2560, ++ 0x2561, ++ 0x2562, ++ 0x2564, ++ 0x2566, ++ 0x2568, ++ 0x2569, ++ 0x256A, ++ 0x256B, ++ 0x256C, ++ 0x256D, ++ 0x256F, ++ 0x2573, ++ 0x212B, ++ 0x212C /* U+FF9F */ ++}; ++ + /* + * The iteration over various code pages works this way: + * i) Get the currentState from myConverterData->currentState +@@ -1504,7 +1687,7 @@ + } + break; + case HWKANA_7BIT: +- if((uint32_t)(HWKANA_END-sourceChar)<=(HWKANA_END-HWKANA_START)) { ++ if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { + if(converterData->version==3) { + /* JIS7: use G1 (SO) */ + /* Shift U+FF61..U+FF9F to bytes 21..5F. 
*/ +@@ -1531,13 +1714,34 @@ + break; + case JISX201: + /* G0 SBCS */ +- len2 = MBCS_SINGLE_FROM_UCHAR32( ++ value = jisx201FromU(sourceChar); ++ if(value <= 0x7f) { ++ targetValue = value; ++ len = 1; ++ cs = cs0; ++ g = 0; ++ useFallback = FALSE; ++ } ++ break; ++ case JISX208: ++ /* G0 DBCS from Shift-JIS table */ ++ len2 = MBCS_FROM_UCHAR32_ISO2022( + converterData->myConverterArray[cs0], + sourceChar, &value, +- useFallback); +- if(len2 != 0 && !(len2 < 0 && len != 0) && value <= 0x7f) { +- targetValue = value; +- len = len2; ++ useFallback, MBCS_OUTPUT_2); ++ if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ ++ value = _2022FromSJIS(value); ++ if(value != 0) { ++ targetValue = value; ++ len = len2; ++ cs = cs0; ++ g = 0; ++ useFallback = FALSE; ++ } ++ } else if(len == 0 && useFallback && ++ (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { ++ targetValue = hwkana_fb[sourceChar - HWKANA_START]; ++ len = -2; + cs = cs0; + g = 0; + useFallback = FALSE; +@@ -1569,17 +1773,10 @@ + * Check for valid bytes for the encoding scheme. + * This is necessary because the sub-converter (windows-949) + * has a broader encoding scheme than is valid for 2022. +- * +- * Check that the result is a 2-byte value with each byte in the range A1..FE +- * (strict EUC-KR DBCS) before accepting it and subtracting 0x80 from each byte +- * to move it to the ISO 2022 range 21..7E. 
+ */ +- if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && +- (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) +- ) { +- value -= 0x8080; /* shift down to 21..7e byte range */ +- } else { +- break; /* not valid for ISO 2022 */ ++ value = _2022FromGR94DBCS(value); ++ if(value == 0) { ++ break; + } + } + targetValue = value; +@@ -1755,7 +1952,7 @@ + static void + UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, + UErrorCode* err){ +- char tempBuf[3]; ++ char tempBuf[2]; + const char *mySource = (char *) args->source; + UChar *myTarget = args->target; + const char *mySourceLimit = args->sourceLimit; +@@ -1893,10 +2090,7 @@ + break; + case JISX201: + if(mySourceChar <= 0x7f) { +- targetUniChar = +- _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP( +- myData->myConverterArray[cs], +- mySourceChar); ++ targetUniChar = jisx201ToU(mySourceChar); + } + break; + case HWKANA_7BIT: +@@ -1910,8 +2104,13 @@ + if(mySource < mySourceLimit) { + char trailByte; + getTrailByte: +- tempBuf[0] = (char) (mySourceChar); +- tempBuf[1] = trailByte = *mySource++; ++ trailByte = *mySource++; ++ if(cs == JISX208) { ++ _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf); ++ } else { ++ tempBuf[0] = (char)mySourceChar; ++ tempBuf[1] = trailByte; ++ } + mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); + targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); + } else { +@@ -3254,6 +3453,9 @@ + /* open a set and initialize it with code points that are algorithmically round-tripped */ + switch(cnvData->locale[0]){ + case 'j': ++ /* include JIS X 0201 which is hardcoded */ ++ sa->add(sa->set, 0xa5); ++ sa->add(sa->set, 0x203e); + if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) { + /* include Latin-1 for some variants of JP */ + sa->addRange(sa->set, 0, 0xff); +@@ -3262,6 +3464,11 @@ + sa->addRange(sa->set, 0, 0x7f); + } + if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) { ++ /* ++ * TODO(markus): If and when 
ucnv_getUnicodeSet() supports fallbacks, ++ * we need to include half-width Katakana for all JP variants because ++ * JIS X 0208 has hardcoded fallbacks for them. ++ */ + /* include half-width Katakana for JP */ + sa->addRange(sa->set, HWKANA_START, HWKANA_END); + } +@@ -3281,15 +3488,7 @@ + break; + } + +- /* +- * Version-specific for CN: +- * CN version 0 does not map CNS planes 3..7 although +- * they are all available in the CNS conversion table; +- * CN version 1 does map them all. +- * The two versions create different Unicode sets. +- */ +- for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { +- if(cnvData->myConverterArray[i]!=NULL) { ++#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ + if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && + cnvData->version==0 && i==CNS_11643 + ) { +@@ -3299,9 +3498,33 @@ + sa, UCNV_ROUNDTRIP_SET, + 0, 0x81, 0x82, + pErrorCode); ++ } ++#endif ++ ++ for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { ++ UConverterSetFilter filter; ++ if(cnvData->myConverterArray[i]!=NULL) { ++ if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && ++ cnvData->version==0 && i==CNS_11643 ++ ) { ++ /* ++ * Version-specific for CN: ++ * CN version 0 does not map CNS planes 3..7 although ++ * they are all available in the CNS conversion table; ++ * CN version 1 (-EXT) does map them all. ++ * The two versions create different Unicode sets. ++ */ ++ filter=UCNV_SET_FILTER_2022_CN; ++ } else if(cnvData->locale[0]=='j' && i==JISX208) { ++ /* ++ * Only add code points that map to Shift-JIS codes ++ * corresponding to JIS X 0208. 
++ */ ++ filter=UCNV_SET_FILTER_SJIS; + } else { +- ucnv_MBCSGetUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, pErrorCode); ++ filter=UCNV_SET_FILTER_NONE; + } ++ ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode); + } + } + +diff -ru icu.5483/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c +--- icu.5483/source/common/ucnvmbcs.c 2009-06-02 12:47:41.000000000 +0100 ++++ icu/source/common/ucnvmbcs.c 2009-06-02 12:48:08.000000000 +0100 +@@ -340,6 +340,8 @@ + + /* Miscellaneous ------------------------------------------------------------ */ + ++#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ ++ + /* similar to ucnv_MBCSGetNextUChar() but recursive */ + static void + _getUnicodeSetForBytes(const UConverterSharedData *sharedData, +@@ -432,11 +434,14 @@ + pErrorCode); + } + ++#endif ++ + U_CFUNC void +-ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, +- const USetAdder *sa, +- UConverterUnicodeSet which, +- UErrorCode *pErrorCode) { ++ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, ++ const USetAdder *sa, ++ UConverterUnicodeSet which, ++ UConverterSetFilter filter, ++ UErrorCode *pErrorCode) { + const UConverterMBCSTable *mbcsTable; + const uint16_t *table; + +@@ -490,50 +495,26 @@ + c+=1024; /* empty stage 2 block */ + } + } +- } else if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY) { +- /* ignore single-byte results */ ++ } else { + const uint32_t *stage2; +- const uint16_t *stage3, *results; +- +- results=(const uint16_t *)mbcsTable->fromUnicodeBytes; +- +- for(st1=0; st1<maxStage1; ++st1) { +- st2=table[st1]; +- if(st2>(maxStage1>>1)) { +- stage2=(const uint32_t *)table+st2; +- for(st2=0; st2<64; ++st2) { +- if((st3=stage2[st2])!=0) { +- /* read the stage 3 block */ +- stage3=results+16*(uint32_t)(uint16_t)st3; ++ const uint8_t *stage3, *bytes; ++ uint32_t 
st3Multiplier; ++ uint32_t value; + +- /* get the roundtrip flags for the stage 3 block */ +- st3>>=16; ++ bytes=mbcsTable->fromUnicodeBytes; + +- /* +- * Add code points for which the roundtrip flag is set. +- * Once we get a set for fallback mappings, we have to check +- * non-roundtrip stage 3 results for whether they are 0. +- * See ucnv_MBCSFromUnicodeWithOffsets() for details. +- * +- * Ignore single-byte results (<0x100). +- */ +- do { +- if((st3&1)!=0 && *stage3>=0x100) { +- sa->add(sa->set, c); +- } +- st3>>=1; +- ++stage3; +- } while((++c&0xf)!=0); +- } else { +- c+=16; /* empty stage 3 block */ +- } +- } +- } else { +- c+=1024; /* empty stage 2 block */ +- } ++ switch(mbcsTable->outputType) { ++ case MBCS_OUTPUT_3: ++ case MBCS_OUTPUT_4_EUC: ++ st3Multiplier=3; ++ break; ++ case MBCS_OUTPUT_4: ++ st3Multiplier=4; ++ break; ++ default: ++ st3Multiplier=2; ++ break; + } +- } else { +- const uint32_t *stage2; + + for(st1=0; st1<maxStage1; ++st1) { + st2=table[st1]; +@@ -541,6 +522,9 @@ + stage2=(const uint32_t *)table+st2; + for(st2=0; st2<64; ++st2) { + if((st3=stage2[st2])!=0) { ++ /* read the stage 3 block */ ++ stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3; ++ + /* get the roundtrip flags for the stage 3 block */ + st3>>=16; + +@@ -550,12 +534,49 @@ + * non-roundtrip stage 3 results for whether they are 0. + * See ucnv_MBCSFromUnicodeWithOffsets() for details. + */ +- do { +- if(st3&1) { +- sa->add(sa->set, c); +- } +- st3>>=1; +- } while((++c&0xf)!=0); ++ switch(filter) { ++ case UCNV_SET_FILTER_NONE: ++ do { ++ if(st3&1) { ++ sa->add(sa->set, c); ++ } ++ st3>>=1; ++ } while((++c&0xf)!=0); ++ break; ++ case UCNV_SET_FILTER_DBCS_ONLY: ++ /* Ignore single-byte results (<0x100). 
*/ ++ do { ++ if((st3&1)!=0 && *((const uint16_t *)stage3)>=0x100) { ++ sa->add(sa->set, c); ++ } ++ st3>>=1; ++ stage3+=2; /* +=st3Multiplier */ ++ } while((++c&0xf)!=0); ++ break; ++ case UCNV_SET_FILTER_2022_CN: ++ /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */ ++ do { ++ if((st3&1)!=0 && ((value=*stage3)==0x81 || value==0x82)) { ++ sa->add(sa->set, c); ++ } ++ st3>>=1; ++ stage3+=3; /* +=st3Multiplier */ ++ } while((++c&0xf)!=0); ++ break; ++ case UCNV_SET_FILTER_SJIS: ++ /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */ ++ do { ++ if((st3&1)!=0 && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) { ++ sa->add(sa->set, c); ++ } ++ st3>>=1; ++ stage3+=2; /* +=st3Multiplier */ ++ } while((++c&0xf)!=0); ++ break; ++ default: ++ *pErrorCode=U_INTERNAL_PROGRAM_ERROR; ++ return; ++ } + } else { + c+=16; /* empty stage 3 block */ + } +@@ -569,6 +590,19 @@ + ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode); + } + ++U_CFUNC void ++ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, ++ const USetAdder *sa, ++ UConverterUnicodeSet which, ++ UErrorCode *pErrorCode) { ++ ucnv_MBCSGetFilteredUnicodeSetForUnicode( ++ sharedData, sa, which, ++ sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? ++ UCNV_SET_FILTER_DBCS_ONLY : ++ UCNV_SET_FILTER_NONE, ++ pErrorCode); ++} ++ + static void + ucnv_MBCSGetUnicodeSet(const UConverter *cnv, + const USetAdder *sa, +diff -ru icu.5483/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h +--- icu.5483/source/common/ucnvmbcs.h 2009-06-02 12:47:41.000000000 +0100 ++++ icu/source/common/ucnvmbcs.h 2009-06-02 12:48:08.000000000 +0100 +@@ -363,6 +363,7 @@ + ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, + UErrorCode *pErrorCode); + ++#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. 
*/ + /* + * Internal function returning a UnicodeSet for toUnicode() conversion. + * Currently only used for ISO-2022-CN, and only handles roundtrip mappings. +@@ -377,6 +378,7 @@ + UConverterUnicodeSet which, + uint8_t state, int32_t lowByte, int32_t highByte, + UErrorCode *pErrorCode); ++#endif + + /* + * Internal function returning a UnicodeSet for toUnicode() conversion. +@@ -388,9 +390,30 @@ + */ + U_CFUNC void + ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, +- const USetAdder *sa, +- UConverterUnicodeSet which, +- UErrorCode *pErrorCode); ++ const USetAdder *sa, ++ UConverterUnicodeSet which, ++ UErrorCode *pErrorCode); ++ ++typedef enum UConverterSetFilter { ++ UCNV_SET_FILTER_NONE, ++ UCNV_SET_FILTER_DBCS_ONLY, ++ UCNV_SET_FILTER_2022_CN, ++ UCNV_SET_FILTER_SJIS, ++ UCNV_SET_FILTER_COUNT ++} UConverterSetFilter; ++ ++/* ++ * Same as ucnv_MBCSGetUnicodeSetForUnicode() but ++ * the set can be filtered by encoding scheme. ++ * Used by stateful converters which share regular conversion tables ++ * but only use a subset of their mappings. 
++ */ ++U_CFUNC void ++ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, ++ const USetAdder *sa, ++ UConverterUnicodeSet which, ++ UConverterSetFilter filter, ++ UErrorCode *pErrorCode); + + #endif + +diff -ru icu.5483/source/test/cintltst/nucnvtst.c icu/source/test/cintltst/nucnvtst.c +--- icu.5483/source/test/cintltst/nucnvtst.c 2009-06-02 12:47:25.000000000 +0100 ++++ icu/source/test/cintltst/nucnvtst.c 2009-06-02 12:58:02.000000000 +0100 +@@ -3202,7 +3202,7 @@ + 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x000D, 0x000A, + 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x000D, 0x000A, + 0x3005, 0x3006, 0x3007, 0x30FC, 0x2015, 0x2010, 0xFF0F, 0x005C, 0x000D, 0x000A, +- 0x301C, 0x2016, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A, ++ 0x3013, 0x2018, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A, + 0x201D, 0x3014, 0x000D, 0x000A, + 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A, + 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A, +@@ -3730,7 +3730,7 @@ + 0x52C8, 0x52CC, 0x52CF, 0x52D1, 0x52D4, 0x52D6, 0x52DB, 0x52DC, 0x000D, 0x000A, + 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x000D, 0x000A, + 0x3005, 0x3006, 0x3007, 0x30FC, 0x2015, 0x2010, 0xFF0F, 0x005C, 0x000D, 0x000A, +- 0x301C, 0x2016, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A, ++ 0x3013, 0x2018, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A, + 0x201D, 0x000D, 0x000A, + 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A, + 0x4F94, 0x4F97, 0x52BA, 0x52BB, 0x52BD, 0x52C0, 0x52C4, 0x52C6, 0x000D, 0x000A, +diff -ru icu.5483/source/test/cintltst/udatatst.c icu/source/test/cintltst/udatatst.c +--- icu.5483/source/test/cintltst/udatatst.c 2009-06-02 12:47:25.000000000 +0100 ++++ icu/source/test/cintltst/udatatst.c 2009-06-02 13:09:15.000000000 +0100 +@@ -1260,6 +1260,11 @@ + {"gb18030", "cnv", 
ucnv_swap}, + /* MBCS conversion table file with extension */ + {"*test4x", "cnv", ucnv_swap}, ++ /* ++ * MBCS conversion table file without extension, ++ * to test swapping and preflighting of UTF-8-friendly mbcsIndex[]. ++ */ ++ {"jisx-212", "cnv", ucnv_swap}, + #endif + + #if !UCONFIG_NO_CONVERSION +diff -ru icu.5483/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt +--- icu.5483/source/test/testdata/conversion.txt 2009-06-02 12:47:25.000000000 +0100 ++++ icu/source/test/testdata/conversion.txt 2009-06-02 12:49:51.000000000 +0100 +@@ -48,6 +48,15 @@ + toUnicode { + Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" } + Cases { ++ // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and ++ // using the Shift-JIS table for JIS X 0208 (ticket #5797) ++ { ++ "ISO-2022-JP", ++ :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 }, ++ "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e", ++ :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 }, ++ :int{1}, :int{1}, "", "?", :bin{""} ++ } + // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets() + { + "ISO-8859-3", +@@ -495,6 +504,15 @@ + } + { "UTF-16BE", :bin{ 00 }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ 00 } } + { "UTF-16BE", :bin{ d800dc }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ d800dc } } ++ // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and ++ // using the Shift-JIS table for JIS X 0208 (ticket #5797) ++ { ++ "ISO-2022-JP", ++ "\u203e\xa5\u4e00\ufa10\u6f3e\u0391", ++ :bin{ 1b284a7e5c1b2442306c222e5f2126211b2842 }, ++ :intvector{ 0,0,0,0,1,2,2,2,2,2,3,3,4,4,5,5,5,5,5 }, ++ :int{1}, :int{0}, "", "?=\u3013", "" // U+3013 Geta Mark converts to 222e ++ } + // Verify that mappings that would result in byte values outside 20..7F (for SBCS) + // or 21..7E (for DBCS) are not used. 
+ // ibm-9005_X110-2007.ucm (ISO 8859-7, <ESC>.F=1b2e46): +@@ -1273,13 +1291,13 @@ + // versions of ISO-2022-JP + { + "ISO-2022-JP", +- "[\x00-\x0d\x10-\x1a\x1c-\x7f\u0391-\u03a1\uff61-\uff9f\u4e00\u4e01\uffe5]", +- "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\uffe6-\U0010ffff]", ++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e00\u4e01\uffe5]", ++ "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\ufa2d\uffe6-\U0010ffff]", + :int{0} + } + { + "ISO-2022-JP-2", +- "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0385-\u038a\u0390-\u03a1\uff61-\uff9f\u4e00-\u4e05\uffe6]", ++ "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]", + "[\x0e\x0f\x1b\uffe7-\U0010ffff]", + :int{0} + } diff --git a/icu.icu6001.backport.patch b/icu.icu6001.backport.patch new file mode 100644 index 0000000..11b2ee3 --- /dev/null +++ b/icu.icu6001.backport.patch @@ -0,0 +1,741 @@ +diff -ru icu.5797/source/common/ucnv2022.c icu/source/common/ucnv2022.c +--- icu.5797/source/common/ucnv2022.c 2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/ucnv2022.c 2009-06-02 15:05:10.000000000 +0100 +@@ -3399,11 +3399,19 @@ + /* include ASCII for JP */ + sa->addRange(sa->set, 0, 0x7f); + } +- if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) { ++ if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { + /* +- * TODO(markus): If and when ucnv_getUnicodeSet() supports fallbacks, +- * we need to include half-width Katakana for all JP variants because +- * JIS X 0208 has hardcoded fallbacks for them. ++ * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0 ++ * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8) ++ * use half-width Katakana. ++ * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode) ++ * half-width Katakana via the ESC ( I sequence. 
++ * However, we only emit (fromUnicode) half-width Katakana according to the ++ * definition of each variant. ++ * ++ * When including fallbacks, ++ * we need to include half-width Katakana Unicode code points for all JP variants because ++ * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana). + */ + /* include half-width Katakana for JP */ + sa->addRange(sa->set, HWKANA_START, HWKANA_END); +@@ -3457,6 +3465,12 @@ + * corresponding to JIS X 0208. + */ + filter=UCNV_SET_FILTER_SJIS; ++ } else if(i==KSC5601) { ++ /* ++ * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables) ++ * are broader than GR94. ++ */ ++ filter=UCNV_SET_FILTER_GR94DBCS; + } else { + filter=UCNV_SET_FILTER_NONE; + } +@@ -3472,6 +3486,9 @@ + sa->remove(sa->set, 0x0e); + sa->remove(sa->set, 0x0f); + sa->remove(sa->set, 0x1b); ++ ++ /* ISO 2022 converters do not convert C1 controls either */ ++ sa->removeRange(sa->set, 0x80, 0x9f); + } + + static const UConverterImpl _ISO2022Impl={ +diff -ru icu.5797/source/common/ucnv_ext.c icu/source/common/ucnv_ext.c +--- icu.5797/source/common/ucnv_ext.c 2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/ucnv_ext.c 2009-06-02 15:12:21.000000000 +0100 +@@ -946,7 +946,7 @@ + ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData, + const int32_t *cx, + const USetAdder *sa, +- UConverterUnicodeSet which, ++ UBool useFallback, + int32_t minLength, + UChar32 c, + UChar s[UCNV_EXT_MAX_UCHARS], int32_t length, +@@ -966,7 +966,7 @@ + value=*fromUSectionValues++; + + if( value!=0 && +- UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) && ++ (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || useFallback) && + UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength + ) { + if(c>=0) { +@@ -987,12 +987,14 @@ + /* no mapping, do nothing */ + } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { + ucnv_extGetUnicodeSetString( +- sharedData, cx, sa, which, minLength, ++ sharedData, cx, sa, useFallback, minLength, + U_SENTINEL, s, length+1, + 
(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value), + pErrorCode); +- } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))== +- UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) && ++ } else if((useFallback ? ++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 : ++ ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))== ++ UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) && + UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength + ) { + sa->addString(sa->set, s, length+1); +@@ -1004,6 +1006,7 @@ + ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData, + const USetAdder *sa, + UConverterUnicodeSet which, ++ UConverterSetFilter filter, + UErrorCode *pErrorCode) { + const int32_t *cx; + const uint16_t *stage12, *stage3, *ps2, *ps3; +@@ -1011,6 +1014,7 @@ + + uint32_t value; + int32_t st1, stage1Length, st2, st3, minLength; ++ UBool useFallback; + + UChar s[UCNV_EXT_MAX_UCHARS]; + UChar32 c; +@@ -1027,12 +1031,20 @@ + + stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]; + ++ useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); ++ + /* enumerate the from-Unicode trie table */ + c=0; /* keep track of the current code point while enumerating */ + +- if(sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) { ++ if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY || ++ filter==UCNV_SET_FILTER_DBCS_ONLY || ++ filter==UCNV_SET_FILTER_SJIS || ++ filter==UCNV_SET_FILTER_GR94DBCS ++ ) { + /* DBCS-only, ignore single-byte results */ + minLength=2; ++ } else if(filter==UCNV_SET_FILTER_2022_CN) { ++ minLength=3; + } else { + minLength=1; + } +@@ -1064,14 +1076,41 @@ + length=0; + U16_APPEND_UNSAFE(s, length, c); + ucnv_extGetUnicodeSetString( +- sharedData, cx, sa, which, minLength, ++ sharedData, cx, sa, useFallback, minLength, + c, s, length, + (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value), + pErrorCode); +- } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))== +- UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) && ++ } else if((useFallback ? 
++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 : ++ ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))== ++ UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) && + UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength + ) { ++ switch(filter) { ++ case UCNV_SET_FILTER_2022_CN: ++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) { ++ continue; ++ } ++ break; ++ case UCNV_SET_FILTER_SJIS: ++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (value=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) { ++ continue; ++ } ++ break; ++ case UCNV_SET_FILTER_GR94DBCS: ++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && ++ (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value)) - 0xa1a1)<=(0xfefe - 0xa1a1) && ++ (uint8_t)(value - 0xa1)<=(0xfe - 0xa1))) { ++ continue; ++ } ++ break; ++ default: ++ /* ++ * UCNV_SET_FILTER_NONE, ++ * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength ++ */ ++ break; ++ } + sa->add(sa->set, c); + } + } while((++c&0xf)!=0); +diff -ru icu.5797/source/common/ucnv_ext.h icu/source/common/ucnv_ext.h +--- icu.5797/source/common/ucnv_ext.h 2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/ucnv_ext.h 2009-06-02 15:05:10.000000000 +0100 +@@ -382,10 +382,20 @@ + UConverterFromUnicodeArgs *pArgs, int32_t srcIndex, + UErrorCode *pErrorCode); + ++/* ++ * Add code points and strings to the set according to the extension mappings. ++ * Limitation on the UConverterSetFilter: ++ * The filters currently assume that they are used with 1:1 mappings. ++ * They only apply to single input code points, and then they pass through ++ * only mappings with single-charset-code results. ++ * For example, the Shift-JIS filter only works for 2-byte results and tests ++ * that those 2 bytes are in the JIS X 0208 range of Shift-JIS. 
++ */ + U_CFUNC void + ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData, + const USetAdder *sa, + UConverterUnicodeSet which, ++ UConverterSetFilter filter, + UErrorCode *pErrorCode); + + /* toUnicode helpers -------------------------------------------------------- */ +diff -ru icu.5797/source/common/ucnvhz.c icu/source/common/ucnvhz.c +--- icu.5797/source/common/ucnvhz.c 2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/ucnvhz.c 2009-06-02 15:05:10.000000000 +0100 +@@ -528,6 +528,7 @@ + sa->add(sa->set, 0x7e); + + /* add all of the code points that the sub-converter handles */ ++ /* ucnv_MBCSGetFilteredUnicodeSetForUnicode(((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, sa, which, UCNV_SET_FILTER_GR94DBCS, pErrorCode); */ + ((UConverterDataHZ*)cnv->extraInfo)-> + gbConverter->sharedData->impl-> + getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter, +diff -ru icu.5797/source/common/ucnv_lmb.c icu/source/common/ucnv_lmb.c +--- icu.5797/source/common/ucnv_lmb.c 2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/ucnv_lmb.c 2009-06-02 15:09:13.000000000 +0100 +@@ -536,7 +536,7 @@ + NULL,\ + NULL,\ + _LMBCSSafeClone,\ +- _LMBCSGetUnicodeSet\ ++ ucnv_getCompleteUnicodeSet\ + };\ + static const UConverterStaticData _LMBCSStaticData##n={\ + sizeof(UConverterStaticData),\ +@@ -662,15 +662,14 @@ + return &newLMBCS->cnv; + } + +-static void +-_LMBCSGetUnicodeSet(const UConverter *cnv, +- const USetAdder *sa, +- UConverterUnicodeSet which, +- UErrorCode *pErrorCode) { +- /* all but U+F6xx, see LMBCS explanation above (search for F6xx) */ +- sa->addRange(sa->set, 0, 0xf5ff); +- sa->addRange(sa->set, 0xf700, 0x10ffff); +-} ++/* ++ * There used to be a _LMBCSGetUnicodeSet() function here (up to svn revision 20117) ++ * which added all code points except for U+F6xx ++ * because those cannot be represented in the Unicode group. 
++ * However, it turns out that windows-950 has roundtrips for all of U+F6xx ++ * which means that LMBCS can convert all Unicode code points after all. ++ * We now simply use ucnv_getCompleteUnicodeSet(). ++ */ + + /* + Here's the basic helper function that we use when converting from +diff -ru icu.5797/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c +--- icu.5797/source/common/ucnvmbcs.c 2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/ucnvmbcs.c 2009-06-02 15:12:40.000000000 +0100 +@@ -463,9 +463,23 @@ + + if(mbcsTable->outputType==MBCS_OUTPUT_1) { + const uint16_t *stage2, *stage3, *results; ++ uint16_t minValue; + + results=(const uint16_t *)mbcsTable->fromUnicodeBytes; + ++ /* ++ * Set a threshold variable for selecting which mappings to use. ++ * See ucnv_MBCSSingleFromBMPWithOffsets() and ++ * MBCS_SINGLE_RESULT_FROM_U() for details. ++ */ ++ if(which==UCNV_ROUNDTRIP_SET) { ++ /* use only roundtrips */ ++ minValue=0xf00; ++ } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ { ++ /* use all roundtrip and fallback results */ ++ minValue=0x800; ++ } ++ + for(st1=0; st1<maxStage1; ++st1) { + st2=table[st1]; + if(st2>maxStage1) { +@@ -475,15 +489,8 @@ + /* read the stage 3 block */ + stage3=results+st3; + +- /* +- * Add code points for which the roundtrip flag is set. +- * Once we get a set for fallback mappings, we have to use +- * a threshold variable with a value of 0x800. +- * See ucnv_MBCSSingleFromBMPWithOffsets() and +- * MBCS_SINGLE_RESULT_FROM_U() for details. 
+- */ + do { +- if(*stage3++>=0xf00) { ++ if(*stage3++>=minValue) { + sa->add(sa->set, c); + } + } while((++c&0xf)!=0); +@@ -500,9 +507,12 @@ + const uint8_t *stage3, *bytes; + uint32_t st3Multiplier; + uint32_t value; ++ UBool useFallback; + + bytes=mbcsTable->fromUnicodeBytes; + ++ useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); ++ + switch(mbcsTable->outputType) { + case MBCS_OUTPUT_3: + case MBCS_OUTPUT_4_EUC: +@@ -529,9 +539,8 @@ + st3>>=16; + + /* +- * Add code points for which the roundtrip flag is set. +- * Once we get a set for fallback mappings, we have to check +- * non-roundtrip stage 3 results for whether they are 0. ++ * Add code points for which the roundtrip flag is set, ++ * or which map to non-zero bytes if we use fallbacks. + * See ucnv_MBCSFromUnicodeWithOffsets() for details. + */ + switch(filter) { +@@ -539,6 +548,23 @@ + do { + if(st3&1) { + sa->add(sa->set, c); ++ stage3+=st3Multiplier; ++ } else if(useFallback) { ++ uint8_t b=0; ++ switch(st3Multiplier) { ++ case 4: ++ b|=*stage3++; ++ case 3: ++ b|=*stage3++; ++ case 2: ++ b|=stage3[0]|stage3[1]; ++ stage3+=2; ++ default: ++ break; ++ } ++ if(b!=0) { ++ sa->add(sa->set, c); ++ } + } + st3>>=1; + } while((++c&0xf)!=0); +@@ -546,7 +572,7 @@ + case UCNV_SET_FILTER_DBCS_ONLY: + /* Ignore single-byte results (<0x100). */ + do { +- if((st3&1)!=0 && *((const uint16_t *)stage3)>=0x100) { ++ if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) { + sa->add(sa->set, c); + } + st3>>=1; +@@ -556,7 +582,7 @@ + case UCNV_SET_FILTER_2022_CN: + /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */ + do { +- if((st3&1)!=0 && ((value=*stage3)==0x81 || value==0x82)) { ++ if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) { + sa->add(sa->set, c); + } + st3>>=1; +@@ -566,7 +592,20 @@ + case UCNV_SET_FILTER_SJIS: + /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. 
*/ + do { +- if((st3&1)!=0 && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) { ++ if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) { ++ sa->add(sa->set, c); ++ } ++ st3>>=1; ++ stage3+=2; /* +=st3Multiplier */ ++ } while((++c&0xf)!=0); ++ break; ++ case UCNV_SET_FILTER_GR94DBCS: ++ /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */ ++ do { ++ if( ((st3&1)!=0 || useFallback) && ++ (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) && ++ (uint8_t)(value - 0xa1)<=(0xfe - 0xa1) ++ ) { + sa->add(sa->set, c); + } + st3>>=1; +@@ -587,7 +626,7 @@ + } + } + +- ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode); ++ ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode); + } + + U_CFUNC void +diff -ru icu.5797/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h +--- icu.5797/source/common/ucnvmbcs.h 2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/ucnvmbcs.h 2009-06-02 15:05:10.000000000 +0100 +@@ -399,6 +399,7 @@ + UCNV_SET_FILTER_DBCS_ONLY, + UCNV_SET_FILTER_2022_CN, + UCNV_SET_FILTER_SJIS, ++ UCNV_SET_FILTER_GR94DBCS, + UCNV_SET_FILTER_COUNT + } UConverterSetFilter; + +diff -ru icu.5797/source/common/ucnv_set.c icu/source/common/ucnv_set.c +--- icu.5797/source/common/ucnv_set.c 2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/ucnv_set.c 2009-06-02 15:05:10.000000000 +0100 +@@ -1,7 +1,7 @@ + /* + ******************************************************************************* + * +-* Copyright (C) 2003-2005, International Business Machines ++* Copyright (C) 2003-2007, International Business Machines + * Corporation and others. All Rights Reserved. 
+ * + ******************************************************************************* +@@ -52,7 +52,8 @@ + uset_add, + uset_addRange, + uset_addString, +- uset_remove ++ uset_remove, ++ uset_removeRange + }; + sa.set=setFillIn; + +diff -ru icu.5797/source/common/unicode/ucnv.h icu/source/common/unicode/ucnv.h +--- icu.5797/source/common/unicode/ucnv.h 2009-06-02 14:45:30.000000000 +0100 ++++ icu/source/common/unicode/ucnv.h 2009-06-02 15:05:10.000000000 +0100 +@@ -870,6 +870,8 @@ + typedef enum UConverterUnicodeSet { + /** Select the set of roundtrippable Unicode code points. @stable ICU 2.6 */ + UCNV_ROUNDTRIP_SET, ++ /** Select the set of Unicode code points with roundtrip or fallback mappings. @draft ICU 4.0 */ ++ UCNV_ROUNDTRIP_AND_FALLBACK_SET, + /** Number of UConverterUnicodeSet selectors. @stable ICU 2.6 */ + UCNV_SET_COUNT + } UConverterUnicodeSet; +@@ -878,11 +880,16 @@ + /** + * Returns the set of Unicode code points that can be converted by an ICU converter. + * +- * The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET): ++ * Returns one of several kinds of set: ++ * ++ * 1. UCNV_ROUNDTRIP_SET ++ * + * The set of all Unicode code points that can be roundtrip-converted +- * (converted without any data loss) with the converter. ++ * (converted without any data loss) with the converter (ucnv_fromUnicode()). + * This set will not include code points that have fallback mappings + * or are only the result of reverse fallback mappings. ++ * This set will also not include PUA code points with fallbacks, although ++ * ucnv_fromUnicode() will always uses those mappings despite ucnv_setFallback(). + * See UTR #22 "Character Mapping Markup Language" + * at http://www.unicode.org/reports/tr22/ + * +@@ -893,6 +900,12 @@ + * by comparing its roundtrip set with the set of ExemplarCharacters from + * ICU's locale data or other sources + * ++ * 2. 
UCNV_ROUNDTRIP_AND_FALLBACK_SET ++ * ++ * The set of all Unicode code points that can be converted with the converter (ucnv_fromUnicode()) ++ * when fallbacks are turned on (see ucnv_setFallback()). ++ * This set includes all code points with roundtrips and fallbacks (but not reverse fallbacks). ++ * + * In the future, there may be more UConverterUnicodeSet choices to select + * sets with different properties. + * +diff -ru icu.5797/source/common/uset_imp.h icu/source/common/uset_imp.h +--- icu.5797/source/common/uset_imp.h 2009-06-02 14:45:31.000000000 +0100 ++++ icu/source/common/uset_imp.h 2009-06-02 15:05:10.000000000 +0100 +@@ -36,6 +36,9 @@ + typedef void U_CALLCONV + USetRemove(USet *set, UChar32 c); + ++typedef void U_CALLCONV ++USetRemoveRange(USet *set, UChar32 start, UChar32 end); ++ + /** + * Interface for adding items to a USet, to keep low-level code from + * statically depending on the USet implementation. +@@ -47,6 +50,7 @@ + USetAddRange *addRange; + USetAddString *addString; + USetRemove *remove; ++ USetRemoveRange *removeRange; + }; + typedef struct USetAdder USetAdder; + +diff -ru icu.5797/source/test/intltest/convtest.cpp icu/source/test/intltest/convtest.cpp +--- icu.5797/source/test/intltest/convtest.cpp 2009-06-02 14:45:18.000000000 +0100 ++++ icu/source/test/intltest/convtest.cpp 2009-06-02 15:09:31.000000000 +0100 +@@ -59,6 +59,7 @@ + case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break; + case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break; + case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break; ++ case 3: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break; + default: name=""; break; //needed to end loop + } + } +@@ -454,6 +455,183 @@ + } + } + ++U_CDECL_BEGIN ++static void U_CALLCONV ++getUnicodeSetCallback(const void *context, ++ UConverterFromUnicodeArgs *fromUArgs, ++ const UChar* codeUnits, ++ int32_t length, ++ UChar32 codePoint, ++ UConverterCallbackReason reason, ++ 
UErrorCode *pErrorCode) { ++ if(reason<=UCNV_IRREGULAR) { ++ ((UnicodeSet *)context)->remove(codePoint); // the converter cannot convert this code point ++ *pErrorCode=U_ZERO_ERROR; // skip ++ } // else ignore the reset, close and clone calls. ++} ++U_CDECL_END ++ ++// Compare ucnv_getUnicodeSet() with the set of characters that can be converted. ++void ++ConversionTest::TestGetUnicodeSet2() { ++ // Build a string with all code points. ++ UChar32 cpLimit; ++ int32_t s0Length; ++ if(quick) { ++ cpLimit=s0Length=0x10000; // BMP only ++ } else { ++ cpLimit=0x110000; ++ s0Length=0x10000+0x200000; // BMP + surrogate pairs ++ } ++ UChar *s0=new UChar[s0Length]; ++ if(s0==NULL) { ++ return; ++ } ++ UChar *s=s0; ++ UChar32 c; ++ UChar c2; ++ // low BMP ++ for(c=0; c<=0xd7ff; ++c) { ++ *s++=(UChar)c; ++ } ++ // trail surrogates ++ for(c=0xdc00; c<=0xdfff; ++c) { ++ *s++=(UChar)c; ++ } ++ // lead surrogates ++ // (after trails so that there is not even one surrogate pair in between) ++ for(c=0xd800; c<=0xdbff; ++c) { ++ *s++=(UChar)c; ++ } ++ // high BMP ++ for(c=0xe000; c<=0xffff; ++c) { ++ *s++=(UChar)c; ++ } ++ // supplementary code points = surrogate pairs ++ if(cpLimit==0x110000) { ++ for(c=0xd800; c<=0xdbff; ++c) { ++ for(c2=0xdc00; c2<=0xdfff; ++c2) { ++ *s++=(UChar)c; ++ *s++=c2; ++ } ++ } ++ } ++ ++ static const char *const cnvNames[]={ ++ "UTF-8", ++ "UTF-7", ++ "UTF-16", ++ "US-ASCII", ++ "ISO-8859-1", ++ "windows-1252", ++ "Shift-JIS", ++ "ibm-1390", // EBCDIC_STATEFUL table ++ "ibm-16684", // DBCS-only extension table based on EBCDIC_STATEFUL table ++ // "HZ", TODO(markus): known bug, the set incorrectly contains [\u02CA\u02CB\u02D9\u2010\u2013\u2015...] 
++ "ISO-2022-JP", ++ "JIS7", ++ "ISO-2022-CN", ++ "ISO-2022-CN-EXT", ++ "LMBCS" ++ }; ++ char buffer[1024]; ++ int32_t i; ++ for(i=0; i<LENGTHOF(cnvNames); ++i) { ++ UErrorCode errorCode=U_ZERO_ERROR; ++ UConverter *cnv=cnv_open(cnvNames[i], errorCode); ++ if(U_FAILURE(errorCode)) { ++ errln("failed to open converter %s - %s", cnvNames[i], u_errorName(errorCode)); ++ continue; ++ } ++ UnicodeSet expected; ++ ucnv_setFromUCallBack(cnv, getUnicodeSetCallback, &expected, NULL, NULL, &errorCode); ++ if(U_FAILURE(errorCode)) { ++ errln("failed to set the callback on converter %s - %s", cnvNames[i], u_errorName(errorCode)); ++ ucnv_close(cnv); ++ continue; ++ } ++ UConverterUnicodeSet which; ++ for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUnicodeSet)((int)which+1)) { ++ if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { ++ ucnv_setFallback(cnv, TRUE); ++ } ++ expected.add(0, cpLimit-1); ++ s=s0; ++ UBool flush; ++ do { ++ char *t=buffer; ++ flush=(UBool)(s==s0+s0Length); ++ ucnv_fromUnicode(cnv, &t, buffer+sizeof(buffer), (const UChar **)&s, s0+s0Length, NULL, flush, &errorCode); ++ if(U_FAILURE(errorCode)) { ++ if(errorCode==U_BUFFER_OVERFLOW_ERROR) { ++ errorCode=U_ZERO_ERROR; ++ continue; ++ } else { ++ break; // unexpected error, should not occur ++ } ++ } ++ } while(!flush); ++ UnicodeSet set; ++ ucnv_getUnicodeSet(cnv, (USet *)&set, which, &errorCode); ++ if(cpLimit<0x110000) { ++ set.remove(cpLimit, 0x10ffff); ++ } ++ if(which==UCNV_ROUNDTRIP_SET) { ++ // ignore PUA code points because they will be converted even if they ++ // are fallbacks and when other fallbacks are turned off, ++ // but ucnv_getUnicodeSet(UCNV_ROUNDTRIP_SET) delivers true roundtrips ++ expected.remove(0xe000, 0xf8ff); ++ expected.remove(0xf0000, 0xffffd); ++ expected.remove(0x100000, 0x10fffd); ++ set.remove(0xe000, 0xf8ff); ++ set.remove(0xf0000, 0xffffd); ++ set.remove(0x100000, 0x10fffd); ++ } ++ if(set!=expected) { ++ // First try to see if we have different sets because 
ucnv_getUnicodeSet() ++ // added strings: The above conversion method does not tell us what strings might be convertible. ++ // Remove strings from the set and compare again. ++ // Unfortunately, there are no good, direct set methods for finding out whether there are strings ++ // in the set, nor for enumerating or removing just them. ++ // Intersect all code points with the set. The intersection will not contain strings. ++ UnicodeSet temp(0, 0x10ffff); ++ temp.retainAll(set); ++ set=temp; ++ } ++ if(set!=expected) { ++ UnicodeSet diffSet; ++ UnicodeString out; ++ ++ // are there items that must be in the set but are not? ++ (diffSet=expected).removeAll(set); ++ if(!diffSet.isEmpty()) { ++ diffSet.toPattern(out, TRUE); ++ if(out.length()>100) { ++ out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis)); ++ } ++ errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d", ++ cnvNames[i], which); ++ errln(out); ++ } ++ ++ // are there items that must not be in the set but are? 
++ (diffSet=set).removeAll(expected); ++ if(!diffSet.isEmpty()) { ++ diffSet.toPattern(out, TRUE); ++ if(out.length()>100) { ++ out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis)); ++ } ++ errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - which set: %d", ++ cnvNames[i], which); ++ errln(out); ++ } ++ } ++ } ++ } ++ ++ delete [] s0; ++} ++ + // open testdata or ICU data converter ------------------------------------- *** + + UConverter * +diff -ru icu.5797/source/test/intltest/convtest.h icu/source/test/intltest/convtest.h +--- icu.5797/source/test/intltest/convtest.h 2009-06-02 14:45:18.000000000 +0100 ++++ icu/source/test/intltest/convtest.h 2009-06-02 15:05:10.000000000 +0100 +@@ -64,6 +64,7 @@ + void TestToUnicode(); + void TestFromUnicode(); + void TestGetUnicodeSet(); ++ void TestGetUnicodeSet2(); + + private: + UBool +diff -ru icu.5797/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt +--- icu.5797/source/test/testdata/conversion.txt 2009-06-02 14:45:18.000000000 +0100 ++++ icu/source/test/testdata/conversion.txt 2009-06-02 15:25:04.000000000 +0100 +@@ -1198,16 +1198,29 @@ + // versions of ISO-2022-JP + { + "ISO-2022-JP", +- "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e00\u4e01\uffe5]", +- "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\ufa2d\uffe6-\U0010ffff]", ++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2015\u203e\u4e00\u4e01\uffe5]", ++ "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u2014\u301c\u4e02\u4e27-\u4e29\u4fe0\u663b\u9eb5\ufa0e-\ufa2d\uff61-\uff9f\uffe4\uffe6-\U0010ffff]", + :int{0} + } + { + "ISO-2022-JP-2", +- "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]", +- "[\x0e\x0f\x1b\uffe7-\U0010ffff]", ++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uffe6]", ++ "[\x0e\x0f\x1b\uff61-\uff9f\uffe4\uffe7-\U0010ffff]", + :int{0} + } ++ 
{ ++ "JIS7", ++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uff61-\uff9f\uffe6]", ++ "[\x0e\x0f\x1b\uffe4\uffe7-\U0010ffff]", ++ :int{0} ++ } ++ // with fallbacks ++ { ++ "ISO-2022-JP", ++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2014\u2015\u203e\u301c\u4e00\u4e01\u4fe0\u9eb5\uff61-\uff9f\uffe5]", ++ "[\x0e\x0f\x1b\xa6\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\u663b\ufa0e-\ufa2d\uffe4\uffe6-\U0010ffff]", ++ :int{1} ++ } + + // versions of ISO-2022-CN + { +@@ -1223,6 +1236,14 @@ + :int{0} + } + ++ // LMBCS ++ { ++ "LMBCS", ++ "[\x00-\U0010ffff]", ++ "[]", ++ :int{0} ++ } ++ + // DBCS-only + { + "ibm-971", diff --git a/icu.icu6002.backport.patch b/icu.icu6002.backport.patch new file mode 100644 index 0000000..51f0d75 --- /dev/null +++ b/icu.icu6002.backport.patch @@ -0,0 +1,397 @@ +diff -ru icu.6001/source/common/ucnv_ext.c icu/source/common/ucnv_ext.c +--- icu.6001/source/common/ucnv_ext.c 2009-06-02 15:29:01.000000000 +0100 ++++ icu/source/common/ucnv_ext.c 2009-06-02 15:29:18.000000000 +0100 +@@ -1036,15 +1036,13 @@ + /* enumerate the from-Unicode trie table */ + c=0; /* keep track of the current code point while enumerating */ + +- if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY || +- filter==UCNV_SET_FILTER_DBCS_ONLY || +- filter==UCNV_SET_FILTER_SJIS || +- filter==UCNV_SET_FILTER_GR94DBCS ++ if(filter==UCNV_SET_FILTER_2022_CN) { ++ minLength=3; ++ } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY || ++ filter!=UCNV_SET_FILTER_NONE + ) { + /* DBCS-only, ignore single-byte results */ + minLength=2; +- } else if(filter==UCNV_SET_FILTER_2022_CN) { +- minLength=3; + } else { + minLength=1; + } +@@ -1104,6 +1102,13 @@ + continue; + } + break; ++ case UCNV_SET_FILTER_HZ: ++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && ++ (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfdfe - 0xa1a1) && ++ (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) { ++ continue; ++ } ++ break; + 
default: + /* + * UCNV_SET_FILTER_NONE, +diff -ru icu.6001/source/common/ucnvhz.c icu/source/common/ucnvhz.c +--- icu.6001/source/common/ucnvhz.c 2009-06-02 15:29:01.000000000 +0100 ++++ icu/source/common/ucnvhz.c 2009-06-02 15:29:15.000000000 +0100 +@@ -72,7 +72,7 @@ + cnv->extraInfo = uprv_malloc(sizeof(UConverterDataHZ)); + if(cnv->extraInfo != NULL){ + uprv_memset(cnv->extraInfo, 0, sizeof(UConverterDataHZ)); +- ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("ibm-1386",errorCode); ++ ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("GBK",errorCode); + } + else { + *errorCode = U_MEMORY_ALLOCATION_ERROR; +@@ -141,7 +141,7 @@ + UChar *myTarget = args->target; + const char *mySourceLimit = args->sourceLimit; + UChar32 targetUniChar = 0x0000; +- UChar mySourceChar = 0x0000; ++ int32_t mySourceChar = 0x0000; + UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo); + tempBuf[0]=0; + tempBuf[1]=0; +@@ -156,90 +156,71 @@ + + mySourceChar= (unsigned char) *mySource++; + +- switch(mySourceChar){ ++ if(args->converter->mode == UCNV_TILDE) { ++ /* second byte after ~ */ ++ args->converter->mode=0; ++ switch(mySourceChar) { + case 0x0A: +- if(args->converter->mode ==UCNV_TILDE){ +- args->converter->mode=0; +- +- } +- *(myTarget++)=(UChar)mySourceChar; ++ /* no output for ~\n (line-continuation marker) */ + continue; +- + case UCNV_TILDE: +- if(args->converter->mode ==UCNV_TILDE){ +- *(myTarget++)=(UChar)mySourceChar; +- args->converter->mode=0; +- continue; +- ++ if(args->offsets) { ++ args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2); + } +- else if(args->converter->toUnicodeStatus !=0){ +- args->converter->mode=0; +- break; +- } +- else{ +- args->converter->mode = UCNV_TILDE; +- continue; +- } +- +- ++ *(myTarget++)=(UChar)mySourceChar; ++ continue; + case UCNV_OPEN_BRACE: +- if(args->converter->mode == UCNV_TILDE){ +- args->converter->mode=0; +- myData->isStateDBCS = TRUE; +- continue; +- } +- else{ 
+- break; +- } +- +- ++ myData->isStateDBCS = TRUE; ++ continue; + case UCNV_CLOSE_BRACE: +- if(args->converter->mode == UCNV_TILDE){ +- args->converter->mode=0; +- myData->isStateDBCS = FALSE; +- continue; +- } +- else{ +- break; +- } +- ++ myData->isStateDBCS = FALSE; ++ continue; + default: + /* if the first byte is equal to TILDE and the trail byte + * is not a valid byte then it is an error condition + */ +- if(args->converter->mode == UCNV_TILDE){ +- args->converter->mode=0; +- mySourceChar= (UChar)(((UCNV_TILDE+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80)); +- goto SAVE_STATE; +- } +- ++ mySourceChar = 0x7e00 | mySourceChar; ++ targetUniChar = 0xffff; + break; +- +- } +- +- if(myData->isStateDBCS){ ++ } ++ } else if(myData->isStateDBCS) { + if(args->converter->toUnicodeStatus == 0x00){ +- args->converter->toUnicodeStatus = (UChar) mySourceChar; ++ /* lead byte */ ++ if(mySourceChar == UCNV_TILDE) { ++ args->converter->mode = UCNV_TILDE; ++ } else { ++ /* add another bit to distinguish a 0 byte from not having seen a lead byte */ ++ args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100); ++ } + continue; + } + else{ +- tempBuf[0] = (char) (args->converter->toUnicodeStatus+0x80) ; +- tempBuf[1] = (char) (mySourceChar+0x80); +- mySourceChar= (UChar)(((args->converter->toUnicodeStatus+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80)); ++ /* trail byte */ ++ uint32_t leadByte = args->converter->toUnicodeStatus & 0xff; ++ if( (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21) && ++ (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21) ++ ) { ++ tempBuf[0] = (char) (leadByte+0x80) ; ++ tempBuf[1] = (char) (mySourceChar+0x80); ++ targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData, ++ tempBuf, 2, args->converter->useFallback); ++ } else { ++ targetUniChar = 0xffff; ++ } ++ /* add another bit so that the code below writes 2 bytes in case of error */ ++ mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar; + args->converter->toUnicodeStatus 
=0x00; +- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData, +- tempBuf, 2, args->converter->useFallback); + } + } + else{ +- if(args->converter->fromUnicodeStatus == 0x00){ +- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData, +- mySource - 1, 1, args->converter->useFallback); +- } +- else{ +- goto SAVE_STATE; ++ if(mySourceChar == UCNV_TILDE) { ++ args->converter->mode = UCNV_TILDE; ++ continue; ++ } else if(mySourceChar <= 0x7f) { ++ targetUniChar = (UChar)mySourceChar; /* ASCII */ ++ } else { ++ targetUniChar = 0xffff; + } +- + } + if(targetUniChar < 0xfffe){ + if(args->offsets) { +@@ -248,26 +229,17 @@ + + *(myTarget++)=(UChar)targetUniChar; + } +- else if(targetUniChar>=0xfffe){ +-SAVE_STATE: ++ else /* targetUniChar>=0xfffe */ { + if(targetUniChar == 0xfffe){ + *err = U_INVALID_CHAR_FOUND; + } + else{ + *err = U_ILLEGAL_CHAR_FOUND; + } +- if(myData->isStateDBCS){ +- /* this should never occur since isStateDBCS is set to true +- * only after tempBuf[0] and tempBuf[1] +- * are set to the input .. 
just to please BEAM +- */ +- if(tempBuf[0]==0 || tempBuf[1]==0){ +- *err = U_INTERNAL_PROGRAM_ERROR; +- }else{ +- args->converter->toUBytes[0] = (uint8_t)(tempBuf[0]-0x80); +- args->converter->toUBytes[1] = (uint8_t)(tempBuf[1]-0x80); +- args->converter->toULength=2; +- } ++ if(mySourceChar > 0xff){ ++ args->converter->toUBytes[0] = (uint8_t)(mySourceChar >> 8); ++ args->converter->toUBytes[1] = (uint8_t)mySourceChar; ++ args->converter->toULength=2; + } + else{ + args->converter->toUBytes[0] = (uint8_t)mySourceChar; +@@ -328,16 +300,21 @@ + escSeq = TILDE_ESCAPE; + CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex); + continue; +- } +- else{ ++ } else if(mySourceChar <= 0x7f) { ++ length = 1; ++ targetUniChar = mySourceChar; ++ } else { + length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->sharedData, + mySourceChar,&targetUniChar,args->converter->useFallback); +- +- } +- /* only DBCS or SBCS characters are expected*/ +- /* DB haracters with high bit set to 1 are expected */ +- if(length > 2 || length==0 ||(((targetUniChar & 0x8080) != 0x8080)&& length==2)){ +- targetUniChar= missingCharMarker; ++ /* we can only use lead bytes 21..7D and trail bytes 21..7E */ ++ if( length == 2 && ++ (uint16_t)(targetUniChar - 0xa1a1) <= (0xfdfe - 0xa1a1) && ++ (uint8_t)(targetUniChar - 0xa1) <= (0xfe - 0xa1) ++ ) { ++ targetUniChar -= 0x8080; ++ } else { ++ targetUniChar = missingCharMarker; ++ } + } + if (targetUniChar != missingCharMarker){ + myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool)(targetUniChar>0x00FF); +@@ -360,22 +337,22 @@ + + if(isTargetUCharDBCS){ + if( myTargetIndex <targetLength){ +- myTarget[myTargetIndex++] =(char) ((targetUniChar >> 8) -0x80); ++ myTarget[myTargetIndex++] =(char) (targetUniChar >> 8); + if(offsets){ + *(offsets++) = mySourceIndex-1; + } + if(myTargetIndex < targetLength){ +- myTarget[myTargetIndex++] =(char) ((targetUniChar & 0x00FF) -0x80); ++ myTarget[myTargetIndex++] =(char) 
targetUniChar; + if(offsets){ + *(offsets++) = mySourceIndex-1; + } + }else{ +- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80); ++ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar; + *err = U_BUFFER_OVERFLOW_ERROR; + } + }else{ +- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) ((targetUniChar >> 8) -0x80); +- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80); ++ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) (targetUniChar >> 8); ++ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar; + *err = U_BUFFER_OVERFLOW_ERROR; + } + +@@ -524,15 +501,14 @@ + const USetAdder *sa, + UConverterUnicodeSet which, + UErrorCode *pErrorCode) { +- /* the tilde '~' is hardcoded in the converter */ +- sa->add(sa->set, 0x7e); ++ /* HZ converts all of ASCII */ ++ sa->addRange(sa->set, 0, 0x7f); + + /* add all of the code points that the sub-converter handles */ +- /* ucnv_MBCSGetFilteredUnicodeSetForUnicode(((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, sa, which, UCNV_SET_FILTER_GR94DBCS, pErrorCode); */ +- ((UConverterDataHZ*)cnv->extraInfo)-> +- gbConverter->sharedData->impl-> +- getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter, +- sa, which, pErrorCode); ++ ucnv_MBCSGetFilteredUnicodeSetForUnicode( ++ ((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, ++ sa, which, UCNV_SET_FILTER_HZ, ++ pErrorCode); + } + + static const UConverterImpl _HZImpl={ +diff -ru icu.6001/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c +--- icu.6001/source/common/ucnvmbcs.c 2009-06-02 15:29:01.000000000 +0100 ++++ icu/source/common/ucnvmbcs.c 2009-06-02 15:35:01.000000000 +0100 +@@ -612,6 +612,19 @@ + stage3+=2; /* +=st3Multiplier */ + } while((++c&0xf)!=0); + 
break; ++ case UCNV_SET_FILTER_HZ: ++ /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */ ++ do { ++ if( ((st3&1)!=0 || useFallback) && ++ (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) && ++ (uint8_t)(value - 0xa1)<=(0xfe - 0xa1) ++ ) { ++ sa->add(sa->set, c); ++ } ++ st3>>=1; ++ stage3+=2; /* +=st3Multiplier */ ++ } while((++c&0xf)!=0); ++ break; + default: + *pErrorCode=U_INTERNAL_PROGRAM_ERROR; + return; +diff -ru icu.6001/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h +--- icu.6001/source/common/ucnvmbcs.h 2009-06-02 15:29:01.000000000 +0100 ++++ icu/source/common/ucnvmbcs.h 2009-06-02 15:29:15.000000000 +0100 +@@ -400,6 +400,7 @@ + UCNV_SET_FILTER_2022_CN, + UCNV_SET_FILTER_SJIS, + UCNV_SET_FILTER_GR94DBCS, ++ UCNV_SET_FILTER_HZ, + UCNV_SET_FILTER_COUNT + } UConverterSetFilter; + +diff -ru icu.6001/source/test/cintltst/ncnvtst.c icu/source/test/cintltst/ncnvtst.c +--- icu.6001/source/test/cintltst/ncnvtst.c 2009-06-02 15:28:46.000000000 +0100 ++++ icu/source/test/cintltst/ncnvtst.c 2009-06-02 15:29:15.000000000 +0100 +@@ -1928,7 +1928,7 @@ + #if !UCONFIG_NO_LEGACY_CONVERSION + { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff }, + { "windows-1251", 0, 0x7f, 0x410, 0x44f, 0x3000, 0xd7ff }, +- { "HZ", 0x410, 0x44f, 0x4e00, 0x4eff, 0xac00, 0xd7ff }, ++ /* HZ test case fixed and moved to intltest's conversion.txt, ticket #6002 */ + { "shift-jis", 0x3041, 0x3093, 0x30a1, 0x30f3, 0x900, 0x1cff } + #else + { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff } +diff -ru icu.6001/source/test/intltest/convtest.cpp icu/source/test/intltest/convtest.cpp +--- icu.6001/source/test/intltest/convtest.cpp 2009-06-02 15:28:46.000000000 +0100 ++++ icu/source/test/intltest/convtest.cpp 2009-06-02 15:29:15.000000000 +0100 +@@ -527,7 +527,7 @@ + "Shift-JIS", + "ibm-1390", // EBCDIC_STATEFUL table + "ibm-16684", // DBCS-only extension table based on EBCDIC_STATEFUL table +- // "HZ", TODO(markus): known bug, the set 
incorrectly contains [\u02CA\u02CB\u02D9\u2010\u2013\u2015...] ++ "HZ", + "ISO-2022-JP", + "JIS7", + "ISO-2022-CN", +diff -ru icu.6001/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt +--- icu.6001/source/test/testdata/conversion.txt 2009-06-02 15:28:46.000000000 +0100 ++++ icu/source/test/testdata/conversion.txt 2009-06-02 15:29:15.000000000 +0100 +@@ -48,6 +48,14 @@ + toUnicode { + Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" } + Cases { ++ // test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e ++ { ++ "HZ", ++ :bin{ 7e7b21212120217e217f772100007e217e7d207e7e807e0a2b }, ++ "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd ~\ufffd+", ++ :intvector{ 2,4,6,8,10,12,14,18,19,21,24 }, ++ :int{1}, :int{1}, "", "?", :bin{""} ++ } + // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and + // using the Shift-JIS table for JIS X 0208 (ticket #5797) + { +@@ -1244,6 +1252,14 @@ + :int{0} + } + ++ // HZ ++ { ++ "HZ", ++ "[\u0410-\u044f\u4e00\u4e01\u4e03]", ++ "[\u4e02\u4e04-\u4e06\uac00-\ud7ff]", ++ :int{0} ++ } ++ + // DBCS-only + { + "ibm-971", diff --git a/icu.icu6175.emptysegments.patch b/icu.icu6175.emptysegments.patch new file mode 100644 index 0000000..bb40bd5 --- /dev/null +++ b/icu.icu6175.emptysegments.patch @@ -0,0 +1,535 @@ +diff -ru icu.6002/source/common/ucnv2022.c icu/source/common/ucnv2022.c +--- icu.6002/source/common/ucnv2022.c 2009-06-02 15:38:08.000000000 +0100 ++++ icu/source/common/ucnv2022.c 2009-06-02 15:40:20.000000000 +0100 +@@ -201,6 +201,7 @@ + #ifdef U_ENABLE_GENERIC_ISO_2022 + UBool isFirstBuffer; + #endif ++ UBool isEmptySegment; + char name[30]; + char locale[3]; + }UConverterDataISO2022; +@@ -609,6 +610,7 @@ + if(choice<=UCNV_RESET_TO_UNICODE) { + uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); + myConverterData->key = 0; ++ myConverterData->isEmptySegment = FALSE; + } + 
if(choice!=UCNV_RESET_TO_UNICODE) { + uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); +@@ -814,6 +816,7 @@ + if(chosenConverterName == NULL) { + /* SS2 or SS3 */ + *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; ++ _this->toUCallbackReason = UCNV_UNASSIGNED; + return; + } + +@@ -935,6 +938,8 @@ + } + if(U_SUCCESS(*err)) { + _this->toULength = 0; ++ } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) { ++ _this->toUCallbackReason = UCNV_UNASSIGNED; + } + } + +@@ -1986,6 +1991,7 @@ + continue; + } else { + /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ ++ myData->isEmptySegment = FALSE; /* reset this, we have a different error */ + break; + } + +@@ -1997,21 +2003,39 @@ + continue; + } else { + /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ ++ myData->isEmptySegment = FALSE; /* reset this, we have a different error */ + break; + } + + case ESC_2022: + mySource--; + escape: +- changeState_2022(args->converter,&(mySource), +- mySourceLimit, ISO_2022_JP,err); ++ { ++ const char * mySourceBefore = mySource; ++ int8_t toULengthBefore = args->converter->toULength; ++ ++ changeState_2022(args->converter,&(mySource), ++ mySourceLimit, ISO_2022_JP,err); ++ ++ /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */ ++ if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { ++ *err = U_ILLEGAL_ESCAPE_SEQUENCE; ++ args->converter->toUCallbackReason = UCNV_IRREGULAR; ++ args->converter->toULength = toULengthBefore + (mySource - mySourceBefore); ++ } ++ } + + /* invalid or illegal escape sequence */ + if(U_FAILURE(*err)){ + args->target = myTarget; + args->source = mySource; ++ myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ + return; + } ++ /* If we successfully completed an escape sequence, we begin a new segment, empty so far */ ++ if(myData->key==0) { ++ myData->isEmptySegment = TRUE; ++ } + continue; + + /* ISO-2022-JP does not use 
single-byte (C1) SS2 and SS3 */ +@@ -2028,6 +2052,7 @@ + /* falls through */ + default: + /* convert one or two bytes */ ++ myData->isEmptySegment = FALSE; + cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; + if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 && + !IS_JP_DBCS(cs) +@@ -2524,15 +2549,27 @@ + + if(mySourceChar==UCNV_SI){ + myData->toU2022State.g = 0; ++ if (myData->isEmptySegment) { ++ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ ++ *err = U_ILLEGAL_ESCAPE_SEQUENCE; ++ args->converter->toUCallbackReason = UCNV_IRREGULAR; ++ args->converter->toUBytes[0] = mySourceChar; ++ args->converter->toULength = 1; ++ args->target = myTarget; ++ args->source = mySource; ++ return; ++ } + /*consume the source */ + continue; + }else if(mySourceChar==UCNV_SO){ + myData->toU2022State.g = 1; ++ myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ + /*consume the source */ + continue; + }else if(mySourceChar==ESC_2022){ + mySource--; + escape: ++ myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */ + changeState_2022(args->converter,&(mySource), + mySourceLimit, ISO_2022_KR, err); + if(U_FAILURE(*err)){ +@@ -2543,6 +2580,7 @@ + continue; + } + ++ myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */ + if(myData->toU2022State.g == 1) { + if(mySource < mySourceLimit) { + char trailByte; +@@ -3075,27 +3113,52 @@ + switch(mySourceChar){ + case UCNV_SI: + pToU2022State->g=0; ++ if (myData->isEmptySegment) { ++ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ ++ *err = U_ILLEGAL_ESCAPE_SEQUENCE; ++ args->converter->toUCallbackReason = UCNV_IRREGULAR; ++ args->converter->toUBytes[0] = mySourceChar; ++ args->converter->toULength = 1; ++ args->target = myTarget; ++ args->source = mySource; ++ return; ++ } + continue; + + case 
UCNV_SO: + if(pToU2022State->cs[1] != 0) { + pToU2022State->g=1; ++ myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ + continue; + } else { + /* illegal to have SO before a matching designator */ ++ myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */ + break; + } + + case ESC_2022: + mySource--; + escape: +- changeState_2022(args->converter,&(mySource), +- mySourceLimit, ISO_2022_CN,err); ++ { ++ const char * mySourceBefore = mySource; ++ int8_t toULengthBefore = args->converter->toULength; ++ ++ changeState_2022(args->converter,&(mySource), ++ mySourceLimit, ISO_2022_CN,err); ++ ++ /* After SO there must be at least one character before a designator (designator error handled separately) */ ++ if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { ++ *err = U_ILLEGAL_ESCAPE_SEQUENCE; ++ args->converter->toUCallbackReason = UCNV_IRREGULAR; ++ args->converter->toULength = toULengthBefore + (mySource - mySourceBefore); ++ } ++ } + + /* invalid or illegal escape sequence */ + if(U_FAILURE(*err)){ + args->target = myTarget; + args->source = mySource; ++ myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ + return; + } + continue; +@@ -3109,6 +3172,7 @@ + /* falls through */ + default: + /* convert one or two bytes */ ++ myData->isEmptySegment = FALSE; + if(pToU2022State->g != 0) { + if(mySource < mySourceLimit) { + UConverterSharedData *cnv; +diff -ru icu.6002/source/common/ucnv_bld.c icu/source/common/ucnv_bld.c +--- icu.6002/source/common/ucnv_bld.c 2009-06-02 15:38:05.000000000 +0100 ++++ icu/source/common/ucnv_bld.c 2009-06-02 15:38:31.000000000 +0100 +@@ -914,6 +914,7 @@ + myUConverter->subCharLen = mySharedConverterData->staticData->subCharLen; + myUConverter->subChars = (uint8_t *)myUConverter->subUChars; + uprv_memcpy(myUConverter->subChars, mySharedConverterData->staticData->subChar, myUConverter->subCharLen); ++ myUConverter->toUCallbackReason = 
UCNV_ILLEGAL; /* default reason to invoke (*fromCharErrorBehaviour) */ + + if(mySharedConverterData->impl->open != NULL) { + mySharedConverterData->impl->open(myUConverter, realName, locale, options, err); +diff -ru icu.6002/source/common/ucnv_bld.h icu/source/common/ucnv_bld.h +--- icu.6002/source/common/ucnv_bld.h 2009-06-02 15:38:08.000000000 +0100 ++++ icu/source/common/ucnv_bld.h 2009-06-02 15:38:31.000000000 +0100 +@@ -226,6 +226,9 @@ + char preToU[UCNV_EXT_MAX_BYTES]; + int8_t preFromULength, preToULength; /* negative: replay */ + int8_t preToUFirstLength; /* length of first character */ ++ ++ /* new fields for ICU 4.0 */ ++ UConverterCallbackReason toUCallbackReason; /* (*fromCharErrorBehaviour) reason, set when error is detected */ + }; + + U_CDECL_END /* end of UConverter */ +diff -ru icu.6002/source/common/ucnv.c icu/source/common/ucnv.c +--- icu.6002/source/common/ucnv.c 2009-06-02 15:38:05.000000000 +0100 ++++ icu/source/common/ucnv.c 2009-06-02 15:38:31.000000000 +0100 +@@ -1473,11 +1473,14 @@ + cnv->toULength=0; + + /* call the callback function */ ++ if(cnv->toUCallbackReason==UCNV_ILLEGAL && *err==U_INVALID_CHAR_FOUND) { ++ cnv->toUCallbackReason = UCNV_UNASSIGNED; ++ } + cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, + cnv->invalidCharBuffer, errorInputLength, +- (*err==U_INVALID_CHAR_FOUND || *err==U_UNSUPPORTED_ESCAPE_SEQUENCE) ? 
+- UCNV_UNASSIGNED : UCNV_ILLEGAL, ++ cnv->toUCallbackReason, + err); ++ cnv->toUCallbackReason = UCNV_ILLEGAL; /* reset to default value */ + + /* + * loop back to the offset handling +diff -ru icu.6002/source/common/ucnvhz.c icu/source/common/ucnvhz.c +--- icu.6002/source/common/ucnvhz.c 2009-06-02 15:38:08.000000000 +0100 ++++ icu/source/common/ucnvhz.c 2009-06-02 15:38:31.000000000 +0100 +@@ -59,6 +59,7 @@ + UBool isEscapeAppended; + UBool isStateDBCS; + UBool isTargetUCharDBCS; ++ UBool isEmptySegment; + }UConverterDataHZ; + + +@@ -98,6 +99,7 @@ + cnv->mode=0; + if(cnv->extraInfo != NULL){ + ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE; ++ ((UConverterDataHZ*)cnv->extraInfo)->isEmptySegment = FALSE; + } + } + if(choice!=UCNV_RESET_TO_UNICODE) { +@@ -130,6 +132,10 @@ + * from-GB code '~}' ($7E7D) is outside the defined GB range.) + * + * Source: RFC 1842 ++* ++* Note that the formal syntax in RFC 1842 is invalid. I assume that the ++* intended definition of single-byte-segment is as follows (pedberg): ++* single-byte-segment = single-byte-seq 1*single-byte-char + */ + + +@@ -168,12 +174,23 @@ + args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2); + } + *(myTarget++)=(UChar)mySourceChar; ++ myData->isEmptySegment = FALSE; + continue; + case UCNV_OPEN_BRACE: +- myData->isStateDBCS = TRUE; +- continue; + case UCNV_CLOSE_BRACE: +- myData->isStateDBCS = FALSE; ++ myData->isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE); ++ if (myData->isEmptySegment) { ++ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ ++ *err = U_ILLEGAL_ESCAPE_SEQUENCE; ++ args->converter->toUCallbackReason = UCNV_IRREGULAR; ++ args->converter->toUBytes[0] = UCNV_TILDE; ++ args->converter->toUBytes[1] = mySourceChar; ++ args->converter->toULength = 2; ++ args->target = myTarget; ++ args->source = mySource; ++ return; ++ } ++ myData->isEmptySegment = TRUE; + continue; + default: + /* if the first byte is equal to 
TILDE and the trail byte +@@ -181,6 +198,7 @@ + */ + mySourceChar = 0x7e00 | mySourceChar; + targetUniChar = 0xffff; ++ myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */ + break; + } + } else if(myData->isStateDBCS) { +@@ -191,6 +209,7 @@ + } else { + /* add another bit to distinguish a 0 byte from not having seen a lead byte */ + args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100); ++ myData->isEmptySegment = FALSE; /* the segment has something, either valid or will produce a different error, so reset this */ + } + continue; + } +@@ -218,8 +237,10 @@ + continue; + } else if(mySourceChar <= 0x7f) { + targetUniChar = (UChar)mySourceChar; /* ASCII */ ++ myData->isEmptySegment = FALSE; /* the segment has something valid */ + } else { + targetUniChar = 0xffff; ++ myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */ + } + } + if(targetUniChar < 0xfffe){ +diff -ru icu.6002/source/test/cintltst/nucnvtst.c icu/source/test/cintltst/nucnvtst.c +--- icu.6002/source/test/cintltst/nucnvtst.c 2009-06-02 15:37:53.000000000 +0100 ++++ icu/source/test/cintltst/nucnvtst.c 2009-06-02 15:40:52.000000000 +0100 +@@ -81,6 +81,7 @@ + static void TestJitterbug2411(void); + #endif + ++static void TestJitterbug6175(void); + static void TestRoundTrippingAllUTF(void); + static void TestConv(const uint16_t in[], + int len, +@@ -294,6 +295,7 @@ + #if !UCONFIG_NO_LEGACY_CONVERSION + addTest(root, &TestJitterbug2346, "tsconv/nucnvtst/TestJitterbug2346"); + addTest(root, &TestJitterbug2411, "tsconv/nucnvtst/TestJitterbug2411"); ++ addTest(root, &TestJitterbug6175, "tsconv/nucnvtst/TestJitterbug6175"); + #endif + + } +@@ -4454,6 +4456,70 @@ + free(offsets); + } + ++/* Tests for empty segments in ISO-2022-JP/KR/CN, HZ, check that UConverterCallbackReason is UCNV_IRREGULAR */ ++typedef struct { ++ const char * converterName; ++ const char * inputText; ++ int inputTextLength; ++} 
EmptySegmentTest; ++ ++/* Callback for TestJitterbug6175, should only get called for empty segment errors */ ++static void UCNV_TO_U_CALLBACK_EMPTYSEGMENT( const void *context, UConverterToUnicodeArgs *toArgs, const char* codeUnits, ++ int32_t length, UConverterCallbackReason reason, UErrorCode * err ) { ++ if (reason > UCNV_IRREGULAR) { ++ return; ++ } ++ if (reason != UCNV_IRREGULAR) { ++ log_err("toUnicode callback invoked for empty segment but reason is not UCNV_IRREGULAR\n"); ++ } ++ /* Standard stuff below from UCNV_TO_U_CALLBACK_SUBSTITUTE */ ++ *err = U_ZERO_ERROR; ++ ucnv_cbToUWriteSub(toArgs,0,err); ++} ++ ++enum { kEmptySegmentToUCharsMax = 64 }; ++static void TestJitterbug6175(void) { ++ static const char iso2022jp_a[] = { 0x61, 0x62, 0x1B,0x24,0x42, 0x1B,0x28,0x42, 0x63, 0x64, 0x0D, 0x0A }; ++ static const char iso2022kr_a[] = { 0x1B,0x24,0x29,0x43, 0x61, 0x0E, 0x0F, 0x62, 0x0D, 0x0A }; ++ static const char iso2022cn_a[] = { 0x61, 0x1B,0x24,0x29,0x41, 0x62, 0x0E, 0x0F, 0x1B,0x24,0x2A,0x48, 0x1B,0x4E, 0x6A,0x65, 0x63, 0x0D, 0x0A }; ++ static const char iso2022cn_b[] = { 0x61, 0x1B,0x24,0x29,0x41, 0x62, 0x0E, 0x1B,0x24,0x29,0x47, 0x68,0x64, 0x0F, 0x63, 0x0D, 0x0A }; ++ static const char hzGB2312_a[] = { 0x61, 0x62, 0x7E,0x7B, 0x7E,0x7D, 0x63, 0x64 }; ++ static const EmptySegmentTest emptySegmentTests[] = { ++ /* converterName inputText inputTextLength */ ++ { "ISO-2022-JP", iso2022jp_a, sizeof(iso2022jp_a) }, ++ { "ISO-2022-KR", iso2022kr_a, sizeof(iso2022kr_a) }, ++ { "ISO-2022-CN", iso2022cn_a, sizeof(iso2022cn_a) }, ++ { "ISO-2022-CN", iso2022cn_b, sizeof(iso2022cn_b) }, ++ { "HZ-GB-2312", hzGB2312_a, sizeof(hzGB2312_a) }, ++ /* terminator: */ ++ { NULL, NULL, 0, } ++ }; ++ const EmptySegmentTest * testPtr; ++ for (testPtr = emptySegmentTests; testPtr->converterName != NULL; ++testPtr) { ++ UErrorCode err = U_ZERO_ERROR; ++ UConverter * cnv = ucnv_open(testPtr->converterName, &err); ++ if (U_FAILURE(err)) { ++ log_data_err("Unable to open %s 
converter: %s\n", testPtr->converterName, u_errorName(err)); ++ return; ++ } ++ ucnv_setToUCallBack(cnv, UCNV_TO_U_CALLBACK_EMPTYSEGMENT, NULL, NULL, NULL, &err); ++ if (U_FAILURE(err)) { ++ log_data_err("Unable to setToUCallBack for %s converter: %s\n", testPtr->converterName, u_errorName(err)); ++ ucnv_close(cnv); ++ return; ++ } ++ { ++ UChar toUChars[kEmptySegmentToUCharsMax]; ++ UChar * toUCharsPtr = toUChars; ++ const UChar * toUCharsLimit = toUCharsPtr + kEmptySegmentToUCharsMax; ++ const char * inCharsPtr = testPtr->inputText; ++ const char * inCharsLimit = inCharsPtr + testPtr->inputTextLength; ++ ucnv_toUnicode(cnv, &toUCharsPtr, toUCharsLimit, &inCharsPtr, inCharsLimit, NULL, TRUE, &err); ++ } ++ ucnv_close(cnv); ++ } ++} ++ + static void + TestEBCDIC_STATEFUL() { + /* test input */ +diff -ru icu.6002/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt +--- icu.6002/source/test/testdata/conversion.txt 2009-06-02 15:37:54.000000000 +0100 ++++ icu/source/test/testdata/conversion.txt 2009-06-02 15:40:52.000000000 +0100 +@@ -199,6 +199,21 @@ + :intvector{ 0, 5, 7, 9, 9, 9, 9, 9, 9, 9, 9, 12 }, + :int{1}, :int{1}, "", "&", :bin{""} + } ++ // empty segment (using substitution and stop) ++ { ++ "ISO-2022-KR", ++ :bin{ 1b242943610e0f620d0a }, ++ "a\uFFFDb\u000D\u000A", ++ :intvector{ 4, 6, 7, 8, 9 }, ++ :int{1}, :int{1}, "", "?", :bin{""} ++ } ++ { ++ "ISO-2022-KR", ++ :bin{ 1b242943610e0f620d0a }, ++ "a", ++ :intvector{ 4 }, ++ :int{1}, :int{1}, "illesc", ".", :bin{"0f"} ++ } + + // ISO-2022-JP + +@@ -249,6 +264,21 @@ + :bin{ 41c15c1b284a5cc242 }, "A\uff81\\\xa5\uff82B", :intvector{ 0, 1, 2, 6, 7, 8 }, + :int{1}, :int{1}, "", ".", :bin{""} + } ++ // empty segment (using substitution and stop) ++ { ++ "ISO-2022-JP", ++ :bin{ 61621b24421b284263640d0a }, ++ "ab\uFFFDcd\u000D\u000A", ++ :intvector{ 0, 1, 5, 8, 9, 10, 11 }, ++ :int{1}, :int{1}, "", "?", :bin{""} ++ } ++ { ++ "ISO-2022-JP", ++ :bin{ 61621b24421b284263640d0a }, ++ "ab", ++ 
:intvector{ 0, 1 }, ++ :int{1}, :int{1}, "illesc", ".", :bin{"1b2842"} ++ } + + // ISO-2022-CN + +@@ -319,6 +349,36 @@ + :bin{ 411b242b491b4f2121 }, "\x41", :intvector{ 0 }, + :int{1}, :int{1}, "unsuppesc", ".", :bin{ 1b242b49 } + } ++ // empty segment 1 (using substitution and stop) ++ { ++ "ISO-2022-CN", ++ :bin{ 611b242941620e0f1b242a481b4e6a65630d0a }, ++ "ab\uFFFD\u994Cc\u000D\u000A", ++ :intvector{ 0, 5, 7, 14, 16, 17, 18 }, ++ :int{1}, :int{1}, "", "?", :bin{""} ++ } ++ { ++ "ISO-2022-CN", ++ :bin{ 611b242941620e0f1b242a481b4e6a65630d0a }, ++ "ab", ++ :intvector{ 0, 5 }, ++ :int{1}, :int{1}, "illesc", ".", :bin{"0f"} ++ } ++ // empty segment 2 (using substitution and stop) ++ { ++ "ISO-2022-CN", ++ :bin{ 611b242941620e1b24294768640f630d0a }, ++ "ab\uFFFD\u5F70c\u000D\u000A", ++ :intvector{ 0, 5, 7, 11, 14, 15, 16 }, ++ :int{1}, :int{1}, "", "?", :bin{""} ++ } ++ { ++ "ISO-2022-CN", ++ :bin{ 611b242941620e1b24294768640f630d0a }, ++ "ab", ++ :intvector{ 0, 5 }, ++ :int{1}, :int{1}, "illesc", ".", :bin{"1b242947"} ++ } + + // ISO-2022 SBCS + // [U_ENABLE_GENERIC_ISO_2022] +@@ -333,6 +393,39 @@ + // :int{1}, :int{1}, "", ".", :bin{""} + //} + ++ // HZ-GB-2312 ++ ++ // empty segment 1 (using substitution and stop) ++ { ++ "HZ-GB-2312", ++ :bin{ 61627e7b7e7d6364 }, ++ "ab\uFFFDcd", ++ :intvector{ 0, 1, 4, 6, 7 }, ++ :int{1}, :int{1}, "", "?", :bin{""} ++ } ++ { ++ "HZ-GB-2312", ++ :bin{ 61627e7b7e7d63640d0a }, ++ "ab", ++ :intvector{ 0, 1 }, ++ :int{1}, :int{1}, "illesc", ".", :bin{"7e7d"} ++ } ++ // empty segment 2 & legal redundant switches (using substitution and stop) ++ { ++ "HZ-GB-2312", ++ :bin{ 61627e7b323b3f557e7b7e7b523b7e7d63647e7d65667e7d7e7d }, ++ "ab\u4E0D\u7A7A\uFFFD\u4E00cdef\uFFFD", ++ :intvector{ 0, 1, 4, 6, 10, 12, 16, 17, 20, 21, 24 }, ++ :int{1}, :int{1}, "", "?", :bin{""} ++ } ++ { ++ "HZ-GB-2312", ++ :bin{ 61627e7b323b3f557e7b7e7b523b7e7d63647e7d65667e7d7e7d }, ++ "ab\u4E0D\u7A7A", ++ :intvector{ 0, 1, 4, 6 }, ++ :int{1}, :int{1}, "illesc", 
".", :bin{"7e7b"} ++ } ++ + // DBCS-only extensions + { + "ibm-970", diff --git a/icu.icuXXXX.malayalam.bysyllable.patch b/icu.icuXXXX.malayalam.bysyllable.patch new file mode 100644 index 0000000..d0cd1b1 --- /dev/null +++ b/icu.icuXXXX.malayalam.bysyllable.patch @@ -0,0 +1,250 @@ +diff -ruN icu.orig/source/layout/IndicReordering.h icu/source/layout/IndicReordering.h +--- icu.orig/source/layout/IndicReordering.h 2007-04-27 10:28:22.000000000 +0100 ++++ icu/source/layout/IndicReordering.h 2007-04-27 10:39:22.000000000 +0100 +@@ -142,6 +142,7 @@ + // do not instantiate + IndicReordering(); + ++public: + static le_int32 findSyllable(const IndicClassTable *classTable, const LEUnicode *chars, le_int32 prev, le_int32 charCount); + + }; +diff -ruN icu.orig/source/layout/LayoutEngine.cpp icu/source/layout/LayoutEngine.cpp +--- icu.orig/source/layout/LayoutEngine.cpp 2007-04-27 10:28:22.000000000 +0100 ++++ icu/source/layout/LayoutEngine.cpp 2007-04-27 10:39:22.000000000 +0100 +@@ -14,6 +14,7 @@ + #include "CanonShaping.h" + #include "HanLayoutEngine.h" + #include "HangulLayoutEngine.h" ++#include "MalayalamLayoutEngine.h" + #include "IndicLayoutEngine.h" + #include "KhmerLayoutEngine.h" + #include "ThaiLayoutEngine.h" +@@ -451,11 +452,13 @@ + + if (gsubTable != NULL && gsubTable->coversScript(scriptTag = OpenTypeLayoutEngine::getScriptTag(scriptCode))) { + switch (scriptCode) { ++ case mlymScriptCode: ++ result = new MalayalamOpenTypeLayoutEngine(fontInstance, scriptCode, languageCode, typoFlags, gsubTable); ++ break; + case bengScriptCode: + case devaScriptCode: + case gujrScriptCode: + case kndaScriptCode: +- case mlymScriptCode: + case oryaScriptCode: + case guruScriptCode: + case tamlScriptCode: +@@ -512,11 +515,13 @@ + result = new GXLayoutEngine(fontInstance, scriptCode, languageCode, morphTable); + } else { + switch (scriptCode) { ++ case mlymScriptCode: ++ result = new MalayalamOpenTypeLayoutEngine(fontInstance, scriptCode, languageCode, typoFlags); ++ break; + 
case bengScriptCode: + case devaScriptCode: + case gujrScriptCode: + case kndaScriptCode: +- case mlymScriptCode: + case oryaScriptCode: + case guruScriptCode: + case tamlScriptCode: +diff -ruN icu.orig/source/layout/LEGlyphStorage.h icu/source/layout/LEGlyphStorage.h +--- icu.orig/source/layout/LEGlyphStorage.h 2007-04-27 10:28:22.000000000 +0100 ++++ icu/source/layout/LEGlyphStorage.h 2007-04-27 10:43:54.000000000 +0100 +@@ -413,6 +413,8 @@ + */ + void adoptGlyphArray(LEGlyphStorage &from); + ++ void appendGlyphStorage(LEGlyphStorage &from); ++ + /** + * Delete the char indices array and replace it with the one + * in <code>from</code>. Set the char indices array pointer +diff -ruN icu.orig/source/layout/Makefile.in icu/source/layout/Makefile.in +--- icu.orig/source/layout/Makefile.in 2007-04-27 10:28:22.000000000 +0100 ++++ icu/source/layout/Makefile.in 2007-04-27 10:39:22.000000000 +0100 +@@ -66,6 +66,7 @@ + ArabicLayoutEngine.o \ + GXLayoutEngine.o \ + HanLayoutEngine.o \ ++MalayalamLayoutEngine.o \ + IndicLayoutEngine.o \ + LayoutEngine.o \ + ContextualGlyphSubstProc.o \ +diff -ruN icu.orig/source/layout/MalayalamLayoutEngine.cpp icu/source/layout/MalayalamLayoutEngine.cpp +--- icu.orig/source/layout/MalayalamLayoutEngine.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ icu/source/layout/MalayalamLayoutEngine.cpp 2007-04-27 10:44:26.000000000 +0100 +@@ -0,0 +1,126 @@ ++ ++/* ++ * ++ * (C) Copyright IBM Corp. 
1998-2005 - All Rights Reserved ++ * ++ */ ++ ++#include "LETypes.h" ++#include "LayoutEngine.h" ++#include "OpenTypeLayoutEngine.h" ++#include "MalayalamLayoutEngine.h" ++#include "ScriptAndLanguageTags.h" ++ ++#include "GlyphSubstitutionTables.h" ++#include "GlyphDefinitionTables.h" ++#include "GlyphPositioningTables.h" ++ ++#include "GDEFMarkFilter.h" ++#include "LEGlyphStorage.h" ++ ++#include "IndicReordering.h" ++ ++#include <stdio.h> ++ ++U_NAMESPACE_BEGIN ++ ++UOBJECT_DEFINE_RTTI_IMPLEMENTATION(MalayalamOpenTypeLayoutEngine) ++ ++void LEGlyphStorage::appendGlyphStorage(LEGlyphStorage &from) ++{ ++ if (fInsertionList) applyInsertions(); ++ if (from.fInsertionList) from.applyInsertions(); ++ if ((!fInsertionList) && (from.fInsertionList)) ++ { ++ fInsertionList = from.fInsertionList; ++ from.fInsertionList = NULL; ++ } ++ ++ if (!from.fGlyphCount) ++ return; ++ ++ le_int32 newGlyphCount = fGlyphCount + from.fGlyphCount; ++ ++ fGlyphs = (LEGlyphID*)LE_GROW_ARRAY(fGlyphs, newGlyphCount); ++ LE_ARRAY_COPY(fGlyphs+fGlyphCount, from.fGlyphs, from.fGlyphCount); ++ ++ le_int32 nLargestIndex = 0; ++ if (fGlyphCount) ++ { ++ for (le_int32 i = 0; i < fGlyphCount; ++i) ++ { ++ if (fCharIndices[i] > nLargestIndex) ++ nLargestIndex = fCharIndices[i]; ++ } ++ nLargestIndex+=1; ++ } ++ fCharIndices = (le_int32 *)LE_GROW_ARRAY(fCharIndices, newGlyphCount); ++ for (le_int32 i = 0; i < from.fGlyphCount; ++i) ++ fCharIndices[fGlyphCount+i] = from.fCharIndices[i] + nLargestIndex; ++ ++ fAuxData = (le_uint32 *)LE_GROW_ARRAY(fAuxData, newGlyphCount); ++ LE_ARRAY_COPY(fAuxData+fGlyphCount, from.fAuxData, from.fGlyphCount); ++ ++ fGlyphCount = newGlyphCount; ++} ++ ++le_int32 MalayalamOpenTypeLayoutEngine::glyphPostProcessing(LEGlyphStorage &tempGlyphStorage, LEGlyphStorage &glyphStorage, LEErrorCode &success) ++{ ++ if (LE_FAILURE(success)) { ++ return 0; ++ } ++ ++ glyphStorage.appendGlyphStorage(tempGlyphStorage); ++ ++ return glyphStorage.getGlyphCount(); ++} ++ ++ ++le_int32 
MalayalamOpenTypeLayoutEngine::computeGlyphs(const LEUnicode chars[], le_int32 offset, le_int32 count, le_int32 max, le_bool rightToLeft, LEGlyphStorage &glyphStorage, LEErrorCode &success) ++{ ++ if (LE_FAILURE(success)) { ++ return 0; ++ } ++ ++ if (chars == NULL || offset < 0 || count < 0 || max < 0 || offset >= max || offset + count > max) { ++ success = LE_ILLEGAL_ARGUMENT_ERROR; ++ return 0; ++ } ++ ++ le_int32 outGlyphCount=0; ++ ++ const IndicClassTable *classTable = IndicClassTable::getScriptClassTable(fScriptCode); ++ le_int32 prev = 0; ++ while (prev < count) ++ { ++ le_int32 outCharCount=0, fakeGlyphCount=0; ++ LEUnicode *outChars = NULL; ++ LEGlyphStorage fakeGlyphStorage; ++ ++ le_int32 syllable = IndicReordering::findSyllable(classTable, chars+offset, prev, count); ++ outCharCount = characterProcessing(chars+prev, offset, syllable-prev, max, rightToLeft, outChars, fakeGlyphStorage, success); ++ ++ if (LE_FAILURE(success)) { ++ return 0; ++ } ++ ++ if (outChars != NULL) { ++ fakeGlyphCount = glyphProcessing(outChars, 0, outCharCount, outCharCount, rightToLeft, fakeGlyphStorage, success); ++ LE_DELETE_ARRAY(outChars); // FIXME: a subclass may have allocated this, in which case this delete might not work... ++ } else { ++ fakeGlyphCount = glyphProcessing(chars+prev, offset, syllable-prev, max, rightToLeft, fakeGlyphStorage, success); ++ } ++ ++ if (LE_FAILURE(success)) { ++ return 0; ++ } ++ ++ outGlyphCount = glyphPostProcessing(fakeGlyphStorage, glyphStorage, success); ++ ++ prev = syllable; ++ } ++ ++ return outGlyphCount; ++} ++ ++U_NAMESPACE_END +diff -ruN icu.orig/source/layout/MalayalamLayoutEngine.h icu/source/layout/MalayalamLayoutEngine.h +--- icu.orig/source/layout/MalayalamLayoutEngine.h 1970-01-01 01:00:00.000000000 +0100 ++++ icu/source/layout/MalayalamLayoutEngine.h 2007-04-27 10:39:52.000000000 +0100 +@@ -0,0 +1,41 @@ ++ ++/* ++ * ++ * (C) Copyright IBM Corp. 
1998-2005 - All Rights Reserved ++ * ++ */ ++ ++#ifndef __MALAYALAMLAYOUTENGINE_H ++#define __MALAYALAMLAYOUTENGINE_H ++ ++#include "IndicLayoutEngine.h" ++ ++U_NAMESPACE_BEGIN ++ ++class MalayalamOpenTypeLayoutEngine : public IndicOpenTypeLayoutEngine ++{ ++public: ++ MalayalamOpenTypeLayoutEngine(const LEFontInstance *fontInstance, le_int32 scriptCode, le_int32 languageCode, ++ le_int32 typoFlags, const GlyphSubstitutionTableHeader *gsubTable) : ++ IndicOpenTypeLayoutEngine(fontInstance, scriptCode, languageCode, typoFlags, gsubTable) ++ ++ {} ++ ++ MalayalamOpenTypeLayoutEngine(const LEFontInstance *fontInstance, le_int32 scriptCode, le_int32 languageCode, ++ le_int32 typoFlags) : ++ IndicOpenTypeLayoutEngine(fontInstance, scriptCode, languageCode, typoFlags) ++ ++ {} ++ ++ virtual UClassID getDynamicClassID() const; ++ static UClassID getStaticClassID(); ++ ++protected: ++ virtual le_int32 glyphPostProcessing(LEGlyphStorage &tempGlyphStorage, LEGlyphStorage &glyphStorage, LEErrorCode &success); ++ ++ virtual le_int32 computeGlyphs(const LEUnicode chars[], le_int32 offset, le_int32 count, le_int32 max, le_bool rightToLeft, LEGlyphStorage &glyphStorage, LEErrorCode &success); ++}; ++ ++U_NAMESPACE_END ++#endif ++ diff --git a/icu.icuXXXX.rollbackabi.patch b/icu.icuXXXX.rollbackabi.patch new file mode 100644 index 0000000..038d4b6 --- /dev/null +++ b/icu.icuXXXX.rollbackabi.patch @@ -0,0 +1,131 @@ +diff -ru icu.5691/source/common/ucnv2022.c icu/source/common/ucnv2022.c +--- icu.5691/source/common/ucnv2022.c 2009-06-02 16:07:36.000000000 +0100 ++++ icu/source/common/ucnv2022.c 2009-06-02 16:21:56.000000000 +0100 +@@ -3566,7 +3566,7 @@ + /* include ASCII for JP */ + sa->addRange(sa->set, 0, 0x7f); + } +- if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { ++ if(cnvData->version==3 || cnvData->version==4) { + /* + * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0 + * because the bit is on for all JP versions 
although only versions 3 & 4 (JIS7 & JIS8) +diff -ru icu.5691/source/common/ucnv_ext.c icu/source/common/ucnv_ext.c +--- icu.5691/source/common/ucnv_ext.c 2009-06-02 16:07:36.000000000 +0100 ++++ icu/source/common/ucnv_ext.c 2009-06-02 16:23:12.000000000 +0100 +@@ -1031,7 +1031,7 @@ + + stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]; + +- useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); ++ useFallback=(UBool)(FALSE); + + /* enumerate the from-Unicode trie table */ + c=0; /* keep track of the current code point while enumerating */ +diff -ru icu.5691/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c +--- icu.5691/source/common/ucnvmbcs.c 2009-06-02 16:07:36.000000000 +0100 ++++ icu/source/common/ucnvmbcs.c 2009-06-02 16:23:50.000000000 +0100 +@@ -340,7 +340,7 @@ + + /* Miscellaneous ------------------------------------------------------------ */ + +-#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ ++/* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. 
*/ + + /* similar to ucnv_MBCSGetNextUChar() but recursive */ + static void +@@ -434,8 +434,6 @@ + pErrorCode); + } + +-#endif +- + U_CFUNC void + ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, + const USetAdder *sa, +@@ -511,7 +509,7 @@ + + bytes=mbcsTable->fromUnicodeBytes; + +- useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); ++ useFallback=(UBool)(FALSE); + + switch(mbcsTable->outputType) { + case MBCS_OUTPUT_3: +diff -ru icu.5691/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h +--- icu.5691/source/common/ucnvmbcs.h 2009-06-02 16:07:36.000000000 +0100 ++++ icu/source/common/ucnvmbcs.h 2009-06-02 16:23:50.000000000 +0100 +@@ -363,7 +363,8 @@ + ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, + UErrorCode *pErrorCode); + +-#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ ++/* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ ++ + /* + * Internal function returning a UnicodeSet for toUnicode() conversion. + * Currently only used for ISO-2022-CN, and only handles roundtrip mappings. +@@ -378,7 +379,6 @@ + UConverterUnicodeSet which, + uint8_t state, int32_t lowByte, int32_t highByte, + UErrorCode *pErrorCode); +-#endif + + /* + * Internal function returning a UnicodeSet for toUnicode() conversion. +diff -ru icu.5691/source/common/unicode/ucnv.h icu/source/common/unicode/ucnv.h +--- icu.5691/source/common/unicode/ucnv.h 2009-06-02 16:07:32.000000000 +0100 ++++ icu/source/common/unicode/ucnv.h 2009-06-02 16:20:18.000000000 +0100 +@@ -870,8 +870,6 @@ + typedef enum UConverterUnicodeSet { + /** Select the set of roundtrippable Unicode code points. @stable ICU 2.6 */ + UCNV_ROUNDTRIP_SET, +- /** Select the set of Unicode code points with roundtrip or fallback mappings. 
@draft ICU 4.0 */ +- UCNV_ROUNDTRIP_AND_FALLBACK_SET, + /** Number of UConverterUnicodeSet selectors. @stable ICU 2.6 */ + UCNV_SET_COUNT + } UConverterUnicodeSet; +@@ -880,16 +878,11 @@ + /** + * Returns the set of Unicode code points that can be converted by an ICU converter. + * +- * Returns one of several kinds of set: +- * +- * 1. UCNV_ROUNDTRIP_SET +- * ++ * The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET): + * The set of all Unicode code points that can be roundtrip-converted +- * (converted without any data loss) with the converter (ucnv_fromUnicode()). ++ * (converted without any data loss) with the converter. + * This set will not include code points that have fallback mappings + * or are only the result of reverse fallback mappings. +- * This set will also not include PUA code points with fallbacks, although +- * ucnv_fromUnicode() will always uses those mappings despite ucnv_setFallback(). + * See UTR #22 "Character Mapping Markup Language" + * at http://www.unicode.org/reports/tr22/ + * +@@ -900,12 +893,6 @@ + * by comparing its roundtrip set with the set of ExemplarCharacters from + * ICU's locale data or other sources + * +- * 2. UCNV_ROUNDTRIP_AND_FALLBACK_SET +- * +- * The set of all Unicode code points that can be converted with the converter (ucnv_fromUnicode()) +- * when fallbacks are turned on (see ucnv_setFallback()). +- * This set includes all code points with roundtrips and fallbacks (but not reverse fallbacks). +- * + * In the future, there may be more UConverterUnicodeSet choices to select + * sets with different properties. 
+ * +diff -ru icu.5691/source/test/intltest/convtest.cpp icu/source/test/intltest/convtest.cpp +--- icu.5691/source/test/intltest/convtest.cpp 2009-06-02 16:07:21.000000000 +0100 ++++ icu/source/test/intltest/convtest.cpp 2009-06-02 16:24:08.000000000 +0100 +@@ -552,7 +552,7 @@ + } + UConverterUnicodeSet which; + for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUnicodeSet)((int)which+1)) { +- if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { ++ if(FALSE) { + ucnv_setFallback(cnv, TRUE); + } + expected.add(0, cpLimit-1); diff --git a/icu.icuXXXX.virama.prevnext.patch b/icu.icuXXXX.virama.prevnext.patch new file mode 100644 index 0000000..49393c2 --- /dev/null +++ b/icu.icuXXXX.virama.prevnext.patch @@ -0,0 +1,98 @@ +diff -ur icu.orig/source/common/rbbi.cpp icu/source/common/rbbi.cpp +--- icu.orig/source/common/rbbi.cpp 2006-10-05 11:54:13.000000000 +0100 ++++ icu/source/common/rbbi.cpp 2006-10-05 11:57:31.000000000 +0100 +@@ -879,6 +879,22 @@ + RBBI_END // state machine processing is after end of user text. + }; + ++#define VIRAMA_SCRIPT(wc) ((wc) >= 0x0901 && (wc) <= 0x17FF) ++#define VIRAMA(wc) ((wc) == 0x094D || \ ++ (wc) == 0x09CD || \ ++ (wc) == 0x0A4D || \ ++ (wc) == 0x0ACD || \ ++ (wc) == 0x0B4D || \ ++ (wc) == 0x0BCD || \ ++ (wc) == 0x0C4D || \ ++ (wc) == 0x0CCD || \ ++ (wc) == 0x0D4D || \ ++ (wc) == 0x0DCA || \ ++ (wc) == 0x0E3A || \ ++ (wc) == 0x0F84 || \ ++ (wc) == 0x1039 || \ ++ (wc) == 0x17D2 || \ ++ (wc) == 0x200D) + + //----------------------------------------------------------------------------------- + // +@@ -896,6 +911,7 @@ + RBBIRunMode mode; + + RBBIStateTableRow *row; ++ UChar32 prevchar; + UChar32 c; + int32_t lookaheadStatus = 0; + int32_t lookaheadTagIdx = 0; +@@ -919,6 +935,7 @@ + // if we're already at the end of the text, return DONE. 
+ initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); + result = initialPosition; ++ prevchar = 0; + c = UTEXT_NEXT32(fText); + if (fData == NULL || c==U_SENTINEL) { + return BreakIterator::DONE; +@@ -1001,6 +1018,11 @@ + + // State Transition - move machine to its next state + // ++ if (VIRAMA_SCRIPT(c) && VIRAMA(prevchar)) ++ { ++ state = START_STATE; ++ row = (RBBIStateTableRow *) (tableData + tableRowLen * state); ++ } + state = row->fNextState[category]; + row = (RBBIStateTableRow *) + // (statetable->fTableData + (statetable->fRowLen * state)); +@@ -1059,6 +1081,7 @@ + // the input position. The next iteration will be processing the + // first real input character. + if (mode == RBBI_RUN) { ++ prevchar = c; + c = UTEXT_NEXT32(fText); + } else { + if (mode == RBBI_START) { +@@ -1107,6 +1130,7 @@ + int16_t category = 0; + RBBIRunMode mode; + RBBIStateTableRow *row; ++ UChar32 prevchar; + UChar32 c; + int32_t lookaheadStatus = 0; + int32_t result = 0; +@@ -1135,6 +1159,7 @@ + // Set up the starting char. + initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); + result = initialPosition; ++ prevchar = 0; + c = UTEXT_PREVIOUS32(fText); + + // Set the initial state for the state machine +@@ -1218,6 +1243,11 @@ + + // State Transition - move machine to its next state + // ++ if (VIRAMA_SCRIPT(prevchar) && VIRAMA(c)) ++ { ++ state = START_STATE; ++ row = (RBBIStateTableRow *) (statetable->fTableData + (statetable->fRowLen * state)); ++ } + state = row->fNextState[category]; + row = (RBBIStateTableRow *) + (statetable->fTableData + (statetable->fRowLen * state)); +@@ -1269,6 +1299,7 @@ + // the input position. The next iteration will be processing the + // first real input character. 
+ if (mode == RBBI_RUN) { ++ prevchar = c; + c = UTEXT_PREVIOUS32(fText); + } else { + if (mode == RBBI_START) { diff --git a/icu.rh429023.regexp.patch b/icu.rh429023.regexp.patch new file mode 100644 index 0000000..ef8eded --- /dev/null +++ b/icu.rh429023.regexp.patch @@ -0,0 +1,307 @@ +diff -ru icu.orig/source/common/uvectr32.cpp icu/source/common/uvectr32.cpp +--- icu.orig/source/common/uvectr32.cpp 2003-08-27 02:01:30.000000000 +0100 ++++ icu/source/common/uvectr32.cpp 2008-01-22 08:37:06.000000000 +0000 +@@ -1,6 +1,6 @@ + /* + ****************************************************************************** +-* Copyright (C) 1999-2003, International Business Machines Corporation and * ++* Copyright (C) 1999-2008, International Business Machines Corporation and * + * others. All Rights Reserved. * + ****************************************************************************** + * Date Name Description +@@ -26,6 +26,7 @@ + UVector32::UVector32(UErrorCode &status) : + count(0), + capacity(0), ++ maxCapacity(0), + elements(NULL) + { + _init(DEFUALT_CAPACITY, status); +@@ -34,6 +35,7 @@ + UVector32::UVector32(int32_t initialCapacity, UErrorCode &status) : + count(0), + capacity(0), ++ maxCapacity(0), + elements(0) + { + _init(initialCapacity, status); +@@ -46,6 +48,9 @@ + if (initialCapacity < 1) { + initialCapacity = DEFUALT_CAPACITY; + } ++ if (maxCapacity>0 && maxCapacity<initialCapacity) { ++ initialCapacity = maxCapacity; ++ } + elements = (int32_t *)uprv_malloc(sizeof(int32_t)*initialCapacity); + if (elements == 0) { + status = U_MEMORY_ALLOCATION_ERROR; +@@ -189,21 +194,35 @@ + UBool UVector32::expandCapacity(int32_t minimumCapacity, UErrorCode &status) { + if (capacity >= minimumCapacity) { + return TRUE; +- } else { +- int32_t newCap = capacity * 2; +- if (newCap < minimumCapacity) { +- newCap = minimumCapacity; +- } +- int32_t* newElems = (int32_t *)uprv_malloc(sizeof(int32_t)*newCap); +- if (newElems == 0) { +- status = U_MEMORY_ALLOCATION_ERROR; +- return 
FALSE; +- } +- uprv_memcpy(newElems, elements, sizeof(elements[0]) * count); +- uprv_free(elements); +- elements = newElems; +- capacity = newCap; +- return TRUE; ++ } ++ if (maxCapacity>0 && minimumCapacity>maxCapacity) { ++ status = U_BUFFER_OVERFLOW_ERROR; ++ return FALSE; ++ } ++ int32_t newCap = capacity * 2; ++ if (newCap < minimumCapacity) { ++ newCap = minimumCapacity; ++ } ++ if (maxCapacity > 0 && newCap > maxCapacity) { ++ newCap = maxCapacity; ++ } ++ int32_t* newElems = (int32_t *)uprv_malloc(sizeof(int32_t)*newCap); ++ if (newElems == 0) { ++ status = U_MEMORY_ALLOCATION_ERROR; ++ return FALSE; ++ } ++ uprv_memcpy(newElems, elements, sizeof(elements[0]) * count); ++ uprv_free(elements); ++ elements = newElems; ++ capacity = newCap; ++ return TRUE; ++} ++ ++void UVector32::setMaxCapacity(int32_t limit) { ++ U_ASSERT(limit >= 0); ++ maxCapacity = limit; ++ if (maxCapacity < 0) { ++ maxCapacity = 0; + } + } + +diff -ru icu.orig/source/common/uvectr32.h icu/source/common/uvectr32.h +--- icu.orig/source/common/uvectr32.h 2006-01-18 03:52:04.000000000 +0000 ++++ icu/source/common/uvectr32.h 2008-01-22 08:37:07.000000000 +0000 +@@ -1,6 +1,6 @@ + /* + ********************************************************************** +-* Copyright (C) 1999-2006, International Business Machines ++* Copyright (C) 1999-2008, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + */ +@@ -61,6 +61,8 @@ + int32_t count; + + int32_t capacity; ++ ++ int32_t maxCapacity; // Limit beyond which capacity is not permitted to grow. + + int32_t* elements; + +@@ -162,6 +164,14 @@ + int32_t *getBuffer() const; + + /** ++ * Set the maximum allowed buffer capacity for this vector/stack. ++ * Default with no limit set is unlimited, go until malloc() fails. ++ * A Limit of zero means unlimited capacity. ++ * Units are vector elements (32 bits each), not bytes. 
++ */ ++ void setMaxCapacity(int32_t limit); ++ ++ /** + * ICU "poor man's RTTI", returns a UClassID for this class. + */ + static UClassID U_EXPORT2 getStaticClassID(); +@@ -221,7 +231,9 @@ + } + + inline int32_t *UVector32::reserveBlock(int32_t size, UErrorCode &status) { +- ensureCapacity(count+size, status); ++ if (ensureCapacity(count+size, status) == FALSE) { ++ return NULL; ++ } + int32_t *rp = elements+count; + count += size; + return rp; +diff -ru icu.orig/source/i18n/regexcmp.cpp icu/source/i18n/regexcmp.cpp +--- icu.orig/source/i18n/regexcmp.cpp 2006-02-02 04:37:14.000000000 +0000 ++++ icu/source/i18n/regexcmp.cpp 2008-01-22 08:37:06.000000000 +0000 +@@ -1187,14 +1187,17 @@ + // Because capture groups can be forward-referenced by back-references, + // we fill the operand with the capture group number. At the end + // of compilation, it will be changed to the variable's location. +- U_ASSERT(groupNum > 0); +- int32_t op; +- if (fModeFlags & UREGEX_CASE_INSENSITIVE) { +- op = URX_BUILD(URX_BACKREF_I, groupNum); ++ if (groupNum < 1) { ++ error(U_REGEX_INVALID_BACK_REF); + } else { +- op = URX_BUILD(URX_BACKREF, groupNum); ++ int32_t op; ++ if (fModeFlags & UREGEX_CASE_INSENSITIVE) { ++ op = URX_BUILD(URX_BACKREF_I, groupNum); ++ } else { ++ op = URX_BUILD(URX_BACKREF, groupNum); ++ } ++ fRXPat->fCompiledPat->addElement(op, *fStatus); + } +- fRXPat->fCompiledPat->addElement(op, *fStatus); + } + break; + +diff -ru icu.orig/source/i18n/rematch.cpp icu/source/i18n/rematch.cpp +--- icu.orig/source/i18n/rematch.cpp 2005-08-25 19:02:20.000000000 +0100 ++++ icu/source/i18n/rematch.cpp 2008-01-22 08:37:44.000000000 +0000 +@@ -30,6 +30,15 @@ + + U_NAMESPACE_BEGIN + ++// Limit the size of the back track stack, to avoid system failures caused ++// by heap exhaustion. Units are in 32 bit words, not bytes. 
++// This value puts ICU's limits higher than most other regexp implementations, ++// which use recursion rather than the heap, and take more storage per ++// backtrack point. ++// This constant is _temporary_. Proper API to control the value will added. ++// ++static const int32_t BACKTRACK_STACK_CAPACITY = 8000000; ++ + //----------------------------------------------------------------------------- + // + // Constructor and Destructor +@@ -53,6 +62,8 @@ + } + if (fStack == NULL || fData == NULL) { + fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; ++ } else { ++ fStack->setMaxCapacity(BACKTRACK_STACK_CAPACITY); + } + + reset(*RegexStaticSets::gStaticSets->fEmptyString); +@@ -78,6 +89,8 @@ + } + if (fStack == NULL || fData == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; ++ } else { ++ fStack->setMaxCapacity(BACKTRACK_STACK_CAPACITY); + } + reset(input); + } +@@ -102,6 +115,8 @@ + } + if (fStack == NULL || fData == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; ++ } else { ++ fStack->setMaxCapacity(BACKTRACK_STACK_CAPACITY); + } + reset(*RegexStaticSets::gStaticSets->fEmptyString); + } +@@ -1015,6 +1030,14 @@ + inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatIdx, int32_t frameSize, UErrorCode &status) { + // push storage for a new frame. + int32_t *newFP = fStack->reserveBlock(frameSize, status); ++ if (newFP == NULL) { ++ // Heap allocation error on attempted stack expansion. ++ // We need to return a writable stack frame, so just return the ++ // previous frame. The match operation will stop quickly ++ // becuase of the error status, after which the frame will never ++ // be looked at again. ++ return fp; ++ } + fp = (REStackFrame *)(newFP - frameSize); // in case of realloc of stack. + + // New stack frame = copy of old top frame. 
+@@ -1030,8 +1053,8 @@ + fp->fPatIdx = savePatIdx; + return (REStackFrame *)newFP; + } +- +- ++ ++ + //-------------------------------------------------------------------------------- + // + // MatchAt This is the actual matching engine. +@@ -2262,6 +2285,7 @@ + } + + if (U_FAILURE(status)) { ++ isMatch = FALSE; + break; + } + } +diff -ru icu.orig/source/test/intltest/regextst.cpp icu/source/test/intltest/regextst.cpp +--- icu.orig/source/test/intltest/regextst.cpp 2005-07-05 19:39:00.000000000 +0100 ++++ icu/source/test/intltest/regextst.cpp 2008-01-22 08:38:21.000000000 +0000 +@@ -66,6 +66,10 @@ + case 6: name = "PerlTests"; + if (exec) PerlTests(); + break; ++ case 7: name = "Bug 6149"; ++ if (exec) Bug6149(); ++ break; ++ + + + default: name = ""; +@@ -1637,6 +1641,13 @@ + // UnicodeSet containing a string + REGEX_ERR("abc[{def}]xyz", 1, 10, U_REGEX_SET_CONTAINS_STRING); + ++ ++ // Invalid Back Reference \0 ++ // For ICU 3.8 and earlier ++ // For ICU versions newer than 3.8, \0 introduces an octal escape. ++ // ++ REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_INVALID_BACK_REF); ++ + } + + +@@ -2119,6 +2130,26 @@ + } + + ++//-------------------------------------------------------------- ++// ++// Bug6149 Verify limits to heap expansion for backtrack stack. ++// Use this pattern, ++// "(a?){1,}" ++// The zero-length match will repeat forever. 
++// (That this goes into a loop is another bug) ++// ++//--------------------------------------------------------------- ++void RegexTest::Bug6149() { ++ UnicodeString pattern("(a?){1,}"); ++ UnicodeString s("xyz"); ++ uint32_t flags = 0; ++ UErrorCode status = U_ZERO_ERROR; ++ ++ RegexMatcher matcher(pattern, s, flags, status); ++ UBool result = false; ++ REGEX_ASSERT_FAIL(result=matcher.matches(status), U_BUFFER_OVERFLOW_ERROR); ++ REGEX_ASSERT(result == FALSE); ++ } + + #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ + +diff -ru icu.orig/source/test/intltest/regextst.h icu/source/test/intltest/regextst.h +--- icu.orig/source/test/intltest/regextst.h 2003-12-03 06:58:28.000000000 +0000 ++++ icu/source/test/intltest/regextst.h 2008-01-22 08:37:06.000000000 +0000 +@@ -30,6 +30,7 @@ + virtual void Extended(); + virtual void Errors(); + virtual void PerlTests(); ++ virtual void Bug6149(); + + // The following functions are internal to the regexp tests. + virtual UBool doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int line); |