1 files changed, 397 insertions, 0 deletions
diff --git a/icu.icu6002.backport.patch b/icu.icu6002.backport.patch
new file mode 100644
index 0000000..51f0d75
--- /dev/null
+++ b/icu.icu6002.backport.patch
@@ -0,0 +1,397 @@
+diff -ru icu.6001/source/common/ucnv_ext.c icu/source/common/ucnv_ext.c
+--- icu.6001/source/common/ucnv_ext.c	2009-06-02 15:29:01.000000000 +0100
++++ icu/source/common/ucnv_ext.c	2009-06-02 15:29:18.000000000 +0100
+@@ -1036,15 +1036,13 @@
+     /* enumerate the from-Unicode trie table */
+     c=0; /* keep track of the current code point while enumerating */
+ 
+-    if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ||
+-        filter==UCNV_SET_FILTER_DBCS_ONLY ||
+-        filter==UCNV_SET_FILTER_SJIS ||
+-        filter==UCNV_SET_FILTER_GR94DBCS
++    if(filter==UCNV_SET_FILTER_2022_CN) {
++        minLength=3;
++    } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ||
++               filter!=UCNV_SET_FILTER_NONE
+     ) {
+         /* DBCS-only, ignore single-byte results */
+         minLength=2;
+-    } else if(filter==UCNV_SET_FILTER_2022_CN) {
+-        minLength=3;
+     } else {
+         minLength=1;
+     }
+@@ -1104,6 +1102,13 @@
+                                     continue;
+                                 }
+                                 break;
++                            case UCNV_SET_FILTER_HZ:
++                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
++                                     (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
++                                     (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {
++                                    continue;
++                                }
++                                break;
+                             default:
+                                 /*
+                                  * UCNV_SET_FILTER_NONE,
+diff -ru icu.6001/source/common/ucnvhz.c icu/source/common/ucnvhz.c
+--- icu.6001/source/common/ucnvhz.c	2009-06-02 15:29:01.000000000 +0100
++++ icu/source/common/ucnvhz.c	2009-06-02 15:29:15.000000000 +0100
+@@ -72,7 +72,7 @@
+     cnv->extraInfo = uprv_malloc(sizeof(UConverterDataHZ));
+     if(cnv->extraInfo != NULL){
+         uprv_memset(cnv->extraInfo, 0, sizeof(UConverterDataHZ));
+-        ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("ibm-1386",errorCode);
++        ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("GBK",errorCode);
+     }
+     else {
+         *errorCode = U_MEMORY_ALLOCATION_ERROR;
+@@ -141,7 +141,7 @@
+     UChar *myTarget = args->target;
+     const char *mySourceLimit = args->sourceLimit;
+     UChar32 targetUniChar = 0x0000;
+-    UChar mySourceChar = 0x0000;
++    int32_t mySourceChar = 0x0000;
+     UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo);
+     tempBuf[0]=0; 
+     tempBuf[1]=0;
+@@ -156,90 +156,71 @@
+             
+             mySourceChar= (unsigned char) *mySource++;
+ 
+-            switch(mySourceChar){
++            if(args->converter->mode == UCNV_TILDE) {
++                /* second byte after ~ */
++                args->converter->mode=0;
++                switch(mySourceChar) {
+                 case 0x0A:
+-                    if(args->converter->mode ==UCNV_TILDE){
+-                        args->converter->mode=0;
+-                        
+-                    }
+-                    *(myTarget++)=(UChar)mySourceChar;
++                    /* no output for ~\n (line-continuation marker) */
+                     continue;
+-            
+                 case UCNV_TILDE:
+-                    if(args->converter->mode ==UCNV_TILDE){
+-                        *(myTarget++)=(UChar)mySourceChar;
+-                        args->converter->mode=0;
+-                        continue;
+-                        
++                    if(args->offsets) {
++                        args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2);
+                     }
+-                    else if(args->converter->toUnicodeStatus !=0){
+-                        args->converter->mode=0;
+-                        break;
+-                    }
+-                    else{
+-                        args->converter->mode = UCNV_TILDE;
+-                        continue;
+-                    }
+-                
+-                
++                    *(myTarget++)=(UChar)mySourceChar;
++                    continue;
+                 case UCNV_OPEN_BRACE:
+-                    if(args->converter->mode == UCNV_TILDE){
+-                        args->converter->mode=0;
+-                        myData->isStateDBCS = TRUE;
+-                        continue;
+-                    }
+-                    else{
+-                        break;
+-                    }
+-               
+-                
++                    myData->isStateDBCS = TRUE;
++                    continue;
+                 case UCNV_CLOSE_BRACE:
+-                    if(args->converter->mode == UCNV_TILDE){
+-                        args->converter->mode=0;
+-                         myData->isStateDBCS = FALSE;
+-                        continue;
+-                    }
+-                    else{
+-                        break;
+-                    }
+-                
++                    myData->isStateDBCS = FALSE;
++                    continue;
+                 default:
+                      /* if the first byte is equal to TILDE and the trail byte
+                      * is not a valid byte then it is an error condition
+                      */
+-                    if(args->converter->mode == UCNV_TILDE){
+-                        args->converter->mode=0;
+-                        mySourceChar= (UChar)(((UCNV_TILDE+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));
+-                        goto SAVE_STATE;
+-                    }
+-                    
++                    mySourceChar = 0x7e00 | mySourceChar;
++                    targetUniChar = 0xffff;
+                     break;
+-
+-            }
+-             
+-            if(myData->isStateDBCS){
++                }
++            } else if(myData->isStateDBCS) {
+                 if(args->converter->toUnicodeStatus == 0x00){
+-                    args->converter->toUnicodeStatus = (UChar) mySourceChar;
++                    /* lead byte */
++                    if(mySourceChar == UCNV_TILDE) {
++                        args->converter->mode = UCNV_TILDE;
++                    } else {
++                        /* add another bit to distinguish a 0 byte from not having seen a lead byte */
++                        args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100);
++                    }
+                     continue;
+                 }
+                 else{
+-                    tempBuf[0] = (char) (args->converter->toUnicodeStatus+0x80) ;
+-                    tempBuf[1] = (char) (mySourceChar+0x80);
+-                    mySourceChar= (UChar)(((args->converter->toUnicodeStatus+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));
++                    /* trail byte */
++                    uint32_t leadByte = args->converter->toUnicodeStatus & 0xff;
++                    if( (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21) &&
++                        (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21)
++                    ) {
++                        tempBuf[0] = (char) (leadByte+0x80) ;
++                        tempBuf[1] = (char) (mySourceChar+0x80);
++                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
++                            tempBuf, 2, args->converter->useFallback);
++                    } else {
++                        targetUniChar = 0xffff;
++                    }
++                    /* add another bit so that the code below writes 2 bytes in case of error */
++                    mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
+                     args->converter->toUnicodeStatus =0x00;
+-                    targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
+-                        tempBuf, 2, args->converter->useFallback);
+                 }
+             }
+             else{
+-                if(args->converter->fromUnicodeStatus == 0x00){
+-                    targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
+-                        mySource - 1, 1, args->converter->useFallback);
+-                }
+-                else{
+-                    goto SAVE_STATE;
++                if(mySourceChar == UCNV_TILDE) {
++                    args->converter->mode = UCNV_TILDE;
++                    continue;
++                } else if(mySourceChar <= 0x7f) {
++                    targetUniChar = (UChar)mySourceChar;  /* ASCII */
++                } else {
++                    targetUniChar = 0xffff;
+                 }
+-
+             }
+             if(targetUniChar < 0xfffe){
+                 if(args->offsets) {
+@@ -248,26 +229,17 @@
+ 
+                 *(myTarget++)=(UChar)targetUniChar;
+             }
+-            else if(targetUniChar>=0xfffe){
+-SAVE_STATE:
++            else /* targetUniChar>=0xfffe */ {
+                 if(targetUniChar == 0xfffe){
+                     *err = U_INVALID_CHAR_FOUND;
+                 }
+                 else{
+                     *err = U_ILLEGAL_CHAR_FOUND;
+                 }
+-                if(myData->isStateDBCS){
+-                    /* this should never occur since isStateDBCS is set to true 
+-                     * only after tempBuf[0] and tempBuf[1]
+-                     * are set to the input ..  just to please BEAM 
+-                     */
+-                    if(tempBuf[0]==0 || tempBuf[1]==0){
+-                        *err = U_INTERNAL_PROGRAM_ERROR;
+-                    }else{
+-                        args->converter->toUBytes[0] = (uint8_t)(tempBuf[0]-0x80);
+-                        args->converter->toUBytes[1] = (uint8_t)(tempBuf[1]-0x80);
+-                        args->converter->toULength=2;
+-                    }
++                if(mySourceChar > 0xff){
++                    args->converter->toUBytes[0] = (uint8_t)(mySourceChar >> 8);
++                    args->converter->toUBytes[1] = (uint8_t)mySourceChar;
++                    args->converter->toULength=2;
+                 }
+                 else{
+                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
+@@ -328,16 +300,21 @@
+                 escSeq = TILDE_ESCAPE;
+                 CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
+                 continue;
+-            }
+-            else{
++            } else if(mySourceChar <= 0x7f) {
++                length = 1;
++                targetUniChar = mySourceChar;
++            } else {
+                 length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->sharedData,
+                     mySourceChar,&targetUniChar,args->converter->useFallback);
+-
+-            }
+-            /* only DBCS or SBCS characters are expected*/
+-            /* DB haracters with high bit set to 1 are expected */
+-            if(length > 2 || length==0 ||(((targetUniChar & 0x8080) != 0x8080)&& length==2)){
+-                targetUniChar= missingCharMarker;
++                /* we can only use lead bytes 21..7D and trail bytes 21..7E */
++                if( length == 2 &&
++                    (uint16_t)(targetUniChar - 0xa1a1) <= (0xfdfe - 0xa1a1) &&
++                    (uint8_t)(targetUniChar - 0xa1) <= (0xfe - 0xa1)
++                ) {
++                    targetUniChar -= 0x8080;
++                } else {
++                    targetUniChar = missingCharMarker;
++                }
+             }
+             if (targetUniChar != missingCharMarker){
+                myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool)(targetUniChar>0x00FF);     
+@@ -360,22 +337,22 @@
+             
+                 if(isTargetUCharDBCS){
+                     if( myTargetIndex <targetLength){
+-                        myTarget[myTargetIndex++] =(char) ((targetUniChar >> 8) -0x80);
++                        myTarget[myTargetIndex++] =(char) (targetUniChar >> 8);
+                         if(offsets){
+                             *(offsets++) = mySourceIndex-1;
+                         }
+                         if(myTargetIndex < targetLength){
+-                            myTarget[myTargetIndex++] =(char) ((targetUniChar & 0x00FF) -0x80);
++                            myTarget[myTargetIndex++] =(char) targetUniChar;
+                             if(offsets){
+                                 *(offsets++) = mySourceIndex-1;
+                             }
+                         }else{
+-                            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
++                            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
+                             *err = U_BUFFER_OVERFLOW_ERROR;
+                         } 
+                     }else{
+-                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) ((targetUniChar >> 8) -0x80);
+-                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
++                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) (targetUniChar >> 8);
++                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
+                         *err = U_BUFFER_OVERFLOW_ERROR;
+                     }
+ 
+@@ -524,15 +501,14 @@
+                   const USetAdder *sa,
+                   UConverterUnicodeSet which,
+                   UErrorCode *pErrorCode) {
+-    /* the tilde '~' is hardcoded in the converter */
+-    sa->add(sa->set, 0x7e);
++    /* HZ converts all of ASCII */
++    sa->addRange(sa->set, 0, 0x7f);
+ 
+     /* add all of the code points that the sub-converter handles */
+-    /* ucnv_MBCSGetFilteredUnicodeSetForUnicode(((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, sa, which, UCNV_SET_FILTER_GR94DBCS, pErrorCode); */
+-    ((UConverterDataHZ*)cnv->extraInfo)->
+-        gbConverter->sharedData->impl->
+-            getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter,
+-                          sa, which, pErrorCode);
++    ucnv_MBCSGetFilteredUnicodeSetForUnicode(
++        ((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData,
++        sa, which, UCNV_SET_FILTER_HZ,
++        pErrorCode);
+ }
+ 
+ static const UConverterImpl _HZImpl={
+diff -ru icu.6001/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c
+--- icu.6001/source/common/ucnvmbcs.c	2009-06-02 15:29:01.000000000 +0100
++++ icu/source/common/ucnvmbcs.c	2009-06-02 15:35:01.000000000 +0100
+@@ -612,6 +612,19 @@
+                                 stage3+=2;  /* +=st3Multiplier */
+                             } while((++c&0xf)!=0);
+                             break;
++                        case UCNV_SET_FILTER_HZ:
++                            /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD, trail byte A1..FE). */
++                            do {
++                                if( ((st3&1)!=0 || useFallback) &&
++                                    (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
++                                    (uint8_t)(value - 0xa1)<=(0xfe - 0xa1)
++                                ) {
++                                    sa->add(sa->set, c);
++                                }
++                                st3>>=1;
++                                stage3+=2;  /* +=st3Multiplier */
++                            } while((++c&0xf)!=0);
++                            break;
+                         default:
+                             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
+                             return;
+diff -ru icu.6001/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h
+--- icu.6001/source/common/ucnvmbcs.h	2009-06-02 15:29:01.000000000 +0100
++++ icu/source/common/ucnvmbcs.h	2009-06-02 15:29:15.000000000 +0100
+@@ -400,6 +400,7 @@
+     UCNV_SET_FILTER_2022_CN,
+     UCNV_SET_FILTER_SJIS,
+     UCNV_SET_FILTER_GR94DBCS,
++    UCNV_SET_FILTER_HZ,
+     UCNV_SET_FILTER_COUNT
+ } UConverterSetFilter;
+ 
+diff -ru icu.6001/source/test/cintltst/ncnvtst.c icu/source/test/cintltst/ncnvtst.c
+--- icu.6001/source/test/cintltst/ncnvtst.c	2009-06-02 15:28:46.000000000 +0100
++++ icu/source/test/cintltst/ncnvtst.c	2009-06-02 15:29:15.000000000 +0100
+@@ -1928,7 +1928,7 @@
+ #if !UCONFIG_NO_LEGACY_CONVERSION
+         { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff },
+         { "windows-1251", 0, 0x7f, 0x410, 0x44f, 0x3000, 0xd7ff },
+-        { "HZ", 0x410, 0x44f, 0x4e00, 0x4eff, 0xac00, 0xd7ff },
++        /* HZ test case fixed and moved to intltest's conversion.txt, ticket #6002 */
+         { "shift-jis", 0x3041, 0x3093, 0x30a1, 0x30f3, 0x900, 0x1cff }
+ #else
+         { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff }
+diff -ru icu.6001/source/test/intltest/convtest.cpp icu/source/test/intltest/convtest.cpp
+--- icu.6001/source/test/intltest/convtest.cpp	2009-06-02 15:28:46.000000000 +0100
++++ icu/source/test/intltest/convtest.cpp	2009-06-02 15:29:15.000000000 +0100
+@@ -527,7 +527,7 @@
+         "Shift-JIS",
+         "ibm-1390",  // EBCDIC_STATEFUL table
+         "ibm-16684",  // DBCS-only extension table based on EBCDIC_STATEFUL table
+-        // "HZ", TODO(markus): known bug, the set incorrectly contains [\u02CA\u02CB\u02D9\u2010\u2013\u2015...]
++        "HZ",
+         "ISO-2022-JP",
+         "JIS7",
+         "ISO-2022-CN",
+diff -ru icu.6001/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt
+--- icu.6001/source/test/testdata/conversion.txt	2009-06-02 15:28:46.000000000 +0100
++++ icu/source/test/testdata/conversion.txt	2009-06-02 15:29:15.000000000 +0100
+@@ -48,6 +48,14 @@
+     toUnicode {
+       Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
+       Cases {
++        // test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e
++        {
++          "HZ",
++          :bin{ 7e7b21212120217e217f772100007e217e7d207e7e807e0a2b },
++          "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd ~\ufffd+",
++          :intvector{ 2,4,6,8,10,12,14,18,19,21,24 },
++          :int{1}, :int{1}, "", "?", :bin{""}
++        }
+         // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
+         // using the Shift-JIS table for JIS X 0208 (ticket #5797)
+         {
+@@ -1244,6 +1252,14 @@
+           :int{0}
+         }
+ 
++        // HZ
++        {
++          "HZ",
++          "[\u0410-\u044f\u4e00\u4e01\u4e03]",
++          "[\u4e02\u4e04-\u4e06\uac00-\ud7ff]",
++          :int{0}
++        }
++        
+         // DBCS-only
+         {
+           "ibm-971",