diff options
Diffstat (limited to 'icu.icu5797.backport.patch')
-rw-r--r-- | icu.icu5797.backport.patch | 749 |
1 files changed, 749 insertions, 0 deletions
diff --git a/icu.icu5797.backport.patch b/icu.icu5797.backport.patch new file mode 100644 index 0000000..39e3f77 --- /dev/null +++ b/icu.icu5797.backport.patch @@ -0,0 +1,749 @@ +diff -ru icu.5483/source/common/ucnv2022.c icu/source/common/ucnv2022.c +--- icu.5483/source/common/ucnv2022.c 2009-06-02 12:47:41.000000000 +0100 ++++ icu/source/common/ucnv2022.c 2009-06-02 13:18:23.000000000 +0100 +@@ -473,8 +473,7 @@ + if(jpCharsetMasks[version]&CSM(ISO8859_7)) { + myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode); + } +- myConverterData->myConverterArray[JISX201] = ucnv_loadSharedData("JISX0201", NULL, errorCode); +- myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("jisx-208", NULL, errorCode); ++ myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("Shift-JIS", NULL, errorCode); + if(jpCharsetMasks[version]&CSM(JISX212)) { + myConverterData->myConverterArray[JISX212] = ucnv_loadSharedData("jisx-212", NULL, errorCode); + } +@@ -1045,14 +1044,6 @@ + length=3; + } + } +- /* +- * TODO(markus): Use Shift-JIS table for JIS X 0208, to save mapping table space. +- * Pass in parameter for type of output bytes, for validation and shifting: +- * - Direct: Pass bytes through, but forbid control codes 00-1F (except SI/SO/ESC) and space 20? +- * (Need to allow some (TAB/LF/CR) or most of them for ASCII and maybe JIS X 0201.) +- * - A1-FE: Subtract 80 after range check. +- * - SJIS: Shift DBCS result to 21-7E x 21-7E. +- */ + /* is this code point assigned, or do we use fallbacks? */ + if((stage2Entry&(1<<(16+(c&0xf))))!=0) { + /* assigned */ +@@ -1110,6 +1101,23 @@ + } + } + ++/* ++ * Check that the result is a 2-byte value with each byte in the range A1..FE ++ * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte ++ * to move it to the ISO 2022 range 21..7E. ++ * Return 0 if out of range. ++ */ ++static U_INLINE uint32_t ++_2022FromGR94DBCS(uint32_t value) { ++ if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && ++ (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) ++ ) { ++ return value - 0x8080; /* shift down to 21..7e byte range */ ++ } else { ++ return 0; /* not valid for ISO 2022 */ ++ } ++} ++ + #ifdef U_ENABLE_GENERIC_ISO_2022 + + /********************************************************************************** +@@ -1238,7 +1246,7 @@ + } + else{ + cnv->toUBytes[0] =(char) sourceChar; +- cnv->toULength = 2; ++ cnv->toULength = 1; + } + + if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){ +@@ -1332,6 +1340,181 @@ + 3 /* length of <ESC>(I HWKANA_7BIT */ + }; + ++/* Map 00..7F to Unicode according to JIS X 0201. */ ++static U_INLINE uint32_t ++jisx201ToU(uint32_t value) { ++ if(value < 0x5c) { ++ return value; ++ } else if(value == 0x5c) { ++ return 0xa5; ++ } else if(value == 0x7e) { ++ return 0x203e; ++ } else /* value <= 0x7f */ { ++ return value; ++ } ++} ++ ++/* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */ ++static U_INLINE uint32_t ++jisx201FromU(uint32_t value) { ++ if(value<=0x7f) { ++ if(value!=0x5c && value!=0x7e) { ++ return value; ++ } ++ } else if(value==0xa5) { ++ return 0x5c; ++ } else if(value==0x203e) { ++ return 0x7e; ++ } ++ return 0xfffe; ++} ++ ++/* ++ * Take a valid Shift-JIS byte pair, check that it is in the range corresponding ++ * to JIS X 0208, and convert it to a pair of 21..7E bytes. ++ * Return 0 if the byte pair is out of range. ++ */ ++static U_INLINE uint32_t ++_2022FromSJIS(uint32_t value) { ++ uint8_t trail; ++ ++ if(value > 0xEFFC) { ++ return 0; /* beyond JIS X 0208 */ ++ } ++ ++ trail = (uint8_t)value; ++ ++ value &= 0xff00; /* lead byte */ ++ if(value <= 0x9f00) { ++ value -= 0x7000; ++ } else /* 0xe000 <= value <= 0xef00 */ { ++ value -= 0xb000; ++ } ++ value <<= 1; ++ ++ if(trail <= 0x9e) { ++ value -= 0x100; ++ if(trail <= 0x7e) { ++ value |= trail - 0x1f; ++ } else { ++ value |= trail - 0x20; ++ } ++ } else /* trail <= 0xfc */ { ++ value |= trail - 0x7e; ++ } ++ return value; ++} ++ ++/* ++ * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS. ++ * If either byte is outside 21..7E make sure that the result is not valid ++ * for Shift-JIS so that the converter catches it. ++ * Some invalid byte values already turn into equally invalid Shift-JIS ++ * byte values and need not be tested explicitly. ++ */ ++static U_INLINE void ++_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) { ++ if(c1&1) { ++ ++c1; ++ if(c2 <= 0x5f) { ++ c2 += 0x1f; ++ } else if(c2 <= 0x7e) { ++ c2 += 0x20; ++ } else { ++ c2 = 0; /* invalid */ ++ } ++ } else { ++ if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) { ++ c2 += 0x7e; ++ } else { ++ c2 = 0; /* invalid */ ++ } ++ } ++ c1 >>= 1; ++ if(c1 <= 0x2f) { ++ c1 += 0x70; ++ } else if(c1 <= 0x3f) { ++ c1 += 0xb0; ++ } else { ++ c1 = 0; /* invalid */ ++ } ++ bytes[0] = (char)c1; ++ bytes[1] = (char)c2; ++} ++ ++/* ++ * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS) ++ * Katakana. ++ * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks ++ * because Shift-JIS roundtrips half-width Katakana to single bytes. ++ * These were the only fallbacks in ICU's jisx-208.ucm file. ++ */ ++static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = { ++ 0x2123, /* U+FF61 */ ++ 0x2156, ++ 0x2157, ++ 0x2122, ++ 0x2126, ++ 0x2572, ++ 0x2521, ++ 0x2523, ++ 0x2525, ++ 0x2527, ++ 0x2529, ++ 0x2563, ++ 0x2565, ++ 0x2567, ++ 0x2543, ++ 0x213C, /* U+FF70 */ ++ 0x2522, ++ 0x2524, ++ 0x2526, ++ 0x2528, ++ 0x252A, ++ 0x252B, ++ 0x252D, ++ 0x252F, ++ 0x2531, ++ 0x2533, ++ 0x2535, ++ 0x2537, ++ 0x2539, ++ 0x253B, ++ 0x253D, ++ 0x253F, /* U+FF80 */ ++ 0x2541, ++ 0x2544, ++ 0x2546, ++ 0x2548, ++ 0x254A, ++ 0x254B, ++ 0x254C, ++ 0x254D, ++ 0x254E, ++ 0x254F, ++ 0x2552, ++ 0x2555, ++ 0x2558, ++ 0x255B, ++ 0x255E, ++ 0x255F, /* U+FF90 */ ++ 0x2560, ++ 0x2561, ++ 0x2562, ++ 0x2564, ++ 0x2566, ++ 0x2568, ++ 0x2569, ++ 0x256A, ++ 0x256B, ++ 0x256C, ++ 0x256D, ++ 0x256F, ++ 0x2573, ++ 0x212B, ++ 0x212C /* U+FF9F */ ++}; ++ + /* + * The iteration over various code pages works this way: + * i) Get the currentState from myConverterData->currentState +@@ -1504,7 +1687,7 @@ + } + break; + case HWKANA_7BIT: +- if((uint32_t)(HWKANA_END-sourceChar)<=(HWKANA_END-HWKANA_START)) { ++ if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { + if(converterData->version==3) { + /* JIS7: use G1 (SO) */ + /* Shift U+FF61..U+FF9F to bytes 21..5F. */ +@@ -1531,13 +1714,34 @@ + break; + case JISX201: + /* G0 SBCS */ +- len2 = MBCS_SINGLE_FROM_UCHAR32( ++ value = jisx201FromU(sourceChar); ++ if(value <= 0x7f) { ++ targetValue = value; ++ len = 1; ++ cs = cs0; ++ g = 0; ++ useFallback = FALSE; ++ } ++ break; ++ case JISX208: ++ /* G0 DBCS from Shift-JIS table */ ++ len2 = MBCS_FROM_UCHAR32_ISO2022( + converterData->myConverterArray[cs0], + sourceChar, &value, +- useFallback); +- if(len2 != 0 && !(len2 < 0 && len != 0) && value <= 0x7f) { +- targetValue = value; +- len = len2; ++ useFallback, MBCS_OUTPUT_2); ++ if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ ++ value = _2022FromSJIS(value); ++ if(value != 0) { ++ targetValue = value; ++ len = len2; ++ cs = cs0; ++ g = 0; ++ useFallback = FALSE; ++ } ++ } else if(len == 0 && useFallback && ++ (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { ++ targetValue = hwkana_fb[sourceChar - HWKANA_START]; ++ len = -2; + cs = cs0; + g = 0; + useFallback = FALSE; +@@ -1569,17 +1773,10 @@ + * Check for valid bytes for the encoding scheme. + * This is necessary because the sub-converter (windows-949) + * has a broader encoding scheme than is valid for 2022. +- * +- * Check that the result is a 2-byte value with each byte in the range A1..FE +- * (strict EUC-KR DBCS) before accepting it and subtracting 0x80 from each byte +- * to move it to the ISO 2022 range 21..7E. + */ +- if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && +- (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) +- ) { +- value -= 0x8080; /* shift down to 21..7e byte range */ +- } else { +- break; /* not valid for ISO 2022 */ ++ value = _2022FromGR94DBCS(value); ++ if(value == 0) { ++ break; + } + } + targetValue = value; +@@ -1755,7 +1952,7 @@ + static void + UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, + UErrorCode* err){ +- char tempBuf[3]; ++ char tempBuf[2]; + const char *mySource = (char *) args->source; + UChar *myTarget = args->target; + const char *mySourceLimit = args->sourceLimit; +@@ -1893,10 +2090,7 @@ + break; + case JISX201: + if(mySourceChar <= 0x7f) { +- targetUniChar = +- _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP( +- myData->myConverterArray[cs], +- mySourceChar); ++ targetUniChar = jisx201ToU(mySourceChar); + } + break; + case HWKANA_7BIT: +@@ -1910,8 +2104,13 @@ + if(mySource < mySourceLimit) { + char trailByte; + getTrailByte: +- tempBuf[0] = (char) (mySourceChar); +- tempBuf[1] = trailByte = *mySource++; ++ trailByte = *mySource++; ++ if(cs == JISX208) { ++ _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf); ++ } else { ++ tempBuf[0] = (char)mySourceChar; ++ tempBuf[1] = trailByte; ++ } + mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); + targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); + } else { +@@ -3254,6 +3453,9 @@ + /* open a set and initialize it with code points that are algorithmically round-tripped */ + switch(cnvData->locale[0]){ + case 'j': ++ /* include JIS X 0201 which is hardcoded */ ++ sa->add(sa->set, 0xa5); ++ sa->add(sa->set, 0x203e); + if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) { + /* include Latin-1 for some variants of JP */ + sa->addRange(sa->set, 0, 0xff); +@@ -3262,6 +3464,11 @@ + sa->addRange(sa->set, 0, 0x7f); + } + if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) { ++ /* ++ * TODO(markus): If and when ucnv_getUnicodeSet() supports fallbacks, ++ * we need to include half-width Katakana for all JP variants because ++ * JIS X 0208 has hardcoded fallbacks for them. ++ */ + /* include half-width Katakana for JP */ + sa->addRange(sa->set, HWKANA_START, HWKANA_END); + } +@@ -3281,15 +3488,7 @@ + break; + } + +- /* +- * Version-specific for CN: +- * CN version 0 does not map CNS planes 3..7 although +- * they are all available in the CNS conversion table; +- * CN version 1 does map them all. +- * The two versions create different Unicode sets. +- */ +- for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { +- if(cnvData->myConverterArray[i]!=NULL) { ++#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ + if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && + cnvData->version==0 && i==CNS_11643 + ) { +@@ -3299,9 +3498,33 @@ + sa, UCNV_ROUNDTRIP_SET, + 0, 0x81, 0x82, + pErrorCode); ++ } ++#endif ++ ++ for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { ++ UConverterSetFilter filter; ++ if(cnvData->myConverterArray[i]!=NULL) { ++ if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && ++ cnvData->version==0 && i==CNS_11643 ++ ) { ++ /* ++ * Version-specific for CN: ++ * CN version 0 does not map CNS planes 3..7 although ++ * they are all available in the CNS conversion table; ++ * CN version 1 (-EXT) does map them all. ++ * The two versions create different Unicode sets. ++ */ ++ filter=UCNV_SET_FILTER_2022_CN; ++ } else if(cnvData->locale[0]=='j' && i==JISX208) { ++ /* ++ * Only add code points that map to Shift-JIS codes ++ * corresponding to JIS X 0208. ++ */ ++ filter=UCNV_SET_FILTER_SJIS; + } else { +- ucnv_MBCSGetUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, pErrorCode); ++ filter=UCNV_SET_FILTER_NONE; + } ++ ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode); + } + } + +diff -ru icu.5483/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c +--- icu.5483/source/common/ucnvmbcs.c 2009-06-02 12:47:41.000000000 +0100 ++++ icu/source/common/ucnvmbcs.c 2009-06-02 12:48:08.000000000 +0100 +@@ -340,6 +340,8 @@ + + /* Miscellaneous ------------------------------------------------------------ */ + ++#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ ++ + /* similar to ucnv_MBCSGetNextUChar() but recursive */ + static void + _getUnicodeSetForBytes(const UConverterSharedData *sharedData, +@@ -432,11 +434,14 @@ + pErrorCode); + } + ++#endif ++ + U_CFUNC void +-ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, +- const USetAdder *sa, +- UConverterUnicodeSet which, +- UErrorCode *pErrorCode) { ++ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, ++ const USetAdder *sa, ++ UConverterUnicodeSet which, ++ UConverterSetFilter filter, ++ UErrorCode *pErrorCode) { + const UConverterMBCSTable *mbcsTable; + const uint16_t *table; + +@@ -490,50 +495,26 @@ + c+=1024; /* empty stage 2 block */ + } + } +- } else if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY) { +- /* ignore single-byte results */ ++ } else { + const uint32_t *stage2; +- const uint16_t *stage3, *results; +- +- results=(const uint16_t *)mbcsTable->fromUnicodeBytes; +- +- for(st1=0; st1<maxStage1; ++st1) { +- st2=table[st1]; +- if(st2>(maxStage1>>1)) { +- stage2=(const uint32_t *)table+st2; +- for(st2=0; st2<64; ++st2) { +- if((st3=stage2[st2])!=0) { +- /* read the stage 3 block */ +- stage3=results+16*(uint32_t)(uint16_t)st3; ++ const uint8_t *stage3, *bytes; ++ uint32_t st3Multiplier; ++ uint32_t value; + +- /* get the roundtrip flags for the stage 3 block */ +- st3>>=16; ++ bytes=mbcsTable->fromUnicodeBytes; + +- /* +- * Add code points for which the roundtrip flag is set. +- * Once we get a set for fallback mappings, we have to check +- * non-roundtrip stage 3 results for whether they are 0. +- * See ucnv_MBCSFromUnicodeWithOffsets() for details. +- * +- * Ignore single-byte results (<0x100). +- */ +- do { +- if((st3&1)!=0 && *stage3>=0x100) { +- sa->add(sa->set, c); +- } +- st3>>=1; +- ++stage3; +- } while((++c&0xf)!=0); +- } else { +- c+=16; /* empty stage 3 block */ +- } +- } +- } else { +- c+=1024; /* empty stage 2 block */ +- } ++ switch(mbcsTable->outputType) { ++ case MBCS_OUTPUT_3: ++ case MBCS_OUTPUT_4_EUC: ++ st3Multiplier=3; ++ break; ++ case MBCS_OUTPUT_4: ++ st3Multiplier=4; ++ break; ++ default: ++ st3Multiplier=2; ++ break; + } +- } else { +- const uint32_t *stage2; + + for(st1=0; st1<maxStage1; ++st1) { + st2=table[st1]; +@@ -541,6 +522,9 @@ + stage2=(const uint32_t *)table+st2; + for(st2=0; st2<64; ++st2) { + if((st3=stage2[st2])!=0) { ++ /* read the stage 3 block */ ++ stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3; ++ + /* get the roundtrip flags for the stage 3 block */ + st3>>=16; + +@@ -550,12 +534,49 @@ + * non-roundtrip stage 3 results for whether they are 0. + * See ucnv_MBCSFromUnicodeWithOffsets() for details. + */ +- do { +- if(st3&1) { +- sa->add(sa->set, c); +- } +- st3>>=1; +- } while((++c&0xf)!=0); ++ switch(filter) { ++ case UCNV_SET_FILTER_NONE: ++ do { ++ if(st3&1) { ++ sa->add(sa->set, c); ++ } ++ st3>>=1; ++ } while((++c&0xf)!=0); ++ break; ++ case UCNV_SET_FILTER_DBCS_ONLY: ++ /* Ignore single-byte results (<0x100). */ ++ do { ++ if((st3&1)!=0 && *((const uint16_t *)stage3)>=0x100) { ++ sa->add(sa->set, c); ++ } ++ st3>>=1; ++ stage3+=2; /* +=st3Multiplier */ ++ } while((++c&0xf)!=0); ++ break; ++ case UCNV_SET_FILTER_2022_CN: ++ /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */ ++ do { ++ if((st3&1)!=0 && ((value=*stage3)==0x81 || value==0x82)) { ++ sa->add(sa->set, c); ++ } ++ st3>>=1; ++ stage3+=3; /* +=st3Multiplier */ ++ } while((++c&0xf)!=0); ++ break; ++ case UCNV_SET_FILTER_SJIS: ++ /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */ ++ do { ++ if((st3&1)!=0 && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) { ++ sa->add(sa->set, c); ++ } ++ st3>>=1; ++ stage3+=2; /* +=st3Multiplier */ ++ } while((++c&0xf)!=0); ++ break; ++ default: ++ *pErrorCode=U_INTERNAL_PROGRAM_ERROR; ++ return; ++ } + } else { + c+=16; /* empty stage 3 block */ + } +@@ -569,6 +590,19 @@ + ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode); + } + ++U_CFUNC void ++ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, ++ const USetAdder *sa, ++ UConverterUnicodeSet which, ++ UErrorCode *pErrorCode) { ++ ucnv_MBCSGetFilteredUnicodeSetForUnicode( ++ sharedData, sa, which, ++ sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? ++ UCNV_SET_FILTER_DBCS_ONLY : ++ UCNV_SET_FILTER_NONE, ++ pErrorCode); ++} ++ + static void + ucnv_MBCSGetUnicodeSet(const UConverter *cnv, + const USetAdder *sa, +diff -ru icu.5483/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h +--- icu.5483/source/common/ucnvmbcs.h 2009-06-02 12:47:41.000000000 +0100 ++++ icu/source/common/ucnvmbcs.h 2009-06-02 12:48:08.000000000 +0100 +@@ -363,6 +363,7 @@ + ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, + UErrorCode *pErrorCode); + ++#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ + /* + * Internal function returning a UnicodeSet for toUnicode() conversion. + * Currently only used for ISO-2022-CN, and only handles roundtrip mappings. +@@ -377,6 +378,7 @@ + UConverterUnicodeSet which, + uint8_t state, int32_t lowByte, int32_t highByte, + UErrorCode *pErrorCode); ++#endif + + /* + * Internal function returning a UnicodeSet for toUnicode() conversion. +@@ -388,9 +390,30 @@ + */ + U_CFUNC void + ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, +- const USetAdder *sa, +- UConverterUnicodeSet which, +- UErrorCode *pErrorCode); ++ const USetAdder *sa, ++ UConverterUnicodeSet which, ++ UErrorCode *pErrorCode); ++ ++typedef enum UConverterSetFilter { ++ UCNV_SET_FILTER_NONE, ++ UCNV_SET_FILTER_DBCS_ONLY, ++ UCNV_SET_FILTER_2022_CN, ++ UCNV_SET_FILTER_SJIS, ++ UCNV_SET_FILTER_COUNT ++} UConverterSetFilter; ++ ++/* ++ * Same as ucnv_MBCSGetUnicodeSetForUnicode() but ++ * the set can be filtered by encoding scheme. ++ * Used by stateful converters which share regular conversion tables ++ * but only use a subset of their mappings. ++ */ ++U_CFUNC void ++ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, ++ const USetAdder *sa, ++ UConverterUnicodeSet which, ++ UConverterSetFilter filter, ++ UErrorCode *pErrorCode); + + #endif + +diff -ru icu.5483/source/test/cintltst/nucnvtst.c icu/source/test/cintltst/nucnvtst.c +--- icu.5483/source/test/cintltst/nucnvtst.c 2009-06-02 12:47:25.000000000 +0100 ++++ icu/source/test/cintltst/nucnvtst.c 2009-06-02 12:58:02.000000000 +0100 +@@ -3202,7 +3202,7 @@ + 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x000D, 0x000A, + 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x000D, 0x000A, + 0x3005, 0x3006, 0x3007, 0x30FC, 0x2015, 0x2010, 0xFF0F, 0x005C, 0x000D, 0x000A, +- 0x301C, 0x2016, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A, ++ 0x3013, 0x2018, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A, + 0x201D, 0x3014, 0x000D, 0x000A, + 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A, + 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A, +@@ -3730,7 +3730,7 @@ + 0x52C8, 0x52CC, 0x52CF, 0x52D1, 0x52D4, 0x52D6, 0x52DB, 0x52DC, 0x000D, 0x000A, + 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x000D, 0x000A, + 0x3005, 0x3006, 0x3007, 0x30FC, 0x2015, 0x2010, 0xFF0F, 0x005C, 0x000D, 0x000A, +- 0x301C, 0x2016, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A, ++ 0x3013, 0x2018, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A, + 0x201D, 0x000D, 0x000A, + 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A, + 0x4F94, 0x4F97, 0x52BA, 0x52BB, 0x52BD, 0x52C0, 0x52C4, 0x52C6, 0x000D, 0x000A, +diff -ru icu.5483/source/test/cintltst/udatatst.c icu/source/test/cintltst/udatatst.c +--- icu.5483/source/test/cintltst/udatatst.c 2009-06-02 12:47:25.000000000 +0100 ++++ icu/source/test/cintltst/udatatst.c 2009-06-02 13:09:15.000000000 +0100 +@@ -1260,6 +1260,11 @@ + {"gb18030", "cnv", ucnv_swap}, + /* MBCS conversion table file with extension */ + {"*test4x", "cnv", ucnv_swap}, ++ /* ++ * MBCS conversion table file without extension, ++ * to test swapping and preflighting of UTF-8-friendly mbcsIndex[]. ++ */ ++ {"jisx-212", "cnv", ucnv_swap}, + #endif + + #if !UCONFIG_NO_CONVERSION +diff -ru icu.5483/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt +--- icu.5483/source/test/testdata/conversion.txt 2009-06-02 12:47:25.000000000 +0100 ++++ icu/source/test/testdata/conversion.txt 2009-06-02 12:49:51.000000000 +0100 +@@ -48,6 +48,15 @@ + toUnicode { + Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" } + Cases { ++ // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and ++ // using the Shift-JIS table for JIS X 0208 (ticket #5797) ++ { ++ "ISO-2022-JP", ++ :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 }, ++ "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e", ++ :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 }, ++ :int{1}, :int{1}, "", "?", :bin{""} ++ } + // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets() + { + "ISO-8859-3", +@@ -495,6 +504,15 @@ + } + { "UTF-16BE", :bin{ 00 }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ 00 } } + { "UTF-16BE", :bin{ d800dc }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ d800dc } } ++ // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and ++ // using the Shift-JIS table for JIS X 0208 (ticket #5797) ++ { ++ "ISO-2022-JP", ++ "\u203e\xa5\u4e00\ufa10\u6f3e\u0391", ++ :bin{ 1b284a7e5c1b2442306c222e5f2126211b2842 }, ++ :intvector{ 0,0,0,0,1,2,2,2,2,2,3,3,4,4,5,5,5,5,5 }, ++ :int{1}, :int{0}, "", "?=\u3013", "" // U+3013 Geta Mark converts to 222e ++ } + // Verify that mappings that would result in byte values outside 20..7F (for SBCS) + // or 21..7E (for DBCS) are not used. + // ibm-9005_X110-2007.ucm (ISO 8859-7, <ESC>.F=1b2e46): +@@ -1273,13 +1291,13 @@ + // versions of ISO-2022-JP + { + "ISO-2022-JP", +- "[\x00-\x0d\x10-\x1a\x1c-\x7f\u0391-\u03a1\uff61-\uff9f\u4e00\u4e01\uffe5]", +- "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\uffe6-\U0010ffff]", ++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e00\u4e01\uffe5]", ++ "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\ufa2d\uffe6-\U0010ffff]", + :int{0} + } + { + "ISO-2022-JP-2", +- "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0385-\u038a\u0390-\u03a1\uff61-\uff9f\u4e00-\u4e05\uffe6]", ++ "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]", + "[\x0e\x0f\x1b\uffe7-\U0010ffff]", + :int{0} + } |