summary refs log tree commit diff stats
path: root/icu.icu5797.backport.patch
diff options
context:
space:
mode:
Diffstat (limited to 'icu.icu5797.backport.patch')
-rw-r--r-- icu.icu5797.backport.patch 749
1 files changed, 749 insertions, 0 deletions
diff --git a/icu.icu5797.backport.patch b/icu.icu5797.backport.patch
new file mode 100644
index 0000000..39e3f77
--- /dev/null
+++ b/icu.icu5797.backport.patch
@@ -0,0 +1,749 @@
+diff -ru icu.5483/source/common/ucnv2022.c icu/source/common/ucnv2022.c
+--- icu.5483/source/common/ucnv2022.c 2009-06-02 12:47:41.000000000 +0100
++++ icu/source/common/ucnv2022.c 2009-06-02 13:18:23.000000000 +0100
+@@ -473,8 +473,7 @@
+ if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
+ myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode);
+ }
+- myConverterData->myConverterArray[JISX201] = ucnv_loadSharedData("JISX0201", NULL, errorCode);
+- myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("jisx-208", NULL, errorCode);
++ myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("Shift-JIS", NULL, errorCode);
+ if(jpCharsetMasks[version]&CSM(JISX212)) {
+ myConverterData->myConverterArray[JISX212] = ucnv_loadSharedData("jisx-212", NULL, errorCode);
+ }
+@@ -1045,14 +1044,6 @@
+ length=3;
+ }
+ }
+- /*
+- * TODO(markus): Use Shift-JIS table for JIS X 0208, to save mapping table space.
+- * Pass in parameter for type of output bytes, for validation and shifting:
+- * - Direct: Pass bytes through, but forbid control codes 00-1F (except SI/SO/ESC) and space 20?
+- * (Need to allow some (TAB/LF/CR) or most of them for ASCII and maybe JIS X 0201.)
+- * - A1-FE: Subtract 80 after range check.
+- * - SJIS: Shift DBCS result to 21-7E x 21-7E.
+- */
+ /* is this code point assigned, or do we use fallbacks? */
+ if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
+ /* assigned */
+@@ -1110,6 +1101,23 @@
+ }
+ }
+
++/*
++ * Check that the result is a 2-byte value with each byte in the range A1..FE
++ * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
++ * to move it to the ISO 2022 range 21..7E.
++ * Return 0 if out of range (0 is never a valid result because both bytes are >= 21).
++ */
++static U_INLINE uint32_t
++_2022FromGR94DBCS(uint32_t value) {
++ if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && /* A1A1..FEFE, so lead byte is A1..FE; the cast truncates, value is assumed to fit 16 bits */
++ (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) /* low (trail) byte is A1..FE */
++ ) {
++ return value - 0x8080; /* shift down to 21..7e byte range */
++ } else {
++ return 0; /* not valid for ISO 2022 */
++ }
++}
++
+ #ifdef U_ENABLE_GENERIC_ISO_2022
+
+ /**********************************************************************************
+@@ -1238,7 +1246,7 @@
+ }
+ else{
+ cnv->toUBytes[0] =(char) sourceChar;
+- cnv->toULength = 2;
++ cnv->toULength = 1;
+ }
+
+ if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
+@@ -1332,6 +1340,181 @@
+ 3 /* length of <ESC>(I HWKANA_7BIT */
+ };
+
++/* Map 00..7F to Unicode according to JIS X 0201: only 5C and 7E change; the input range is not checked here. */
++static U_INLINE uint32_t
++jisx201ToU(uint32_t value) {
++ if(value < 0x5c) {
++ return value; /* 00..5B map to themselves */
++ } else if(value == 0x5c) {
++ return 0xa5; /* 5C -> U+00A5 */
++ } else if(value == 0x7e) {
++ return 0x203e; /* 7E -> U+203E */
++ } else /* value <= 0x7f */ {
++ return value; /* 5D..7D and 7F map to themselves; larger input is returned unchanged too */
++ }
++}
++
++/* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable (inverse of jisx201ToU()). */
++static U_INLINE uint32_t
++jisx201FromU(uint32_t value) {
++ if(value<=0x7f) {
++ if(value!=0x5c && value!=0x7e) { /* U+005C and U+007E have no mapping: their byte values are taken by U+00A5/U+203E */
++ return value;
++ }
++ } else if(value==0xa5) {
++ return 0x5c; /* U+00A5 -> 5C */
++ } else if(value==0x203e) {
++ return 0x7e; /* U+203E -> 7E */
++ }
++ return 0xfffe; /* unmappable; always greater than 7F, so callers can test for result <= 0x7f */
++}
++
++/*
++ * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
++ * to JIS X 0208, and convert it to a pair of 21..7E bytes.
++ * Return 0 if the byte pair is out of range (only the upper bound EFFC is tested here).
++ */
++static U_INLINE uint32_t
++_2022FromSJIS(uint32_t value) {
++ uint8_t trail;
++
++ if(value > 0xEFFC) {
++ return 0; /* beyond JIS X 0208 */
++ }
++
++ trail = (uint8_t)value; /* keep the low byte before the lead byte is isolated below */
++
++ value &= 0xff00; /* lead byte */
++ if(value <= 0x9f00) {
++ value -= 0x7000; /* e.g. lead 81 -> 11, 9F -> 2F (still in the high byte) */
++ } else /* 0xe000 <= value <= 0xef00 */ {
++ value -= 0xb000; /* e.g. lead E0 -> 30, EF -> 3F */
++ }
++ value <<= 1; /* one Shift-JIS lead byte covers two result lead bytes: now 22..5E or 60..7E (even) */
++
++ if(trail <= 0x9e) {
++ value -= 0x100; /* lower trail range belongs to the odd lead byte just below: 21..5D or 5F..7D */
++ if(trail <= 0x7e) {
++ value |= trail - 0x1f; /* e.g. trail 40 -> 21, 7E -> 5F */
++ } else {
++ value |= trail - 0x20; /* e.g. trail 80 -> 60, 9E -> 7E */
++ }
++ } else /* trail <= 0xfc */ {
++ value |= trail - 0x7e; /* even lead byte is kept; e.g. trail 9F -> 21, FC -> 7E */
++ }
++ return value;
++}
++
++/*
++ * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
++ * If either byte is outside 21..7E make sure that the result is not valid
++ * for Shift-JIS so that the converter catches it.
++ * Some invalid byte values already turn into equally invalid Shift-JIS
++ * byte values and need not be tested explicitly.
++ */
++static U_INLINE void
++_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
++ if(c1&1) { /* odd lead byte: the trail byte goes to the lower Shift-JIS trail range */
++ ++c1; /* round up to the even partner so that the halving below gives the same lead byte for both */
++ if(c2 <= 0x5f) {
++ c2 += 0x1f; /* e.g. 21 -> 40, 5F -> 7E */
++ } else if(c2 <= 0x7e) {
++ c2 += 0x20; /* e.g. 60 -> 80, 7E -> 9E (7F is skipped) */
++ } else {
++ c2 = 0; /* invalid */
++ }
++ } else { /* even lead byte: the trail byte goes to the upper Shift-JIS trail range */
++ if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
++ c2 += 0x7e; /* e.g. 21 -> 9F, 7E -> FC */
++ } else {
++ c2 = 0; /* invalid */
++ }
++ }
++ c1 >>= 1; /* two 2022 lead bytes share one Shift-JIS lead byte */
++ if(c1 <= 0x2f) {
++ c1 += 0x70; /* e.g. 11 -> 81, 2F -> 9F */
++ } else if(c1 <= 0x3f) {
++ c1 += 0xb0; /* e.g. 30 -> E0, 3F -> EF */
++ } else {
++ c1 = 0; /* invalid */
++ }
++ bytes[0] = (char)c1; /* Shift-JIS lead byte, or an invalid value */
++ bytes[1] = (char)c2; /* Shift-JIS trail byte, or an invalid value */
++}
++
++/*
++ * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
++ * Katakana. The table is indexed by (code point - HWKANA_START); values are 21..7E byte pairs.
++ * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
++ * because Shift-JIS roundtrips half-width Katakana to single bytes.
++ * These were the only fallbacks in ICU's jisx-208.ucm file.
++ */
++static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
++ 0x2123, /* U+FF61 */
++ 0x2156, /* U+FF62 */
++ 0x2157, /* U+FF63 */
++ 0x2122, /* U+FF64 */
++ 0x2126, /* U+FF65 */
++ 0x2572, /* U+FF66 */
++ 0x2521, /* U+FF67 */
++ 0x2523, /* U+FF68 */
++ 0x2525, /* U+FF69 */
++ 0x2527, /* U+FF6A */
++ 0x2529, /* U+FF6B */
++ 0x2563, /* U+FF6C */
++ 0x2565, /* U+FF6D */
++ 0x2567, /* U+FF6E */
++ 0x2543, /* U+FF6F */
++ 0x213C, /* U+FF70 */
++ 0x2522, /* U+FF71 */
++ 0x2524, /* U+FF72 */
++ 0x2526, /* U+FF73 */
++ 0x2528, /* U+FF74 */
++ 0x252A, /* U+FF75 */
++ 0x252B, /* U+FF76 */
++ 0x252D, /* U+FF77 */
++ 0x252F, /* U+FF78 */
++ 0x2531, /* U+FF79 */
++ 0x2533, /* U+FF7A */
++ 0x2535, /* U+FF7B */
++ 0x2537, /* U+FF7C */
++ 0x2539, /* U+FF7D */
++ 0x253B, /* U+FF7E */
++ 0x253D, /* U+FF7F */
++ 0x253F, /* U+FF80 */
++ 0x2541, /* U+FF81 */
++ 0x2544, /* U+FF82 */
++ 0x2546, /* U+FF83 */
++ 0x2548, /* U+FF84 */
++ 0x254A, /* U+FF85 */
++ 0x254B, /* U+FF86 */
++ 0x254C, /* U+FF87 */
++ 0x254D, /* U+FF88 */
++ 0x254E, /* U+FF89 */
++ 0x254F, /* U+FF8A */
++ 0x2552, /* U+FF8B */
++ 0x2555, /* U+FF8C */
++ 0x2558, /* U+FF8D */
++ 0x255B, /* U+FF8E */
++ 0x255E, /* U+FF8F */
++ 0x255F, /* U+FF90 */
++ 0x2560, /* U+FF91 */
++ 0x2561, /* U+FF92 */
++ 0x2562, /* U+FF93 */
++ 0x2564, /* U+FF94 */
++ 0x2566, /* U+FF95 */
++ 0x2568, /* U+FF96 */
++ 0x2569, /* U+FF97 */
++ 0x256A, /* U+FF98 */
++ 0x256B, /* U+FF99 */
++ 0x256C, /* U+FF9A */
++ 0x256D, /* U+FF9B */
++ 0x256F, /* U+FF9C */
++ 0x2573, /* U+FF9D */
++ 0x212B, /* U+FF9E */
++ 0x212C /* U+FF9F */
++};
++
+ /*
+ * The iteration over various code pages works this way:
+ * i) Get the currentState from myConverterData->currentState
+@@ -1504,7 +1687,7 @@
+ }
+ break;
+ case HWKANA_7BIT:
+- if((uint32_t)(HWKANA_END-sourceChar)<=(HWKANA_END-HWKANA_START)) {
++ if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
+ if(converterData->version==3) {
+ /* JIS7: use G1 (SO) */
+ /* Shift U+FF61..U+FF9F to bytes 21..5F. */
+@@ -1531,13 +1714,34 @@
+ break;
+ case JISX201:
+ /* G0 SBCS */
+- len2 = MBCS_SINGLE_FROM_UCHAR32(
++ value = jisx201FromU(sourceChar);
++ if(value <= 0x7f) {
++ targetValue = value;
++ len = 1;
++ cs = cs0;
++ g = 0;
++ useFallback = FALSE;
++ }
++ break;
++ case JISX208:
++ /* G0 DBCS from Shift-JIS table */
++ len2 = MBCS_FROM_UCHAR32_ISO2022(
+ converterData->myConverterArray[cs0],
+ sourceChar, &value,
+- useFallback);
+- if(len2 != 0 && !(len2 < 0 && len != 0) && value <= 0x7f) {
+- targetValue = value;
+- len = len2;
++ useFallback, MBCS_OUTPUT_2);
++ if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
++ value = _2022FromSJIS(value);
++ if(value != 0) {
++ targetValue = value;
++ len = len2;
++ cs = cs0;
++ g = 0;
++ useFallback = FALSE;
++ }
++ } else if(len == 0 && useFallback &&
++ (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
++ targetValue = hwkana_fb[sourceChar - HWKANA_START];
++ len = -2;
+ cs = cs0;
+ g = 0;
+ useFallback = FALSE;
+@@ -1569,17 +1773,10 @@
+ * Check for valid bytes for the encoding scheme.
+ * This is necessary because the sub-converter (windows-949)
+ * has a broader encoding scheme than is valid for 2022.
+- *
+- * Check that the result is a 2-byte value with each byte in the range A1..FE
+- * (strict EUC-KR DBCS) before accepting it and subtracting 0x80 from each byte
+- * to move it to the ISO 2022 range 21..7E.
+ */
+- if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
+- (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
+- ) {
+- value -= 0x8080; /* shift down to 21..7e byte range */
+- } else {
+- break; /* not valid for ISO 2022 */
++ value = _2022FromGR94DBCS(value);
++ if(value == 0) {
++ break;
+ }
+ }
+ targetValue = value;
+@@ -1755,7 +1952,7 @@
+ static void
+ UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
+ UErrorCode* err){
+- char tempBuf[3];
++ char tempBuf[2];
+ const char *mySource = (char *) args->source;
+ UChar *myTarget = args->target;
+ const char *mySourceLimit = args->sourceLimit;
+@@ -1893,10 +2090,7 @@
+ break;
+ case JISX201:
+ if(mySourceChar <= 0x7f) {
+- targetUniChar =
+- _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
+- myData->myConverterArray[cs],
+- mySourceChar);
++ targetUniChar = jisx201ToU(mySourceChar);
+ }
+ break;
+ case HWKANA_7BIT:
+@@ -1910,8 +2104,13 @@
+ if(mySource < mySourceLimit) {
+ char trailByte;
+ getTrailByte:
+- tempBuf[0] = (char) (mySourceChar);
+- tempBuf[1] = trailByte = *mySource++;
++ trailByte = *mySource++;
++ if(cs == JISX208) {
++ _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf);
++ } else {
++ tempBuf[0] = (char)mySourceChar;
++ tempBuf[1] = trailByte;
++ }
+ mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
+ targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
+ } else {
+@@ -3254,6 +3453,9 @@
+ /* open a set and initialize it with code points that are algorithmically round-tripped */
+ switch(cnvData->locale[0]){
+ case 'j':
++ /* include JIS X 0201 which is hardcoded */
++ sa->add(sa->set, 0xa5);
++ sa->add(sa->set, 0x203e);
+ if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
+ /* include Latin-1 for some variants of JP */
+ sa->addRange(sa->set, 0, 0xff);
+@@ -3262,6 +3464,11 @@
+ sa->addRange(sa->set, 0, 0x7f);
+ }
+ if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {
++ /*
++ * TODO(markus): If and when ucnv_getUnicodeSet() supports fallbacks,
++ * we need to include half-width Katakana for all JP variants because
++ * JIS X 0208 has hardcoded fallbacks for them.
++ */
+ /* include half-width Katakana for JP */
+ sa->addRange(sa->set, HWKANA_START, HWKANA_END);
+ }
+@@ -3281,15 +3488,7 @@
+ break;
+ }
+
+- /*
+- * Version-specific for CN:
+- * CN version 0 does not map CNS planes 3..7 although
+- * they are all available in the CNS conversion table;
+- * CN version 1 does map them all.
+- * The two versions create different Unicode sets.
+- */
+- for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
+- if(cnvData->myConverterArray[i]!=NULL) {
++#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
+ if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
+ cnvData->version==0 && i==CNS_11643
+ ) {
+@@ -3299,9 +3498,33 @@
+ sa, UCNV_ROUNDTRIP_SET,
+ 0, 0x81, 0x82,
+ pErrorCode);
++ }
++#endif
++
++ for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
++ UConverterSetFilter filter;
++ if(cnvData->myConverterArray[i]!=NULL) {
++ if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
++ cnvData->version==0 && i==CNS_11643
++ ) {
++ /*
++ * Version-specific for CN:
++ * CN version 0 does not map CNS planes 3..7 although
++ * they are all available in the CNS conversion table;
++ * CN version 1 (-EXT) does map them all.
++ * The two versions create different Unicode sets.
++ */
++ filter=UCNV_SET_FILTER_2022_CN;
++ } else if(cnvData->locale[0]=='j' && i==JISX208) {
++ /*
++ * Only add code points that map to Shift-JIS codes
++ * corresponding to JIS X 0208.
++ */
++ filter=UCNV_SET_FILTER_SJIS;
+ } else {
+- ucnv_MBCSGetUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, pErrorCode);
++ filter=UCNV_SET_FILTER_NONE;
+ }
++ ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
+ }
+ }
+
+diff -ru icu.5483/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c
+--- icu.5483/source/common/ucnvmbcs.c 2009-06-02 12:47:41.000000000 +0100
++++ icu/source/common/ucnvmbcs.c 2009-06-02 12:48:08.000000000 +0100
+@@ -340,6 +340,8 @@
+
+ /* Miscellaneous ------------------------------------------------------------ */
+
++#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
++
+ /* similar to ucnv_MBCSGetNextUChar() but recursive */
+ static void
+ _getUnicodeSetForBytes(const UConverterSharedData *sharedData,
+@@ -432,11 +434,14 @@
+ pErrorCode);
+ }
+
++#endif
++
+ U_CFUNC void
+-ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
+- const USetAdder *sa,
+- UConverterUnicodeSet which,
+- UErrorCode *pErrorCode) {
++ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
++ const USetAdder *sa,
++ UConverterUnicodeSet which,
++ UConverterSetFilter filter,
++ UErrorCode *pErrorCode) {
+ const UConverterMBCSTable *mbcsTable;
+ const uint16_t *table;
+
+@@ -490,50 +495,26 @@
+ c+=1024; /* empty stage 2 block */
+ }
+ }
+- } else if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY) {
+- /* ignore single-byte results */
++ } else {
+ const uint32_t *stage2;
+- const uint16_t *stage3, *results;
+-
+- results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
+-
+- for(st1=0; st1<maxStage1; ++st1) {
+- st2=table[st1];
+- if(st2>(maxStage1>>1)) {
+- stage2=(const uint32_t *)table+st2;
+- for(st2=0; st2<64; ++st2) {
+- if((st3=stage2[st2])!=0) {
+- /* read the stage 3 block */
+- stage3=results+16*(uint32_t)(uint16_t)st3;
++ const uint8_t *stage3, *bytes;
++ uint32_t st3Multiplier;
++ uint32_t value;
+
+- /* get the roundtrip flags for the stage 3 block */
+- st3>>=16;
++ bytes=mbcsTable->fromUnicodeBytes;
+
+- /*
+- * Add code points for which the roundtrip flag is set.
+- * Once we get a set for fallback mappings, we have to check
+- * non-roundtrip stage 3 results for whether they are 0.
+- * See ucnv_MBCSFromUnicodeWithOffsets() for details.
+- *
+- * Ignore single-byte results (<0x100).
+- */
+- do {
+- if((st3&1)!=0 && *stage3>=0x100) {
+- sa->add(sa->set, c);
+- }
+- st3>>=1;
+- ++stage3;
+- } while((++c&0xf)!=0);
+- } else {
+- c+=16; /* empty stage 3 block */
+- }
+- }
+- } else {
+- c+=1024; /* empty stage 2 block */
+- }
++ switch(mbcsTable->outputType) {
++ case MBCS_OUTPUT_3:
++ case MBCS_OUTPUT_4_EUC:
++ st3Multiplier=3;
++ break;
++ case MBCS_OUTPUT_4:
++ st3Multiplier=4;
++ break;
++ default:
++ st3Multiplier=2;
++ break;
+ }
+- } else {
+- const uint32_t *stage2;
+
+ for(st1=0; st1<maxStage1; ++st1) {
+ st2=table[st1];
+@@ -541,6 +522,9 @@
+ stage2=(const uint32_t *)table+st2;
+ for(st2=0; st2<64; ++st2) {
+ if((st3=stage2[st2])!=0) {
++ /* read the stage 3 block */
++ stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3;
++
+ /* get the roundtrip flags for the stage 3 block */
+ st3>>=16;
+
+@@ -550,12 +534,49 @@
+ * non-roundtrip stage 3 results for whether they are 0.
+ * See ucnv_MBCSFromUnicodeWithOffsets() for details.
+ */
+- do {
+- if(st3&1) {
+- sa->add(sa->set, c);
+- }
+- st3>>=1;
+- } while((++c&0xf)!=0);
++ switch(filter) {
++ case UCNV_SET_FILTER_NONE:
++ do {
++ if(st3&1) {
++ sa->add(sa->set, c);
++ }
++ st3>>=1;
++ } while((++c&0xf)!=0);
++ break;
++ case UCNV_SET_FILTER_DBCS_ONLY:
++ /* Ignore single-byte results (<0x100). */
++ do {
++ if((st3&1)!=0 && *((const uint16_t *)stage3)>=0x100) {
++ sa->add(sa->set, c);
++ }
++ st3>>=1;
++ stage3+=2; /* +=st3Multiplier */
++ } while((++c&0xf)!=0);
++ break;
++ case UCNV_SET_FILTER_2022_CN:
++ /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
++ do {
++ if((st3&1)!=0 && ((value=*stage3)==0x81 || value==0x82)) {
++ sa->add(sa->set, c);
++ }
++ st3>>=1;
++ stage3+=3; /* +=st3Multiplier */
++ } while((++c&0xf)!=0);
++ break;
++ case UCNV_SET_FILTER_SJIS:
++ /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
++ do {
++ if((st3&1)!=0 && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
++ sa->add(sa->set, c);
++ }
++ st3>>=1;
++ stage3+=2; /* +=st3Multiplier */
++ } while((++c&0xf)!=0);
++ break;
++ default:
++ *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
++ return;
++ }
+ } else {
+ c+=16; /* empty stage 3 block */
+ }
+@@ -569,6 +590,19 @@
+ ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode);
+ }
+
++U_CFUNC void /* unfiltered entry point, now a thin wrapper around the filtered variant */
++ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
++ const USetAdder *sa,
++ UConverterUnicodeSet which,
++ UErrorCode *pErrorCode) {
++ ucnv_MBCSGetFilteredUnicodeSetForUnicode(
++ sharedData, sa, which,
++ sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? /* pick the filter from the table's output type */
++ UCNV_SET_FILTER_DBCS_ONLY : /* DBCS-only tables: single-byte results are ignored */
++ UCNV_SET_FILTER_NONE, /* all other tables: every roundtrip mapping is added */
++ pErrorCode);
++}
++
+ static void
+ ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
+ const USetAdder *sa,
+diff -ru icu.5483/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h
+--- icu.5483/source/common/ucnvmbcs.h 2009-06-02 12:47:41.000000000 +0100
++++ icu/source/common/ucnvmbcs.h 2009-06-02 12:48:08.000000000 +0100
+@@ -363,6 +363,7 @@
+ ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
+ UErrorCode *pErrorCode);
+
++#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
+ /*
+ * Internal function returning a UnicodeSet for toUnicode() conversion.
+ * Currently only used for ISO-2022-CN, and only handles roundtrip mappings.
+@@ -377,6 +378,7 @@
+ UConverterUnicodeSet which,
+ uint8_t state, int32_t lowByte, int32_t highByte,
+ UErrorCode *pErrorCode);
++#endif
+
+ /*
+ * Internal function returning a UnicodeSet for toUnicode() conversion.
+@@ -388,9 +390,30 @@
+ */
+ U_CFUNC void
+ ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
+- const USetAdder *sa,
+- UConverterUnicodeSet which,
+- UErrorCode *pErrorCode);
++ const USetAdder *sa,
++ UConverterUnicodeSet which,
++ UErrorCode *pErrorCode);
++
++typedef enum UConverterSetFilter { /* selects which roundtrip mappings ucnv_MBCSGetFilteredUnicodeSetForUnicode() adds */
++ UCNV_SET_FILTER_NONE, /* add every code point whose roundtrip flag is set */
++ UCNV_SET_FILTER_DBCS_ONLY, /* skip 16-bit results below 0x100 (single-byte) */
++ UCNV_SET_FILTER_2022_CN, /* keep only results whose first byte is 0x81 or 0x82 (CNS 11643 planes 1 & 2) */
++ UCNV_SET_FILTER_SJIS, /* keep only 16-bit results in 8140..EFFC (Shift-JIS codes for JIS X 0208) */
++ UCNV_SET_FILTER_COUNT /* number of filters above; not a valid filter argument */
++} UConverterSetFilter;
++
++/*
++ * Same as ucnv_MBCSGetUnicodeSetForUnicode() but
++ * the set can be filtered by encoding scheme.
++ * Used by stateful converters which share regular conversion tables
++ * but only use a subset of their mappings (ISO-2022-CN and ISO-2022-JP in ucnv2022.c).
++ */
++U_CFUNC void
++ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
++ const USetAdder *sa, /* receives the selected code points via sa->add() */
++ UConverterUnicodeSet which,
++ UConverterSetFilter filter, /* unknown values set *pErrorCode to U_INTERNAL_PROGRAM_ERROR */
++ UErrorCode *pErrorCode);
+
+ #endif
+
+diff -ru icu.5483/source/test/cintltst/nucnvtst.c icu/source/test/cintltst/nucnvtst.c
+--- icu.5483/source/test/cintltst/nucnvtst.c 2009-06-02 12:47:25.000000000 +0100
++++ icu/source/test/cintltst/nucnvtst.c 2009-06-02 12:58:02.000000000 +0100
+@@ -3202,7 +3202,7 @@
+ 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x000D, 0x000A,
+ 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x000D, 0x000A,
+ 0x3005, 0x3006, 0x3007, 0x30FC, 0x2015, 0x2010, 0xFF0F, 0x005C, 0x000D, 0x000A,
+- 0x301C, 0x2016, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A,
++ 0x3013, 0x2018, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A,
+ 0x201D, 0x3014, 0x000D, 0x000A,
+ 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A,
+ 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A,
+@@ -3730,7 +3730,7 @@
+ 0x52C8, 0x52CC, 0x52CF, 0x52D1, 0x52D4, 0x52D6, 0x52DB, 0x52DC, 0x000D, 0x000A,
+ 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x000D, 0x000A,
+ 0x3005, 0x3006, 0x3007, 0x30FC, 0x2015, 0x2010, 0xFF0F, 0x005C, 0x000D, 0x000A,
+- 0x301C, 0x2016, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A,
++ 0x3013, 0x2018, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A,
+ 0x201D, 0x000D, 0x000A,
+ 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A,
+ 0x4F94, 0x4F97, 0x52BA, 0x52BB, 0x52BD, 0x52C0, 0x52C4, 0x52C6, 0x000D, 0x000A,
+diff -ru icu.5483/source/test/cintltst/udatatst.c icu/source/test/cintltst/udatatst.c
+--- icu.5483/source/test/cintltst/udatatst.c 2009-06-02 12:47:25.000000000 +0100
++++ icu/source/test/cintltst/udatatst.c 2009-06-02 13:09:15.000000000 +0100
+@@ -1260,6 +1260,11 @@
+ {"gb18030", "cnv", ucnv_swap},
+ /* MBCS conversion table file with extension */
+ {"*test4x", "cnv", ucnv_swap},
++ /*
++ * MBCS conversion table file without extension,
++ * to test swapping and preflighting of UTF-8-friendly mbcsIndex[].
++ */
++ {"jisx-212", "cnv", ucnv_swap},
+ #endif
+
+ #if !UCONFIG_NO_CONVERSION
+diff -ru icu.5483/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt
+--- icu.5483/source/test/testdata/conversion.txt 2009-06-02 12:47:25.000000000 +0100
++++ icu/source/test/testdata/conversion.txt 2009-06-02 12:49:51.000000000 +0100
+@@ -48,6 +48,15 @@
+ toUnicode {
+ Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
+ Cases {
++ // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
++ // using the Shift-JIS table for JIS X 0208 (ticket #5797)
++ {
++ "ISO-2022-JP",
++ :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 },
++ "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",
++ :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 },
++ :int{1}, :int{1}, "", "?", :bin{""}
++ }
+ // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets()
+ {
+ "ISO-8859-3",
+@@ -495,6 +504,15 @@
+ }
+ { "UTF-16BE", :bin{ 00 }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ 00 } }
+ { "UTF-16BE", :bin{ d800dc }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ d800dc } }
++ // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
++ // using the Shift-JIS table for JIS X 0208 (ticket #5797)
++ {
++ "ISO-2022-JP",
++ "\u203e\xa5\u4e00\ufa10\u6f3e\u0391",
++ :bin{ 1b284a7e5c1b2442306c222e5f2126211b2842 },
++ :intvector{ 0,0,0,0,1,2,2,2,2,2,3,3,4,4,5,5,5,5,5 },
++ :int{1}, :int{0}, "", "?=\u3013", "" // U+3013 Geta Mark converts to 222e
++ }
+ // Verify that mappings that would result in byte values outside 20..7F (for SBCS)
+ // or 21..7E (for DBCS) are not used.
+ // ibm-9005_X110-2007.ucm (ISO 8859-7, <ESC>.F=1b2e46):
+@@ -1273,13 +1291,13 @@
+ // versions of ISO-2022-JP
+ {
+ "ISO-2022-JP",
+- "[\x00-\x0d\x10-\x1a\x1c-\x7f\u0391-\u03a1\uff61-\uff9f\u4e00\u4e01\uffe5]",
+- "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\uffe6-\U0010ffff]",
++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e00\u4e01\uffe5]",
++ "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\ufa2d\uffe6-\U0010ffff]",
+ :int{0}
+ }
+ {
+ "ISO-2022-JP-2",
+- "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0385-\u038a\u0390-\u03a1\uff61-\uff9f\u4e00-\u4e05\uffe6]",
++ "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]",
+ "[\x0e\x0f\x1b\uffe7-\U0010ffff]",
+ :int{0}
+ }