/* mbfilter_gb18030.c - ELWIX (Embedded LightWeight unIX) CVS: embedaddon / php / ext / mbstring / libmbfl / filters */
1.1 misho 1: /* 2: * "streamable kanji code filter and converter" 3: * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved. 4: * 5: * LICENSE NOTICES 6: * 7: * This file is part of "streamable kanji code filter and converter", 8: * which is distributed under the terms of GNU Lesser General Public 9: * License (version 2) as published by the Free Software Foundation. 10: * 11: * This software is distributed in the hope that it will be useful, 12: * but WITHOUT ANY WARRANTY; without even the implied warranty of 13: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14: * GNU Lesser General Public License for more details. 15: * 16: * You should have received a copy of the GNU Lesser General Public 17: * License along with "streamable kanji code filter and converter"; 18: * if not, write to the Free Software Foundation, Inc., 59 Temple Place, 19: * Suite 330, Boston, MA 02111-1307 USA 20: * 21: * The author of this file: 22: * 23: */ 24: /* 25: * the source code included in this files was separated from mbfilter_cp936.c 26: * by rui hirokawa <hirokawa@php.net> on 11 Aug 2011. 
27: * 28: */ 29: 30: #ifdef HAVE_CONFIG_H 31: #include "config.h" 32: #endif 33: 34: #include "mbfilter.h" 35: #include "mbfilter_gb18030.h" 36: 37: #include "unicode_table_cp936.h" 38: #include "unicode_table_gb18030.h" 39: 40: static int mbfl_filt_ident_gb18030(int c, mbfl_identify_filter *filter); 41: 42: static const char *mbfl_encoding_gb18030_aliases[] = {"gb-18030", "gb-18030-2000", NULL}; 43: 44: const mbfl_encoding mbfl_encoding_gb18030 = { 45: mbfl_no_encoding_gb18030, 46: "GB18030", 47: "GB18030", 48: (const char *(*)[])&mbfl_encoding_gb18030_aliases, 49: NULL, 50: MBFL_ENCTYPE_MBCS | MBFL_ENCTYPE_GL_UNSAFE 51: }; 52: 53: const struct mbfl_identify_vtbl vtbl_identify_gb18030 = { 54: mbfl_no_encoding_gb18030, 55: mbfl_filt_ident_common_ctor, 56: mbfl_filt_ident_common_dtor, 57: mbfl_filt_ident_gb18030 58: }; 59: 60: const struct mbfl_convert_vtbl vtbl_gb18030_wchar = { 61: mbfl_no_encoding_gb18030, 62: mbfl_no_encoding_wchar, 63: mbfl_filt_conv_common_ctor, 64: mbfl_filt_conv_common_dtor, 65: mbfl_filt_conv_gb18030_wchar, 66: mbfl_filt_conv_common_flush 67: }; 68: 69: const struct mbfl_convert_vtbl vtbl_wchar_gb18030 = { 70: mbfl_no_encoding_wchar, 71: mbfl_no_encoding_gb18030, 72: mbfl_filt_conv_common_ctor, 73: mbfl_filt_conv_common_dtor, 74: mbfl_filt_conv_wchar_gb18030, 75: mbfl_filt_conv_common_flush 76: }; 77: 78: #define CK(statement) do { if ((statement) < 0) return (-1); } while (0) 79: 80: 81: int 82: mbfl_bisec_srch(int w, const unsigned short *tbl, int n) 83: { 84: int k, k1 = 0, k2 = n-1; 85: 86: while (k1 < k2) { 87: k = (k1+k2) >> 1; 88: if (w <= tbl[2*k+1]) { 89: k2 = k; 90: } else if (w >= tbl[2*k+2]) { 91: k1 = k + 1; 92: } else { 93: return -1; 94: } 95: } 96: return k1; 97: } 98: 99: int 100: mbfl_bisec_srch2(int w, const unsigned short tbl[], int n) 101: { 102: int k, k1 = 0, k2 = n; 103: 104: if (w == tbl[0]) { 105: return 0; 106: } 107: 108: while (k2 - k1 > 1) { 109: k = (k1 + k2) >> 1; 110: if (w < tbl[k]) { 111: k2 = k; 112: } 
else if (w > tbl[k]) { 113: k1 = k; 114: } else { 115: return k; 116: } 117: } 118: return -1; 119: } 120: 121: /* 122: * GB18030 => wchar 123: */ 124: int 125: mbfl_filt_conv_gb18030_wchar(int c, mbfl_convert_filter *filter) 126: { 127: int k; 128: int c1, c2, c3, w = -1; 129: 130: switch (filter->status) { 131: case 0: 132: if (c >= 0 && c < 0x80) { /* latin */ 133: CK((*filter->output_function)(c, filter->data)); 134: } else if (c == 0x80) { /* euro sign */ 135: CK((*filter->output_function)(0x20ac, filter->data)); 136: } else if (c == 0xff) { 137: CK((*filter->output_function)(0x00ff, filter->data)); 138: } else if (c > 0x80 && c < 0xff) { /* dbcs/qbcs lead byte */ 139: filter->status = 1; 140: filter->cache = c; 141: } else { 142: w = c & MBFL_WCSGROUP_MASK; 143: w |= MBFL_WCSGROUP_THROUGH; 144: CK((*filter->output_function)(w, filter->data)); 145: } 146: break; 147: 148: case 1: /* dbcs/qbcs second byte */ 149: c1 = filter->cache; 150: filter->status = 0; 151: 152: if (c1 >= 0x81 && c1 <= 0x84 && c >= 0x30 && c <= 0x39) { /* 4 byte range: Unicode BMP */ 153: filter->status = 2; 154: filter->cache = (c1 << 8) | c; 155: return c; 156: } else if (c1 >= 0x90 && c1 <= 0xe3 && c >= 0x30 && c <= 0x39) { 157: /* 4 byte range: Unicode 16 planes */ 158: filter->status = 2; 159: filter->cache = (c1 << 8) | c; 160: return c; 161: } else if (((c1 >= 0xaa && c1 <= 0xaf) || (c1 >= 0xf8 && c1 <= 0xfe)) && 162: (c >= 0xa1 && c <= 0xfe)) { /* UDA part1,2: U+E000-U+E4C5 */ 163: w = 94*(c1 >= 0xf8 ? c1 - 0xf2 : c1 - 0xaa) + (c - 0xa1) + 0xe000; 164: CK((*filter->output_function)(w, filter->data)); 165: } else if (c1 >= 0xa1 && c1 <= 0xa7 && c >= 0x40 && c < 0xa1 && c != 0x7f) { 166: /* UDA part3 : U+E4C6-U+E765*/ 167: w = 96*(c1 - 0xa1) + c - (c >= 0x80 ? 
0x41 : 0x40) + 0xe4c6; 168: CK((*filter->output_function)(w, filter->data)); 169: } 170: 171: c2 = (c1 << 8) | c; 172: 173: if (w <= 0 && 174: ((c2 >= 0xa2ab && c2 <= 0xa9f0 + (0xe80f-0xe801)) || 175: (c2 >= 0xd7fa && c2 <= 0xd7fa + (0xe814-0xe810)) || 176: (c2 >= 0xfe50 && c2 <= 0xfe80 + (0xe864-0xe844)))) { 177: for (k = 0; k < mbfl_gb18030_pua_tbl_max; k++) { 178: if (c2 >= mbfl_gb18030_pua_tbl[k][2] && 179: c2 <= mbfl_gb18030_pua_tbl[k][2] + mbfl_gb18030_pua_tbl[k][1] 180: - mbfl_gb18030_pua_tbl[k][0]) { 181: w = c2 - mbfl_gb18030_pua_tbl[k][2] + mbfl_gb18030_pua_tbl[k][0]; 182: CK((*filter->output_function)(w, filter->data)); 183: break; 184: } 185: } 186: } 187: 188: if (w <= 0) { 189: if ((c1 >= 0xa1 && c1 <= 0xa9 && c >= 0xa1 && c <= 0xfe) || 190: (c1 >= 0xb0 && c1 <= 0xf7 && c >= 0xa1 && c <= 0xfe) || 191: (c1 >= 0x81 && c1 <= 0xa0 && c >= 0x40 && c <= 0xfe && c != 0x7f) || 192: (c1 >= 0xaa && c1 <= 0xfe && c >= 0x40 && c <= 0xa0 && c != 0x7f) || 193: (c1 >= 0xa8 && c1 <= 0xa9 && c >= 0x40 && c <= 0xa0 && c != 0x7f)) { 194: w = (c1 - 0x81)*192 + (c - 0x40); 195: if (w >= 0 && w < cp936_ucs_table_size) { 196: w = cp936_ucs_table[w]; 197: } else { 198: w = 0; 199: } 200: if (w <= 0) { 201: w = (c1 << 8) | c; 202: w &= MBFL_WCSPLANE_MASK; 203: w |= MBFL_WCSPLANE_GB18030; 204: } 205: CK((*filter->output_function)(w, filter->data)); 206: } else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */ 207: CK((*filter->output_function)(c, filter->data)); 208: } else { 209: w = (c1 << 8) | c; 210: w &= MBFL_WCSGROUP_MASK; 211: w |= MBFL_WCSGROUP_THROUGH; 212: CK((*filter->output_function)(w, filter->data)); 213: } 214: } 215: break; 216: case 2: /* qbcs third byte */ 217: c1 = (filter->cache >> 8) & 0xff; 218: c2 = filter->cache & 0xff; 219: filter->status = 0; 220: filter->cache = 0; 221: if (((c1 >= 0x81 && c1 <= 0x84) || (c1 >= 0x90 && c1 <= 0xe3)) && 222: c2 >= 0x30 && c2 <= 0x39 && c >= 0x81 && c <= 0xfe) { 223: filter->cache = (c1 << 16) | (c2 << 8) | c; 224: 
filter->status = 3; 225: } else { 226: w = (c1 << 16) | (c2 << 8) | c; 227: w &= MBFL_WCSGROUP_MASK; 228: w |= MBFL_WCSGROUP_THROUGH; 229: CK((*filter->output_function)(w, filter->data)); 230: } 231: break; 232: 233: case 3: /* qbcs fourth byte */ 234: c1 = (filter->cache >> 16) & 0xff; 235: c2 = (filter->cache >> 8) & 0xff; 236: c3 = filter->cache & 0xff; 237: filter->status = 0; 238: filter->cache = 0; 239: if (((c1 >= 0x81 && c1 <= 0x84) || (c1 >= 0x90 && c1 <= 0xe3)) && 240: c2 >= 0x30 && c2 <= 0x39 && c3 >= 0x81 && c3 <= 0xfe && c >= 0x30 && c <= 0x39) { 241: if (c1 >= 0x90 && c1 <= 0xe3) { 242: w = ((c1 - 0x90)*10 + (c2 - 0x30)*126 + (c3 - 0x81))*10 + (c - 0x30) + 0x10000; 243: } else { /* Unicode BMP */ 244: w = (((c1 - 0x81)*10 + (c2 - 0x30))*126 + (c3 - 0x81))*10 + (c - 0x30); 245: if (w >= 0 && w <= 39419) { 246: k = mbfl_bisec_srch(w, mbfl_gb2uni_tbl, mbfl_gb_uni_max); 247: if (k<0) { 248: /* error */ 249: w = (c1 << 24) | (c2 << 16) | (c3 << 8) | c; 250: w &= MBFL_WCSGROUP_MASK; 251: w |= MBFL_WCSGROUP_THROUGH; 252: CK((*filter->output_function)(w, filter->data)); 253: return c; 254: } 255: w += mbfl_gb_uni_ofst[k]; 256: } else { 257: w = (c1 << 24) | (c2 << 16) | (c3 << 8) | c; 258: w &= MBFL_WCSGROUP_MASK; 259: w |= MBFL_WCSGROUP_THROUGH; 260: CK((*filter->output_function)(w, filter->data)); 261: return c; 262: } 263: } 264: CK((*filter->output_function)(w, filter->data)); 265: } else { 266: w = (c1 << 24) | (c2 << 16) | (c3 << 8) | c; 267: w &= MBFL_WCSGROUP_MASK; 268: w |= MBFL_WCSGROUP_THROUGH; 269: CK((*filter->output_function)(w, filter->data)); 270: } 271: break; 272: 273: default: 274: filter->status = 0; 275: break; 276: } 277: 278: return c; 279: } 280: 281: /* 282: * wchar => GB18030 283: */ 284: int 285: mbfl_filt_conv_wchar_gb18030(int c, mbfl_convert_filter *filter) 286: { 287: int k, k1, k2; 288: int c1, s = 0, s1 = 0; 289: 290: if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) { 291: s = ucs_a1_cp936_table[c - 
ucs_a1_cp936_table_min]; 292: } else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) { 293: s = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min]; 294: } else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) { 295: s = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min]; 296: } else if (c >= ucs_i_cp936_table_min && c < ucs_i_cp936_table_max) { 297: s = ucs_i_cp936_table[c - ucs_i_cp936_table_min]; 298: } else if (c >= ucs_ci_cp936_table_min && c < ucs_ci_cp936_table_max) { 299: /* U+F900-FA2F CJK Compatibility Ideographs */ 300: if (c == 0xf92c) { 301: s = 0xfd9c; 302: } else if (c == 0xf979) { 303: s = 0xfd9d; 304: } else if (c == 0xf995) { 305: s = 0xfd9e; 306: } else if (c == 0xf9e7) { 307: s = 0xfd9f; 308: } else if (c == 0xf9f1) { 309: s = 0xfda0; 310: } else if (c >= 0xfa0c && c <= 0xfa29) { 311: s = ucs_ci_s_cp936_table[c - 0xfa0c]; 312: } 313: } else if (c >= ucs_cf_cp936_table_min && c < ucs_cf_cp936_table_max) { 314: /* FE30h CJK Compatibility Forms */ 315: s = ucs_cf_cp936_table[c - ucs_cf_cp936_table_min]; 316: } else if (c >= ucs_sfv_cp936_table_min && c < ucs_sfv_cp936_table_max) { 317: /* U+FE50-FE6F Small Form Variants */ 318: s = ucs_sfv_cp936_table[c - ucs_sfv_cp936_table_min]; 319: } else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) { 320: /* U+FF00-FFFF HW/FW Forms */ 321: if (c == 0xff04) { 322: s = 0xa1e7; 323: } else if (c == 0xff5e) { 324: s = 0xa1ab; 325: } else if (c >= 0xff01 && c <= 0xff5d) { 326: s = c - 0xff01 + 0xa3a1; 327: } else if (c >= 0xffe0 && c <= 0xffe5) { 328: s = ucs_hff_s_cp936_table[c-0xffe0]; 329: } 330: } 331: 332: if (c == 0x20ac) { /* euro-sign */ 333: s = 0xa2e3; 334: } 335: 336: if (s <= 0 && c >= mbfl_gb18030_c_tbl_key[0] && 337: c <= mbfl_gb18030_c_tbl_key[mbfl_gb18030_c_tbl_max-1]) { 338: k1 = mbfl_bisec_srch2(c, mbfl_gb18030_c_tbl_key, mbfl_gb18030_c_tbl_max); 339: if (k1 >= 0) { 340: s = mbfl_gb18030_c_tbl_val[k1]; 341: } 342: } 343: 344: if (c >= 0xe000 && c <= 0xe864) 
{ /* PUA */ 345: if (c < 0xe766) { 346: if (c < 0xe4c6) { 347: c1 = c - 0xe000; 348: s = (c1 % 94) + 0xa1; c1 /= 94; 349: s |= (c1 < 0x06 ? c1 + 0xaa : c1 + 0xf2) << 8; 350: } else { 351: c1 = c - 0xe4c6; 352: s = ((c1 / 96) + 0xa1) << 8; c1 %= 96; 353: s |= c1 + (c1 >= 0x3f ? 0x41 : 0x40); 354: } 355: } else { 356: /* U+E766..U+E864 */ 357: k1 = 0; k2 = mbfl_gb18030_pua_tbl_max; 358: while (k1 < k2) { 359: k = (k1 + k2) >> 1; 360: if (c < mbfl_gb18030_pua_tbl[k][0]) { 361: k2 = k; 362: } else if (c > mbfl_gb18030_pua_tbl[k][1]) { 363: k1 = k + 1; 364: } else { 365: s = c - mbfl_gb18030_pua_tbl[k][0] + mbfl_gb18030_pua_tbl[k][2]; 366: break; 367: } 368: } 369: } 370: } 371: 372: if (s <= 0 && c >= 0x0080 && c <= 0xffff) { /* BMP */ 373: s = mbfl_bisec_srch(c, mbfl_uni2gb_tbl, mbfl_gb_uni_max); 374: if (s >= 0) { 375: c1 = c - mbfl_gb_uni_ofst[s]; 376: s = (c1 % 10) + 0x30; c1 /= 10; 377: s |= ((c1 % 126) + 0x81) << 8; c1 /= 126; 378: s |= ((c1 % 10) + 0x30) << 16; c1 /= 10; 379: s1 = c1 + 0x81; 380: } 381: } else if (c >= 0x10000 && c <= 0x10ffff) { /* Code set 3: Unicode U+10000..U+10FFFF */ 382: c1 = c - 0x10000; 383: s = (c1 % 10) + 0x30; c1 /= 10; 384: s |= ((c1 % 126) + 0x81) << 8; c1 /= 126; 385: s |= ((c1 % 10) + 0x30) << 16; c1 /= 10; 386: s1 = c1 + 0x90; 387: } 388: 389: if (s <= 0) { 390: c1 = c & ~MBFL_WCSPLANE_MASK; 391: if (c1 == MBFL_WCSPLANE_WINCP936) { 392: s = c & MBFL_WCSPLANE_MASK; 393: } 394: if (c == 0) { 395: s = 0; 396: } else if (s <= 0) { 397: s = -1; 398: } 399: } 400: if (s >= 0) { 401: if (s <= 0x80) { /* latin */ 402: CK((*filter->output_function)(s, filter->data)); 403: } else if (s1 > 0) { /* qbcs */ 404: CK((*filter->output_function)(s1 & 0xff, filter->data)); 405: CK((*filter->output_function)((s >> 16) & 0xff, filter->data)); 406: CK((*filter->output_function)((s >> 8) & 0xff, filter->data)); 407: CK((*filter->output_function)(s & 0xff, filter->data)); 408: } else { /* dbcs */ 409: CK((*filter->output_function)((s >> 8) & 0xff, 
filter->data)); 410: CK((*filter->output_function)(s & 0xff, filter->data)); 411: } 412: } else { 413: if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) { 414: CK(mbfl_filt_conv_illegal_output(c, filter)); 415: } 416: } 417: 418: return c; 419: } 420: 421: static int mbfl_filt_ident_gb18030(int c, mbfl_identify_filter *filter) 422: { 423: int c1; 424: 425: c1 = (filter->status >> 8) & 0xff; 426: filter->status &= 0xff; 427: 428: if (filter->status == 0) { 429: if (c <= 0x80 || c == 0xff) { 430: filter->status = 0; 431: } else { 432: filter->status = 1; 433: filter->status |= (c << 8); 434: } 435: } else if (filter->status == 1) { /* dbcs/qbcs 2nd byte */ 436: if (((c1 >= 0x81 && c1 <= 0x84) || (c1 >= 0x90 && c1 <= 0xe3)) && c >= 0x30 && c <= 0x39) { /* qbcs */ 437: filter->status = 2; 438: } else if (((c1 >= 0xaa && c1 <= 0xaf) || (c1 >= 0xf8 && c1 <= 0xfe)) && (c >= 0xa1 && c <= 0xfe)) { 439: filter->status = 0; /* UDA part 1,2 */ 440: } else if (c1 >= 0xa1 && c1 <= 0xa7 && c >= 0x40 && c < 0xa1 && c != 0x7f) { 441: filter->status = 0; /* UDA part 3 */ 442: } else if ((c1 >= 0xa1 && c1 <= 0xa9 && c >= 0xa1 && c <= 0xfe) || 443: (c1 >= 0xb0 && c1 <= 0xf7 && c >= 0xa1 && c <= 0xfe) || 444: (c1 >= 0x81 && c1 <= 0xa0 && c >= 0x40 && c <= 0xfe && c != 0x7f) || 445: (c1 >= 0xaa && c1 <= 0xfe && c >= 0x40 && c <= 0xa0 && c != 0x7f) || 446: (c1 >= 0xa8 && c1 <= 0xa9 && c >= 0x40 && c <= 0xa0 && c != 0x7f)) { 447: filter->status = 0; /* DBCS */ 448: } else { 449: filter->flag = 1; /* bad */ 450: filter->status = 0; 451: } 452: } else if (filter->status == 2) { /* qbcs 3rd byte */ 453: if (c > 0x80 && c < 0xff) { 454: filter->status = 3; 455: } else { 456: filter->flag = 1; /* bad */ 457: filter->status = 0; 458: } 459: } else if (filter->status == 3) { /* qbcs 4th byte */ 460: if (c >= 0x30 && c < 0x40) { 461: filter->status = 0; 462: } else { 463: filter->flag = 1; /* bad */ 464: filter->status = 0; 465: } 466: } else { /* bad */ 467: filter->flag = 1; 468: 
} 469: 470: return c; 471: } 472: 473: