Return to mbfilter_big5.c CVS log | Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / php / ext / mbstring / libmbfl / filters |
1.1 misho 1: /* 2: * "streamable kanji code filter and converter" 3: * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved. 4: * 5: * LICENSE NOTICES 6: * 7: * This file is part of "streamable kanji code filter and converter", 8: * which is distributed under the terms of GNU Lesser General Public 9: * License (version 2) as published by the Free Software Foundation. 10: * 11: * This software is distributed in the hope that it will be useful, 12: * but WITHOUT ANY WARRANTY; without even the implied warranty of 13: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14: * GNU Lesser General Public License for more details. 15: * 16: * You should have received a copy of the GNU Lesser General Public 17: * License along with "streamable kanji code filter and converter"; 18: * if not, write to the Free Software Foundation, Inc., 59 Temple Place, 19: * Suite 330, Boston, MA 02111-1307 USA 20: * 21: * The author of this file: Rui Hirokawa <hirokawa@php.net> 22: * 23: */ 24: /* 25: * The source code included in this files was separated from mbfilter_tw.c 26: * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002. 27: * 28: */ 29: 30: #ifdef HAVE_CONFIG_H 31: #include "config.h" 32: #endif 33: 34: #include "mbfilter.h" 35: #include "mbfilter_big5.h" 36: 37: #include "unicode_table_big5.h" 38: 39: static int mbfl_filt_ident_big5(int c, mbfl_identify_filter *filter); 40: 41: static const unsigned char mblen_table_big5[] = { /* 0x81-0xFE */ 42: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 43: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 44: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 45: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 46: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 47: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 48: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 49: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 50: 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 51: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 52: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 53: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 54: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 55: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 56: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 57: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 58: }; 59: 1.1.1.2 ! misho 60: static const char *mbfl_encoding_big5_aliases[] = {"CN-BIG5", "BIG-FIVE", "BIGFIVE", NULL}; 1.1 misho 61: 62: const mbfl_encoding mbfl_encoding_big5 = { 63: mbfl_no_encoding_big5, 64: "BIG-5", 65: "BIG5", 66: (const char *(*)[])&mbfl_encoding_big5_aliases, 67: mblen_table_big5, 1.1.1.2 ! misho 68: MBFL_ENCTYPE_MBCS | MBFL_ENCTYPE_GL_UNSAFE ! 69: }; ! 70: ! 71: const mbfl_encoding mbfl_encoding_cp950 = { ! 72: mbfl_no_encoding_cp950, ! 73: "CP950", ! 74: "BIG5", ! 75: NULL, ! 76: mblen_table_big5, ! 77: MBFL_ENCTYPE_MBCS | MBFL_ENCTYPE_GL_UNSAFE 1.1 misho 78: }; 79: 80: const struct mbfl_identify_vtbl vtbl_identify_big5 = { 81: mbfl_no_encoding_big5, 82: mbfl_filt_ident_common_ctor, 83: mbfl_filt_ident_common_dtor, 84: mbfl_filt_ident_big5 85: }; 86: 1.1.1.2 ! misho 87: const struct mbfl_identify_vtbl vtbl_identify_cp950 = { ! 88: mbfl_no_encoding_cp950, ! 89: mbfl_filt_ident_common_ctor, ! 90: mbfl_filt_ident_common_dtor, ! 91: mbfl_filt_ident_big5 ! 92: }; ! 93: 1.1 misho 94: const struct mbfl_convert_vtbl vtbl_big5_wchar = { 95: mbfl_no_encoding_big5, 96: mbfl_no_encoding_wchar, 97: mbfl_filt_conv_common_ctor, 98: mbfl_filt_conv_common_dtor, 99: mbfl_filt_conv_big5_wchar, 100: mbfl_filt_conv_common_flush 101: }; 102: 103: const struct mbfl_convert_vtbl vtbl_wchar_big5 = { 104: mbfl_no_encoding_wchar, 105: mbfl_no_encoding_big5, 106: mbfl_filt_conv_common_ctor, 107: mbfl_filt_conv_common_dtor, 108: mbfl_filt_conv_wchar_big5, 109: mbfl_filt_conv_common_flush 110: }; 111: 1.1.1.2 ! misho 112: const struct mbfl_convert_vtbl vtbl_cp950_wchar = { ! 113: mbfl_no_encoding_cp950, ! 114: mbfl_no_encoding_wchar, ! 115: mbfl_filt_conv_common_ctor, ! 116: mbfl_filt_conv_common_dtor, ! 117: mbfl_filt_conv_big5_wchar, ! 118: mbfl_filt_conv_common_flush ! 119: }; ! 120: ! 121: const struct mbfl_convert_vtbl vtbl_wchar_cp950 = { ! 122: mbfl_no_encoding_wchar, ! 123: mbfl_no_encoding_cp950, ! 124: mbfl_filt_conv_common_ctor, ! 125: mbfl_filt_conv_common_dtor, ! 126: mbfl_filt_conv_wchar_big5, ! 127: mbfl_filt_conv_common_flush ! 128: }; ! 129: 1.1 misho 130: #define CK(statement) do { if ((statement) < 0) return (-1); } while (0) 131: 1.1.1.2 ! misho 132: /* 63 + 94 = 157 or 94 */ ! 133: static unsigned short cp950_pua_tbl[][4] = { ! 134: {0xe000,0xe310,0xfa40,0xfefe}, ! 135: {0xe311,0xeeb7,0x8e40,0xa0fe}, ! 136: {0xeeb8,0xf6b0,0x8140,0x8dfe}, ! 137: {0xf6b1,0xf70e,0xc6a1,0xc6fe}, ! 138: {0xf70f,0xf848,0xc740,0xc8fe}, ! 139: }; ! 140: 1.1 misho 141: /* 142: * Big5 => wchar 143: */ 144: int 145: mbfl_filt_conv_big5_wchar(int c, mbfl_convert_filter *filter) 146: { 1.1.1.2 ! misho 147: int k; ! 148: int c1, w, c2; 1.1 misho 149: 150: switch (filter->status) { 151: case 0: 1.1.1.2 ! misho 152: if (filter->from->no_encoding == mbfl_no_encoding_cp950) { ! 153: c1 = 0x80; ! 154: } else { ! 155: c1 = 0xa0; ! 156: } ! 157: ! 158: if (c >= 0 && c <= 0x80) { /* latin */ 1.1 misho 159: CK((*filter->output_function)(c, filter->data)); 1.1.1.2 ! misho 160: } else if (c == 0xff) { ! 161: CK((*filter->output_function)(0xf8f8, filter->data)); ! 162: } else if (c > c1 && c < 0xff) { /* dbcs lead byte */ 1.1 misho 163: filter->status = 1; 164: filter->cache = c; 165: } else { 166: w = c & MBFL_WCSGROUP_MASK; 167: w |= MBFL_WCSGROUP_THROUGH; 168: CK((*filter->output_function)(w, filter->data)); 169: } 170: break; 171: 172: case 1: /* dbcs second byte */ 173: filter->status = 0; 174: c1 = filter->cache; 175: if ((c > 0x39 && c < 0x7f) | (c > 0xa0 && c < 0xff)) { 176: if (c < 0x7f){ 177: w = (c1 - 0xa1)*157 + (c - 0x40); 178: } else { 179: w = (c1 - 0xa1)*157 + (c - 0xa1) + 0x3f; 180: } 181: if (w >= 0 && w < big5_ucs_table_size) { 182: w = big5_ucs_table[w]; 183: } else { 184: w = 0; 185: } 1.1.1.2 ! misho 186: ! 187: if (filter->from->no_encoding == mbfl_no_encoding_cp950) { ! 188: /* PUA for CP950 */ ! 189: if (w <= 0 && ! 190: (((c1 >= 0xfa && c1 <= 0xfe) || (c1 >= 0x8e && c1 <= 0xa0) || ! 191: (c1 >= 0x81 && c1 <= 0x8d) ||(c1 >= 0xc7 && c1 <= 0xc8)) ! 192: && ((c > 0x39 && c < 0x7f) || (c > 0xa0 && c < 0xff))) || ! 193: ((c1 == 0xc6) && (c > 0xa0 && c < 0xff))) { ! 194: c2 = c1 << 8 | c; ! 195: for (k = 0; k < sizeof(cp950_pua_tbl)/(sizeof(unsigned short)*4); k++) { ! 196: if (c2 >= cp950_pua_tbl[k][2] && c2 <= cp950_pua_tbl[k][3]) { ! 197: break; ! 198: } ! 199: } ! 200: ! 201: if ((cp950_pua_tbl[k][2] & 0xff) == 0x40) { ! 202: w = 157*(c1 - (cp950_pua_tbl[k][2]>>8)) + c - (c >= 0xa1 ? 0x62 : 0x40) ! 203: + cp950_pua_tbl[k][0]; ! 204: } else { ! 205: w = c2 - cp950_pua_tbl[k][2] + cp950_pua_tbl[k][0]; ! 206: } ! 207: } ! 208: } ! 209: 1.1 misho 210: if (w <= 0) { 211: w = (c1 << 8) | c; 212: w &= MBFL_WCSPLANE_MASK; 213: w |= MBFL_WCSPLANE_BIG5; 214: } 215: CK((*filter->output_function)(w, filter->data)); 216: } else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */ 217: CK((*filter->output_function)(c, filter->data)); 218: } else { 219: w = (c1 << 8) | c; 220: w &= MBFL_WCSGROUP_MASK; 221: w |= MBFL_WCSGROUP_THROUGH; 222: CK((*filter->output_function)(w, filter->data)); 223: } 224: break; 225: 226: default: 227: filter->status = 0; 228: break; 229: } 230: 231: return c; 232: } 233: 234: /* 235: * wchar => Big5 236: */ 237: int 238: mbfl_filt_conv_wchar_big5(int c, mbfl_convert_filter *filter) 239: { 1.1.1.2 ! misho 240: int k; ! 241: int c1, s, c2; 1.1 misho 242: 243: s = 0; 244: if (c >= ucs_a1_big5_table_min && c < ucs_a1_big5_table_max) { 245: s = ucs_a1_big5_table[c - ucs_a1_big5_table_min]; 246: } else if (c >= ucs_a2_big5_table_min && c < ucs_a2_big5_table_max) { 247: s = ucs_a2_big5_table[c - ucs_a2_big5_table_min]; 248: } else if (c >= ucs_a3_big5_table_min && c < ucs_a3_big5_table_max) { 249: s = ucs_a3_big5_table[c - ucs_a3_big5_table_min]; 250: } else if (c >= ucs_i_big5_table_min && c < ucs_i_big5_table_max) { 251: s = ucs_i_big5_table[c - ucs_i_big5_table_min]; 252: } else if (c >= ucs_pua_big5_table_min && c < ucs_pua_big5_table_max) { 253: s = ucs_pua_big5_table[c - ucs_pua_big5_table_min]; 254: } else if (c >= ucs_r1_big5_table_min && c < ucs_r1_big5_table_max) { 255: s = ucs_r1_big5_table[c - ucs_r1_big5_table_min]; 256: } else if (c >= ucs_r2_big5_table_min && c < ucs_r2_big5_table_max) { 257: s = ucs_r2_big5_table[c - ucs_r2_big5_table_min]; 258: } 1.1.1.2 ! misho 259: ! 260: if (filter->to->no_encoding == mbfl_no_encoding_cp950) { ! 261: if (c >= 0xe000 && c <= 0xf848) { /* PUA for CP950 */ ! 262: for (k = 0; k < sizeof(cp950_pua_tbl)/(sizeof(unsigned short)*4); k++) { ! 263: if (c <= cp950_pua_tbl[k][1]) { ! 264: break; ! 265: } ! 266: } ! 267: c1 = c - cp950_pua_tbl[k][0]; ! 268: if ((cp950_pua_tbl[k][2] & 0xff) == 0x40) { ! 269: c2 = cp950_pua_tbl[k][2] >> 8; ! 270: s = ((c1 / 157) + c2) << 8; c1 %= 157; ! 271: s |= c1 + (c1 >= 0x3f ? 0x62 : 0x40); ! 272: } else { ! 273: s = c1 + cp950_pua_tbl[k][2]; ! 274: } ! 275: } ! 276: ! 277: if (c == 0x80) { ! 278: s = 0x80; ! 279: } else if (c == 0xf8f8) { ! 280: s = 0xff; ! 281: } else if (c == 0x256d) { ! 282: s = 0xa27e; ! 283: } else if (c == 0x256e) { ! 284: s = 0xa2a1; ! 285: } else if (c == 0x256f) { ! 286: s = 0xa2a3; ! 287: } else if (c == 0x2570) { ! 288: s = 0xa2a2; ! 289: } ! 290: } ! 291: 1.1 misho 292: if (s <= 0) { 293: c1 = c & ~MBFL_WCSPLANE_MASK; 294: if (c1 == MBFL_WCSPLANE_BIG5) { 295: s = c & MBFL_WCSPLANE_MASK; 296: } 297: if (c == 0) { 298: s = 0; 299: } else if (s <= 0) { 300: s = -1; 301: } 302: } 303: if (s >= 0) { 1.1.1.2 ! misho 304: if (s <= 0x80 || s == 0xff) { /* latin */ 1.1 misho 305: CK((*filter->output_function)(s, filter->data)); 306: } else { 307: CK((*filter->output_function)((s >> 8) & 0xff, filter->data)); 308: CK((*filter->output_function)(s & 0xff, filter->data)); 309: } 310: } else { 311: if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) { 312: CK(mbfl_filt_conv_illegal_output(c, filter)); 313: } 314: } 315: 316: return c; 317: } 318: 319: static int mbfl_filt_ident_big5(int c, mbfl_identify_filter *filter) 320: { 1.1.1.2 ! misho 321: int c1; ! 322: if (filter->encoding->no_encoding == mbfl_no_encoding_cp950) { ! 323: c1 = 0x80; ! 324: } else { ! 325: c1 = 0xa0; ! 326: } ! 327: 1.1 misho 328: if (filter->status) { /* kanji second char */ 329: if (c < 0x40 || (c > 0x7e && c < 0xa1) ||c > 0xfe) { /* bad */ 330: filter->flag = 1; 331: } 332: filter->status = 0; 333: } else if (c >= 0 && c < 0x80) { /* latin ok */ 334: ; 1.1.1.2 ! misho 335: } else if (c > c1 && c < 0xff) { /* DBCS lead byte */ 1.1 misho 336: filter->status = 1; 337: } else { /* bad */ 338: filter->flag = 1; 339: } 340: 341: return c; 342: } 343: 344: