Return to mbfilter_utf8_mobile.c CVS log | Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / php / ext / mbstring / libmbfl / filters |
1.1 misho 1: /* 2: * "streamable kanji code filter and converter" 3: * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved. 4: * 5: * LICENSE NOTICES 6: * 7: * This file is part of "streamable kanji code filter and converter", 8: * which is distributed under the terms of GNU Lesser General Public 9: * License (version 2) as published by the Free Software Foundation. 10: * 11: * This software is distributed in the hope that it will be useful, 12: * but WITHOUT ANY WARRANTY; without even the implied warranty of 13: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14: * GNU Lesser General Public License for more details. 15: * 16: * You should have received a copy of the GNU Lesser General Public 17: * License along with "streamable kanji code filter and converter"; 18: * if not, write to the Free Software Foundation, Inc., 59 Temple Place, 19: * Suite 330, Boston, MA 02111-1307 USA 20: * 21: * The author of this file: 22: * 23: */ 24: /* 25: * The source code included in this files was separated from mbfilter.c 26: * by rui hrokawa <hirokawa@php.net> on 8 aug 2011. 27: * 28: */ 29: 30: #ifdef HAVE_CONFIG_H 31: #include "config.h" 32: #endif 33: 34: #include "mbfilter.h" 35: 36: #include "mbfilter_utf8_mobile.h" 37: #include "mbfilter_sjis_mobile.h" 38: 39: extern int mbfl_filt_ident_utf8(int c, mbfl_identify_filter *filter); 40: 41: extern const unsigned char mblen_table_utf8[]; 42: 43: static const char *mbfl_encoding_utf8_docomo_aliases[] = {"UTF-8-DOCOMO", "UTF8-DOCOMO", NULL}; 44: static const char *mbfl_encoding_utf8_kddi_a_aliases[] = {NULL}; 45: static const char *mbfl_encoding_utf8_kddi_b_aliases[] = {"UTF-8-Mobile#KDDI", "UTF-8-KDDI", "UTF8-KDDI", NULL}; 46: static const char *mbfl_encoding_utf8_sb_aliases[] = {"UTF-8-SOFTBANK", "UTF8-SOFTBANK", NULL}; 47: 48: const mbfl_encoding mbfl_encoding_utf8_docomo = { 49: mbfl_no_encoding_utf8_docomo, 50: "UTF-8-Mobile#DOCOMO", 51: "UTF-8", 52: (const char *(*)[])&mbfl_encoding_utf8_docomo_aliases, 53: mblen_table_utf8, 54: MBFL_ENCTYPE_MBCS 55: }; 56: 57: const mbfl_encoding mbfl_encoding_utf8_kddi_a = { 58: mbfl_no_encoding_utf8_kddi_a, 59: "UTF-8-Mobile#KDDI-A", 60: "UTF-8", 61: (const char *(*)[])&mbfl_encoding_utf8_kddi_a_aliases, 62: mblen_table_utf8, 63: MBFL_ENCTYPE_MBCS 64: }; 65: 66: const mbfl_encoding mbfl_encoding_utf8_kddi_b = { 67: mbfl_no_encoding_utf8_kddi_b, 68: "UTF-8-Mobile#KDDI-B", 69: "UTF-8", 70: (const char *(*)[])&mbfl_encoding_utf8_kddi_b_aliases, 71: mblen_table_utf8, 72: MBFL_ENCTYPE_MBCS 73: }; 74: 75: const mbfl_encoding mbfl_encoding_utf8_sb = { 76: mbfl_no_encoding_utf8_sb, 77: "UTF-8-Mobile#SOFTBANK", 78: "UTF-8", 79: (const char *(*)[])&mbfl_encoding_utf8_sb_aliases, 80: mblen_table_utf8, 81: MBFL_ENCTYPE_MBCS 82: }; 83: 84: const struct mbfl_identify_vtbl vtbl_identify_utf8_docomo = { 85: mbfl_no_encoding_utf8_docomo, 86: mbfl_filt_ident_common_ctor, 87: mbfl_filt_ident_common_dtor, 88: mbfl_filt_ident_utf8 89: }; 90: 91: const struct mbfl_identify_vtbl vtbl_identify_utf8_kddi_a = { 92: mbfl_no_encoding_utf8_kddi_a, 93: mbfl_filt_ident_common_ctor, 94: mbfl_filt_ident_common_dtor, 95: mbfl_filt_ident_utf8 96: }; 97: 98: const struct mbfl_identify_vtbl vtbl_identify_utf8_kddi_b = { 99: mbfl_no_encoding_utf8_kddi_b, 100: mbfl_filt_ident_common_ctor, 101: mbfl_filt_ident_common_dtor, 102: mbfl_filt_ident_utf8 103: }; 104: 105: const struct mbfl_identify_vtbl vtbl_identify_utf8_sb = { 106: mbfl_no_encoding_utf8_sb, 107: mbfl_filt_ident_common_ctor, 108: mbfl_filt_ident_common_dtor, 109: mbfl_filt_ident_utf8 110: }; 111: 112: const struct mbfl_convert_vtbl vtbl_utf8_docomo_wchar = { 113: mbfl_no_encoding_utf8_docomo, 114: mbfl_no_encoding_wchar, 115: mbfl_filt_conv_common_ctor, 116: mbfl_filt_conv_common_dtor, 117: mbfl_filt_conv_utf8_mobile_wchar, 118: mbfl_filt_conv_common_flush 119: }; 120: 121: const struct mbfl_convert_vtbl vtbl_wchar_utf8_docomo = { 122: mbfl_no_encoding_wchar, 123: mbfl_no_encoding_utf8_docomo, 124: mbfl_filt_conv_common_ctor, 125: mbfl_filt_conv_common_dtor, 126: mbfl_filt_conv_wchar_utf8_mobile, 127: mbfl_filt_conv_common_flush 128: }; 129: 130: const struct mbfl_convert_vtbl vtbl_utf8_kddi_a_wchar = { 131: mbfl_no_encoding_utf8_kddi_a, 132: mbfl_no_encoding_wchar, 133: mbfl_filt_conv_common_ctor, 134: mbfl_filt_conv_common_dtor, 135: mbfl_filt_conv_utf8_mobile_wchar, 136: mbfl_filt_conv_common_flush 137: }; 138: 139: const struct mbfl_convert_vtbl vtbl_wchar_utf8_kddi_a = { 140: mbfl_no_encoding_wchar, 141: mbfl_no_encoding_utf8_kddi_a, 142: mbfl_filt_conv_common_ctor, 143: mbfl_filt_conv_common_dtor, 144: mbfl_filt_conv_wchar_utf8_mobile, 145: mbfl_filt_conv_common_flush 146: }; 147: 148: const struct mbfl_convert_vtbl vtbl_utf8_kddi_b_wchar = { 149: mbfl_no_encoding_utf8_kddi_b, 150: mbfl_no_encoding_wchar, 151: mbfl_filt_conv_common_ctor, 152: mbfl_filt_conv_common_dtor, 153: mbfl_filt_conv_utf8_mobile_wchar, 154: mbfl_filt_conv_common_flush 155: }; 156: 157: const struct mbfl_convert_vtbl vtbl_wchar_utf8_kddi_b = { 158: mbfl_no_encoding_wchar, 159: mbfl_no_encoding_utf8_kddi_b, 160: mbfl_filt_conv_common_ctor, 161: mbfl_filt_conv_common_dtor, 162: mbfl_filt_conv_wchar_utf8_mobile, 163: mbfl_filt_conv_common_flush 164: }; 165: 166: const struct mbfl_convert_vtbl vtbl_utf8_sb_wchar = { 167: mbfl_no_encoding_utf8_sb, 168: mbfl_no_encoding_wchar, 169: mbfl_filt_conv_common_ctor, 170: mbfl_filt_conv_common_dtor, 171: mbfl_filt_conv_utf8_mobile_wchar, 172: mbfl_filt_conv_common_flush 173: }; 174: 175: const struct mbfl_convert_vtbl vtbl_wchar_utf8_sb = { 176: mbfl_no_encoding_wchar, 177: mbfl_no_encoding_utf8_sb, 178: mbfl_filt_conv_common_ctor, 179: mbfl_filt_conv_common_dtor, 180: mbfl_filt_conv_wchar_utf8_mobile, 181: mbfl_filt_conv_common_flush 182: }; 183: 184: #define CK(statement) do { if ((statement) < 0) return (-1); } while (0) 185: 186: /* 187: * UTF-8 => wchar 188: */ 189: int mbfl_filt_conv_utf8_mobile_wchar(int c, mbfl_convert_filter *filter) 190: { 191: int s, w = 0, flag = 0; 192: int s1 = 0, c1 = 0, snd = 0; 193: 194: if (c < 0x80) { 195: if (c >= 0) { 196: CK((*filter->output_function)(c, filter->data)); 197: } 198: filter->status = 0; 199: } else if (c < 0xc0) { 200: int status = filter->status & 0xff; 201: switch (status) { 202: case 0x10: /* 2byte code 2nd char: 0x80-0xbf */ 203: case 0x21: /* 3byte code 3rd char: 0x80-0xbf */ 204: case 0x32: /* 4byte code 4th char: 0x80-0xbf */ 205: filter->status = 0; 206: s = filter->cache | (c & 0x3f); 207: filter->cache = 0; 208: if ((status == 0x10 && s >= 0x80) || 209: (status == 0x21 && s >= 0x800 && (s < 0xd800 || s > 0xdfff)) || 210: (status == 0x32 && s >= 0x10000 && s < 0x110000)) { 211: 212: if (filter->from->no_encoding == mbfl_no_encoding_utf8_docomo && 213: mbfilter_conv_r_map_tbl(s, &s1, mbfl_docomo2uni_pua, 4) > 0) { 214: s = mbfilter_sjis_emoji_docomo2unicode(s1, &snd); 215: } else if (filter->from->no_encoding == mbfl_no_encoding_utf8_kddi_a && 216: mbfilter_conv_r_map_tbl(s, &s1, mbfl_kddi2uni_pua, 7) > 0) { 217: s = mbfilter_sjis_emoji_kddi2unicode(s1, &snd); 218: } else if (filter->from->no_encoding == mbfl_no_encoding_utf8_kddi_b && 219: mbfilter_conv_r_map_tbl(s, &s1, mbfl_kddi2uni_pua_b, 8) > 0) { 220: s = mbfilter_sjis_emoji_kddi2unicode(s1, &snd); 221: } else if (filter->from->no_encoding == mbfl_no_encoding_utf8_sb && 222: mbfilter_conv_r_map_tbl(s, &s1, mbfl_sb2uni_pua, 6) > 0) { 223: s = mbfilter_sjis_emoji_sb2unicode(s1, &snd); 224: } 225: 226: if (snd > 0) { 227: CK((*filter->output_function)(snd, filter->data)); 228: } 229: CK((*filter->output_function)(s, filter->data)); 230: } else { 231: w = s & MBFL_WCSGROUP_MASK; 232: flag = 1; 233: } 234: break; 235: case 0x20: /* 3byte code 2nd char: 0:0xa0-0xbf,D:0x80-9F,1-C,E-F:0x80-0x9f */ 236: s = filter->cache | ((c & 0x3f) << 6); 237: c1 = (s >> 12) & 0xf; 238: if ((c1 == 0x0 && c >= 0xa0) || 239: (c1 == 0xd && c < 0xa0) || 240: (c1 > 0x0 && c1 != 0xd)) { 241: filter->cache = s; 242: filter->status++; 243: } else { 244: w = s & MBFL_WCSGROUP_MASK; 245: flag = 1; 246: } 247: break; 248: case 0x31: /* 4byte code 3rd char: 0x80-0xbf */ 249: filter->cache |= ((c & 0x3f) << 6); 250: filter->status++; 251: break; 252: case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */ 253: s = filter->cache | ((c & 0x3f) << 12); 254: c1 = (s >> 18) & 0x7; 255: if ((c1 == 0x0 && c >= 0x90) || 256: (c1 > 0x0 && c1 < 0x4) || 257: (c1 == 0x4 && c < 0x90)) { 258: filter->cache = s; 259: filter->status++; 260: } else { 261: w = s & MBFL_WCSGROUP_MASK; 262: flag = 1; 263: } 264: break; 265: default: 266: w = c & MBFL_WCSGROUP_MASK; 267: flag = 1; 268: break; 269: } 270: } else if (c < 0xc2) { /* invalid: 0xc0,0xc1 */ 271: w = c & MBFL_WCSGROUP_MASK; 272: flag = 1; 273: } else if (c < 0xe0) { /* 2byte code first char: 0xc2-0xdf */ 274: if (filter->status == 0x0) { 275: filter->status = 0x10; 276: filter->cache = (c & 0x1f) << 6; 277: } else { 278: w = c & MBFL_WCSGROUP_MASK; 279: flag = 1; 280: } 281: } else if (c < 0xf0) { /* 3byte code first char: 0xe0-0xef */ 282: if (filter->status == 0x0) { 283: filter->status = 0x20; 284: filter->cache = (c & 0xf) << 12; 285: } else { 286: w = c & MBFL_WCSGROUP_MASK; 287: flag = 1; 288: } 289: } else if (c < 0xf5) { /* 4byte code first char: 0xf0-0xf4 */ 290: if (filter->status == 0x0) { 291: filter->status = 0x30; 292: filter->cache = (c & 0x7) << 18; 293: } else { 294: w = c & MBFL_WCSGROUP_MASK; 295: flag = 1; 296: } 297: } else { 298: w = c & MBFL_WCSGROUP_MASK; 299: flag = 1; 300: } 301: 302: if (flag) { 303: w |= MBFL_WCSGROUP_THROUGH; 304: CK((*filter->output_function)(w, filter->data)); 305: filter->status = 0; 306: filter->cache = 0; 307: } 308: 309: return c; 310: } 311: 312: /* 313: * wchar => UTF-8 314: */ 315: int mbfl_filt_conv_wchar_utf8_mobile(int c, mbfl_convert_filter *filter) 316: { 317: if (c >= 0 && c < 0x110000) { 318: int s1, c1; 319: 320: if ((filter->to->no_encoding == mbfl_no_encoding_utf8_docomo && 321: mbfilter_unicode2sjis_emoji_docomo(c, &s1, filter) > 0 && 322: mbfilter_conv_map_tbl(s1, &c1, mbfl_docomo2uni_pua, 4) > 0) || 323: (filter->to->no_encoding == mbfl_no_encoding_utf8_kddi_a && 324: mbfilter_unicode2sjis_emoji_kddi(c, &s1, filter) > 0 && 325: mbfilter_conv_map_tbl(s1, &c1, mbfl_kddi2uni_pua, 7) > 0) || 326: (filter->to->no_encoding == mbfl_no_encoding_utf8_kddi_b && 327: mbfilter_unicode2sjis_emoji_kddi(c, &s1, filter) > 0 && 328: mbfilter_conv_map_tbl(s1, &c1, mbfl_kddi2uni_pua_b, 8) > 0) || 329: (filter->to->no_encoding == mbfl_no_encoding_utf8_sb && 330: mbfilter_unicode2sjis_emoji_sb(c, &s1, filter) > 0 && 331: mbfilter_conv_map_tbl(s1, &c1, mbfl_sb2uni_pua, 6) > 0)) { 332: c = c1; 333: } 334: 335: if (filter->status == 1 && filter->cache > 0) { 336: return c; 337: } 338: 339: if (c < 0x80) { 340: CK((*filter->output_function)(c, filter->data)); 341: } else if (c < 0x800) { 342: CK((*filter->output_function)(((c >> 6) & 0x1f) | 0xc0, filter->data)); 343: CK((*filter->output_function)((c & 0x3f) | 0x80, filter->data)); 344: } else if (c < 0x10000) { 345: CK((*filter->output_function)(((c >> 12) & 0x0f) | 0xe0, filter->data)); 346: CK((*filter->output_function)(((c >> 6) & 0x3f) | 0x80, filter->data)); 347: CK((*filter->output_function)((c & 0x3f) | 0x80, filter->data)); 348: } else { 349: CK((*filter->output_function)(((c >> 18) & 0x07) | 0xf0, filter->data)); 350: CK((*filter->output_function)(((c >> 12) & 0x3f) | 0x80, filter->data)); 351: CK((*filter->output_function)(((c >> 6) & 0x3f) | 0x80, filter->data)); 352: CK((*filter->output_function)((c & 0x3f) | 0x80, filter->data)); 353: } 354: } else { 355: if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) { 356: CK(mbfl_filt_conv_illegal_output(c, filter)); 357: } 358: } 359: 360: return c; 361: } 362: