Annotation of embedaddon/php/Zend/zend_multibyte.c, revision 1.1
1.1 ! misho 1: /*
! 2: +----------------------------------------------------------------------+
! 3: | Zend Engine |
! 4: +----------------------------------------------------------------------+
! 5: | Copyright (c) 1998-2012 Zend Technologies Ltd. (http://www.zend.com) |
! 6: +----------------------------------------------------------------------+
! 7: | This source file is subject to version 2.00 of the Zend license, |
! 8: | that is bundled with this package in the file LICENSE, and is |
! 9: | available through the world-wide-web at |
! 10: | http://www.zend.com/license/2_00.txt. |
! 11: | If you did not receive a copy of the Zend license and are unable to |
! 12: | obtain it through the world-wide-web, please send a note to |
! 13: | license@zend.com so we can mail you a copy immediately. |
! 14: +----------------------------------------------------------------------+
! 15: | Authors: Masaki Fujimoto <fujimoto@php.net> |
! 16: | Rui Hirokawa <hirokawa@php.net> |
! 17: +----------------------------------------------------------------------+
! 18: */
! 19:
! 20: /* $Id: zend_multibyte.c 321634 2012-01-01 13:15:04Z felipe $ */
! 21:
! 22: #include "zend.h"
! 23: #include "zend_compile.h"
! 24: #include "zend_operators.h"
! 25: #include "zend_multibyte.h"
! 26:
! 27: #ifdef ZEND_MULTIBYTE
! 28: static size_t zend_multibyte_encoding_filter(unsigned char **to, size_t *to_length, const char *to_encoding, const unsigned char *from, size_t from_length, const char *from_encoding TSRMLS_DC);
! 29: size_t sjis_input_filter(unsigned char **buf, size_t *length, const unsigned char *sjis, size_t sjis_length TSRMLS_DC);
! 30: size_t sjis_output_filter(unsigned char **buf, size_t *length, const unsigned char *sjis, size_t sjis_length TSRMLS_DC);
! 31: static char* zend_multibyte_assemble_encoding_list(zend_encoding **encoding_list, size_t encoding_list_size);
! 32: static int zend_multibyte_parse_encoding_list(const char *encoding_list,
! 33: size_t encoding_list_size, zend_encoding ***result, size_t *result_size);
! 34: static zend_encoding *zend_multibyte_find_script_encoding(zend_encoding *onetime_encoding TSRMLS_DC);
! 35: static zend_encoding *zend_multibyte_detect_unicode(TSRMLS_D);
! 36: static zend_encoding *zend_multibyte_detect_utf_encoding(const unsigned char *script, size_t script_size TSRMLS_DC);
! 37:
! 38: /*
! 39: * encodings
! 40: */
! 41: static const char *ucs2_aliases[] = {"ISO-10646-UCS-2", "UCS2" , "UNICODE", NULL};
! 42: static zend_encoding encoding_ucs2 = {
! 43: NULL,
! 44: NULL,
! 45: "UCS-2",
! 46: (const char *(*)[])&ucs2_aliases,
! 47: 0
! 48: };
! 49:
! 50: static zend_encoding encoding_ucs2be = {
! 51: NULL,
! 52: NULL,
! 53: "UCS-2BE",
! 54: NULL,
! 55: 0
! 56: };
! 57:
! 58: static zend_encoding encoding_ucs2le = {
! 59: NULL,
! 60: NULL,
! 61: "UCS-2LE",
! 62: NULL,
! 63: 0
! 64: };
! 65:
! 66: static const char *ucs4_aliases[] = {"ISO-10646-UCS-4", "UCS4", NULL};
! 67: static zend_encoding encoding_ucs4 = {
! 68: NULL,
! 69: NULL,
! 70: "UCS-4",
! 71: (const char *(*)[])&ucs4_aliases,
! 72: 0
! 73: };
! 74:
! 75: static zend_encoding encoding_ucs4be = {
! 76: NULL,
! 77: NULL,
! 78: "UCS-4BE",
! 79: NULL,
! 80: 0
! 81: };
! 82:
! 83: static zend_encoding encoding_ucs4le = {
! 84: NULL,
! 85: NULL,
! 86: "UCS-4LE",
! 87: NULL,
! 88: 0
! 89: };
! 90:
! 91: static const char *utf32_aliases[] = {"utf32", NULL};
! 92: static zend_encoding encoding_utf32 = {
! 93: NULL,
! 94: NULL,
! 95: "UTF-32",
! 96: (const char *(*)[])&utf32_aliases,
! 97: 0
! 98: };
! 99:
! 100: static zend_encoding encoding_utf32be = {
! 101: NULL,
! 102: NULL,
! 103: "UTF-32BE",
! 104: NULL,
! 105: 0
! 106: };
! 107:
! 108: static zend_encoding encoding_utf32le = {
! 109: NULL,
! 110: NULL,
! 111: "UTF-32LE",
! 112: NULL,
! 113: 0
! 114: };
! 115:
! 116: static const char *utf16_aliases[] = {"utf16", NULL};
! 117: static zend_encoding encoding_utf16 = {
! 118: NULL,
! 119: NULL,
! 120: "UTF-16",
! 121: (const char *(*)[])&utf16_aliases,
! 122: 0
! 123: };
! 124:
! 125: static zend_encoding encoding_utf16be = {
! 126: NULL,
! 127: NULL,
! 128: "UTF-16BE",
! 129: NULL,
! 130: 0
! 131: };
! 132:
! 133: static zend_encoding encoding_utf16le = {
! 134: NULL,
! 135: NULL,
! 136: "UTF-16LE",
! 137: NULL,
! 138: 0
! 139: };
! 140:
! 141: static const char *utf8_aliases[] = {"utf8", NULL};
! 142: static zend_encoding encoding_utf8 = {
! 143: NULL,
! 144: NULL,
! 145: "UTF-8",
! 146: (const char *(*)[])&utf8_aliases,
! 147: 1
! 148: };
! 149:
! 150: static const char *ascii_aliases[] = {"ANSI_X3.4-1968", "iso-ir-6", "ANSI_X3.4-1986", "ISO_646.irv:1991", "US-ASCII", "ISO646-US", "us", "IBM367", "cp367", "csASCII", NULL};
! 151: static zend_encoding encoding_ascii = {
! 152: NULL,
! 153: NULL,
! 154: "ASCII",
! 155: (const char *(*)[])&ascii_aliases,
! 156: 1
! 157: };
! 158:
! 159: static const char *euc_jp_aliases[] = {"EUC", "EUC_JP", "eucJP", "x-euc-jp", NULL};
! 160: static zend_encoding encoding_euc_jp = {
! 161: NULL,
! 162: NULL,
! 163: "EUC-JP",
! 164: (const char *(*)[])&euc_jp_aliases,
! 165: 1
! 166: };
! 167:
! 168: static const char *sjis_aliases[] = {"x-sjis", "SJIS", "SHIFT-JIS", NULL};
! 169: static zend_encoding encoding_sjis = {
! 170: sjis_input_filter,
! 171: sjis_output_filter,
! 172: "Shift_JIS",
! 173: (const char *(*)[])&sjis_aliases,
! 174: 0
! 175: };
! 176:
! 177: static const char *eucjp_win_aliases[] = {"eucJP-open", NULL};
! 178: static zend_encoding encoding_eucjp_win = {
! 179: NULL,
! 180: NULL,
! 181: "eucJP-win",
! 182: (const char *(*)[])&eucjp_win_aliases,
! 183: 1
! 184: };
! 185:
! 186: static const char *sjis_win_aliases[] = {"SJIS-open", "MS_Kanji", "Windows-31J", "CP932", NULL};
! 187: static zend_encoding encoding_sjis_win = {
! 188: /* sjis-filters does not care about diffs of Shift_JIS and CP932 */
! 189: sjis_input_filter,
! 190: sjis_output_filter,
! 191: "SJIS-win",
! 192: (const char *(*)[])&sjis_win_aliases,
! 193: 0
! 194: };
! 195:
! 196: static const char *jis_aliases[] = {"ISO-2022-JP", NULL};
! 197: static zend_encoding encoding_jis = {
! 198: NULL,
! 199: NULL,
! 200: "JIS",
! 201: (const char *(*)[])&jis_aliases,
! 202: 0
! 203: };
! 204:
! 205: static const char *euc_cn_aliases[] = {"CN-GB", "EUC_CN", "eucCN", "x-euc-cn", "gb2312", NULL};
! 206: static zend_encoding encoding_euc_cn = {
! 207: NULL,
! 208: NULL,
! 209: "EUC-CN",
! 210: (const char *(*)[])&euc_cn_aliases,
! 211: 1
! 212: };
! 213:
! 214: static const char *cp936_aliases[] = {"CP-936", NULL};
! 215: static zend_encoding encoding_cp936 = {
! 216: NULL,
! 217: NULL,
! 218: "CP936",
! 219: (const char *(*)[])&cp936_aliases,
! 220: 0
! 221: };
! 222:
! 223: static const char *hz_aliases[] = {"HZ-GB-2312", NULL};
! 224: static zend_encoding encoding_hz = {
! 225: NULL,
! 226: NULL,
! 227: "HZ",
! 228: (const char *(*)[])&hz_aliases,
! 229: 0
! 230: };
! 231:
! 232: static const char *euc_tw_aliases[] = {"EUC_TW", "eucTW", "x-euc-tw", NULL};
! 233: static zend_encoding encoding_euc_tw = {
! 234: NULL,
! 235: NULL,
! 236: "EUC-TW",
! 237: (const char *(*)[])&euc_tw_aliases,
! 238: 1
! 239: };
! 240:
! 241: static const char *big5_aliases[] = {"BIG5", "CN-BIG5", "BIG-FIVE", "BIGFIVE", "CP950", NULL};
! 242: static zend_encoding encoding_big5 = {
! 243: NULL,
! 244: NULL,
! 245: "BIG-5",
! 246: (const char *(*)[])&big5_aliases,
! 247: 0
! 248: };
! 249:
! 250: static const char *euc_kr_aliases[] = {"EUC_KR", "eucKR", "x-euc-kr", NULL};
! 251: static zend_encoding encoding_euc_kr = {
! 252: NULL,
! 253: NULL,
! 254: "EUC-KR",
! 255: (const char *(*)[])&euc_kr_aliases,
! 256: 1
! 257: };
! 258:
! 259: static const char *uhc_aliases[] = {"CP949", NULL};
! 260: static zend_encoding encoding_uhc = {
! 261: NULL,
! 262: NULL,
! 263: "UHC",
! 264: (const char *(*)[])&uhc_aliases,
! 265: 1
! 266: };
! 267:
! 268: static zend_encoding encoding_2022kr = {
! 269: NULL,
! 270: NULL,
! 271: "ISO-2022-KR",
! 272: NULL,
! 273: 0
! 274: };
! 275:
! 276: static const char *cp1252_aliases[] = {"cp1252", NULL};
! 277: static zend_encoding encoding_cp1252 = {
! 278: NULL,
! 279: NULL,
! 280: "Windows-1252",
! 281: (const char *(*)[])&cp1252_aliases,
! 282: 1
! 283: };
! 284:
! 285: static const char *iso_8859_1_aliases[] = {"ISO_8859-1", "latin1", NULL};
! 286: static zend_encoding encoding_8859_1 = {
! 287: NULL,
! 288: NULL,
! 289: "ISO-8859-1",
! 290: (const char *(*)[])&iso_8859_1_aliases,
! 291: 1
! 292: };
! 293:
! 294: static const char *iso_8859_2_aliases[] = {"ISO_8859-2", "latin2", NULL};
! 295: static zend_encoding encoding_8859_2 = {
! 296: NULL,
! 297: NULL,
! 298: "ISO-8859-2",
! 299: (const char *(*)[])&iso_8859_2_aliases,
! 300: 1
! 301: };
! 302:
! 303: static const char *iso_8859_3_aliases[] = {"ISO_8859-3", "latin3", NULL};
! 304: static zend_encoding encoding_8859_3 = {
! 305: NULL,
! 306: NULL,
! 307: "ISO-8859-3",
! 308: (const char *(*)[])&iso_8859_3_aliases,
! 309: 1
! 310: };
! 311:
! 312: static const char *iso_8859_4_aliases[] = {"ISO_8859-4", "latin4", NULL};
! 313: static zend_encoding encoding_8859_4 = {
! 314: NULL,
! 315: NULL,
! 316: "ISO-8859-4",
! 317: (const char *(*)[])&iso_8859_4_aliases,
! 318: 1
! 319: };
! 320:
! 321: static const char *iso_8859_5_aliases[] = {"ISO_8859-5", "cyrillic", NULL};
! 322: static zend_encoding encoding_8859_5 = {
! 323: NULL,
! 324: NULL,
! 325: "ISO-8859-5",
! 326: (const char *(*)[])&iso_8859_5_aliases,
! 327: 1
! 328: };
! 329:
! 330: static const char *iso_8859_6_aliases[] = {"ISO_8859-6", "arabic", NULL};
! 331: static zend_encoding encoding_8859_6 = {
! 332: NULL,
! 333: NULL,
! 334: "ISO-8859-6",
! 335: (const char *(*)[])&iso_8859_6_aliases,
! 336: 1
! 337: };
! 338:
! 339: static const char *iso_8859_7_aliases[] = {"ISO_8859-7", "greek", NULL};
! 340: static zend_encoding encoding_8859_7 = {
! 341: NULL,
! 342: NULL,
! 343: "ISO-8859-7",
! 344: (const char *(*)[])&iso_8859_7_aliases,
! 345: 1
! 346: };
! 347:
! 348: static const char *iso_8859_8_aliases[] = {"ISO_8859-8", "hebrew", NULL};
! 349: static zend_encoding encoding_8859_8 = {
! 350: NULL,
! 351: NULL,
! 352: "ISO-8859-8",
! 353: (const char *(*)[])&iso_8859_8_aliases,
! 354: 1
! 355: };
! 356:
! 357: static const char *iso_8859_9_aliases[] = {"ISO_8859-9", "latin5", NULL};
! 358: static zend_encoding encoding_8859_9 = {
! 359: NULL,
! 360: NULL,
! 361: "ISO-8859-9",
! 362: (const char *(*)[])&iso_8859_9_aliases,
! 363: 1
! 364: };
! 365:
! 366: static const char *iso_8859_10_aliases[] = {"ISO_8859-10", "latin6", NULL};
! 367: static zend_encoding encoding_8859_10 = {
! 368: NULL,
! 369: NULL,
! 370: "ISO-8859-10",
! 371: (const char *(*)[])&iso_8859_10_aliases,
! 372: 1
! 373: };
! 374:
! 375: static const char *iso_8859_13_aliases[] = {"ISO_8859-13", NULL};
! 376: static zend_encoding encoding_8859_13 = {
! 377: NULL,
! 378: NULL,
! 379: "ISO-8859-13",
! 380: (const char *(*)[])&iso_8859_13_aliases,
! 381: 1
! 382: };
! 383:
! 384: static const char *iso_8859_14_aliases[] = {"ISO_8859-14", "latin8", NULL};
! 385: static zend_encoding encoding_8859_14 = {
! 386: NULL,
! 387: NULL,
! 388: "ISO-8859-14",
! 389: (const char *(*)[])&iso_8859_14_aliases,
! 390: 1
! 391: };
! 392:
! 393: static const char *iso_8859_15_aliases[] = {"ISO_8859-15", NULL};
! 394: static zend_encoding encoding_8859_15 = {
! 395: NULL,
! 396: NULL,
! 397: "ISO-8859-15",
! 398: (const char *(*)[])&iso_8859_15_aliases,
! 399: 1
! 400: };
! 401:
! 402: static const char *cp1251_aliases[] = {"CP1251", "CP-1251", "WINDOWS-1251", NULL};
! 403: static zend_encoding encoding_cp1251 = {
! 404: NULL,
! 405: NULL,
! 406: "Windows-1251",
! 407: (const char *(*)[])&cp1251_aliases,
! 408: 1
! 409: };
! 410:
! 411: static const char *cp866_aliases[] = {"CP866", "CP-866", "IBM-866", NULL};
! 412: static zend_encoding encoding_cp866 = {
! 413: NULL,
! 414: NULL,
! 415: "CP866",
! 416: (const char *(*)[])&cp866_aliases,
! 417: 1
! 418: };
! 419:
! 420: static const char *koi8r_aliases[] = {"KOI8-R", "KOI8R", NULL};
! 421: static zend_encoding encoding_koi8r = {
! 422: NULL,
! 423: NULL,
! 424: "KOI8-R",
! 425: (const char *(*)[])&koi8r_aliases,
! 426: 1
! 427: };
! 428:
! 429: static const char *koi8u_aliases[] = {"KOI8-U", "KOI8U", NULL};
! 430: static zend_encoding encoding_koi8u = {
! 431: NULL,
! 432: NULL,
! 433: "KOI8-U",
! 434: (const char *(*)[])&koi8u_aliases,
! 435: 1
! 436: };
! 437:
! 438: static const char *cp1254_aliases[] = {"cp1254", NULL};
! 439: static zend_encoding encoding_cp1254 = {
! 440: NULL,
! 441: NULL,
! 442: "Windows-1254",
! 443: (const char *(*)[])&cp1254_aliases,
! 444: 1
! 445: };
! 446:
! 447: static const char *armscii8_aliases[] = { "ArmSCII8", "ARMSCII-8", "ARMSCII8", NULL};
! 448: static zend_encoding encoding_armscii8 = {
! 449: NULL,
! 450: NULL,
! 451: "ArmSCII-8",
! 452: (const char *(*)[])&armscii8_aliases,
! 453: 1
! 454: };
! 455:
! 456: static const char *cp850_aliases[] = {"IBM850", NULL};
! 457: static zend_encoding encoding_cp850 = {
! 458: NULL,
! 459: NULL,
! 460: "CP850",
! 461: (const char *(*)[])&cp850_aliases,
! 462: 1
! 463: };
! 464:
! 465: static zend_encoding *zend_encoding_table[] = {
! 466: &encoding_ucs4,
! 467: &encoding_ucs4be,
! 468: &encoding_ucs4le,
! 469: &encoding_ucs2,
! 470: &encoding_ucs2be,
! 471: &encoding_ucs2le,
! 472: &encoding_utf32,
! 473: &encoding_utf32be,
! 474: &encoding_utf32le,
! 475: &encoding_utf16,
! 476: &encoding_utf16be,
! 477: &encoding_utf16le,
! 478: &encoding_utf8,
! 479: &encoding_ascii,
! 480: &encoding_euc_jp,
! 481: &encoding_sjis,
! 482: &encoding_eucjp_win,
! 483: &encoding_sjis_win,
! 484: &encoding_jis,
! 485: &encoding_cp1252,
! 486: &encoding_8859_1,
! 487: &encoding_8859_2,
! 488: &encoding_8859_3,
! 489: &encoding_8859_4,
! 490: &encoding_8859_5,
! 491: &encoding_8859_6,
! 492: &encoding_8859_7,
! 493: &encoding_8859_8,
! 494: &encoding_8859_9,
! 495: &encoding_8859_10,
! 496: &encoding_8859_13,
! 497: &encoding_8859_14,
! 498: &encoding_8859_15,
! 499: &encoding_euc_cn,
! 500: &encoding_cp936,
! 501: &encoding_hz,
! 502: &encoding_euc_tw,
! 503: &encoding_big5,
! 504: &encoding_euc_kr,
! 505: &encoding_uhc,
! 506: &encoding_2022kr,
! 507: &encoding_cp1251,
! 508: &encoding_cp866,
! 509: &encoding_koi8r,
! 510: &encoding_koi8u,
! 511: &encoding_armscii8,
! 512: &encoding_cp1254,
! 513: &encoding_cp850,
! 514: NULL
! 515: };
! 516:
! 517:
! 518:
! 519: ZEND_API int zend_multibyte_set_script_encoding(const char *encoding_list,
! 520: size_t encoding_list_size TSRMLS_DC)
! 521: {
! 522: if (CG(script_encoding_list)) {
! 523: efree(CG(script_encoding_list));
! 524: CG(script_encoding_list) = NULL;
! 525: }
! 526: CG(script_encoding_list_size) = 0;
! 527:
! 528: if (!encoding_list) {
! 529: return 0;
! 530: }
! 531:
! 532: zend_multibyte_parse_encoding_list(encoding_list, encoding_list_size, &(CG(script_encoding_list)), &(CG(script_encoding_list_size)));
! 533:
! 534: return 0;
! 535: }
! 536:
! 537:
! 538: ZEND_API int zend_multibyte_set_internal_encoding(const char *encoding_name TSRMLS_DC)
! 539: {
! 540: CG(internal_encoding) = zend_multibyte_fetch_encoding(encoding_name);
! 541: return 0;
! 542: }
! 543:
! 544: ZEND_API int zend_multibyte_set_functions(zend_encoding_detector encoding_detector, zend_encoding_converter encoding_converter, zend_encoding_oddlen encoding_oddlen TSRMLS_DC)
! 545: {
! 546: CG(encoding_detector) = encoding_detector;
! 547: CG(encoding_converter) = encoding_converter;
! 548: CG(encoding_oddlen) = encoding_oddlen;
! 549: return 0;
! 550: }
! 551:
! 552:
! 553: ZEND_API int zend_multibyte_set_filter(zend_encoding *onetime_encoding TSRMLS_DC)
! 554: {
! 555: LANG_SCNG(script_encoding) = zend_multibyte_find_script_encoding(onetime_encoding TSRMLS_CC);
! 556: LANG_SCNG(internal_encoding) = CG(internal_encoding);
! 557:
! 558: /* judge input/output filter */
! 559: LANG_SCNG(input_filter) = NULL;
! 560: LANG_SCNG(output_filter) = NULL;
! 561:
! 562: if (!LANG_SCNG(script_encoding)) {
! 563: return 0;
! 564: }
! 565:
! 566: if (!LANG_SCNG(internal_encoding) || LANG_SCNG(script_encoding) == LANG_SCNG(internal_encoding)) {
! 567: /* if encoding specfic filters exist, use them */
! 568: if (LANG_SCNG(script_encoding)->input_filter && LANG_SCNG(script_encoding)->output_filter) {
! 569: LANG_SCNG(input_filter) = LANG_SCNG(script_encoding)->input_filter;
! 570: LANG_SCNG(output_filter) = LANG_SCNG(script_encoding)->output_filter;
! 571: return 0;
! 572: }
! 573:
! 574: if (!LANG_SCNG(script_encoding)->compatible) {
! 575: /* and if not, work around w/ script_encoding -> utf-8 -> script_encoding conversion */
! 576: LANG_SCNG(internal_encoding) = LANG_SCNG(script_encoding);
! 577: LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter;
! 578: LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter;
! 579: return 0;
! 580: } else {
! 581: /* nothing to do in this case */
! 582: return 0;
! 583: }
! 584: }
! 585:
! 586: /* LANG_SCNG(internal_encoding) cannot be NULL here */
! 587: if (LANG_SCNG(internal_encoding)->compatible) {
! 588: LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter;
! 589: return 0;
! 590: } else if (LANG_SCNG(script_encoding)->compatible) {
! 591: LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter;
! 592: return 0;
! 593: }
! 594:
! 595: /* both script and internal encodings are incompatible w/ flex */
! 596: LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter;
! 597: LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter;
! 598:
! 599: return 0;
! 600: }
! 601:
! 602:
! 603: ZEND_API zend_encoding* zend_multibyte_fetch_encoding(const char *encoding_name)
! 604: {
! 605: int i, j;
! 606: zend_encoding *encoding;
! 607:
! 608: if (!encoding_name) {
! 609: return NULL;
! 610: }
! 611:
! 612: for (i = 0; (encoding = zend_encoding_table[i]) != NULL; i++) {
! 613: if (zend_binary_strcasecmp(encoding->name, strlen(encoding->name), encoding_name, strlen(encoding_name)) == 0) {
! 614: return encoding;
! 615: }
! 616: }
! 617:
! 618: for (i = 0; (encoding = zend_encoding_table[i]) != NULL; i++) {
! 619: if (encoding->aliases != NULL) {
! 620: for (j = 0; (*encoding->aliases)[j] != NULL; j++) {
! 621: if (zend_binary_strcasecmp((*encoding->aliases)[j], strlen((*encoding->aliases)[j]), encoding_name, strlen(encoding_name)) == 0) {
! 622: return encoding;
! 623: }
! 624: }
! 625: }
! 626: }
! 627:
! 628: return NULL;
! 629: }
! 630:
! 631:
! 632: ZEND_API size_t zend_multibyte_script_encoding_filter(unsigned char **to, size_t
! 633: *to_length, const unsigned char *from, size_t from_length TSRMLS_DC)
! 634: {
! 635: const char *name;
! 636:
! 637: if (LANG_SCNG(internal_encoding) == NULL || LANG_SCNG(internal_encoding)->compatible == 0) {
! 638: name = "UTF-8";
! 639: } else {
! 640: name = LANG_SCNG(internal_encoding)->name;
! 641: }
! 642:
! 643: return zend_multibyte_encoding_filter(to, to_length, name, from, from_length, LANG_SCNG(script_encoding)->name TSRMLS_CC);
! 644: }
! 645:
! 646: ZEND_API size_t zend_multibyte_internal_encoding_filter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length TSRMLS_DC)
! 647: {
! 648: const char *name;
! 649:
! 650: if (LANG_SCNG(script_encoding)->compatible == 0) {
! 651: name = "UTF-8";
! 652: } else {
! 653: name = LANG_SCNG(script_encoding)->name;
! 654: }
! 655:
! 656: return zend_multibyte_encoding_filter(to, to_length, LANG_SCNG(internal_encoding)->name, from, from_length, name TSRMLS_CC);
! 657: }
! 658:
! 659: static size_t zend_multibyte_encoding_filter(unsigned char **to, size_t *to_length, const char *to_encoding, const unsigned char *from, size_t from_length, const char *from_encoding TSRMLS_DC)
! 660: {
! 661: size_t oddlen;
! 662:
! 663: if (!CG(encoding_converter)) {
! 664: return 0;
! 665: }
! 666:
! 667: if (CG(encoding_oddlen)) {
! 668: oddlen = CG(encoding_oddlen)(from, from_length, from_encoding TSRMLS_CC);
! 669: if (oddlen > 0) {
! 670: from_length -= oddlen;
! 671: }
! 672: }
! 673:
! 674: if (CG(encoding_converter)(to, to_length, from, from_length, to_encoding, from_encoding TSRMLS_CC) != 0) {
! 675: return 0;
! 676: }
! 677:
! 678: return from_length;
! 679: }
! 680:
! 681:
! 682: /*
! 683: * Shift_JIS Input/Output Filter
! 684: */
/*
 * Shift_JIS byte classification, indexed directly by the byte value
 * (256 entries).  The entries of interest are 0x80-0x9f and 0xE0-0xFF:
 *   1    - not a lead byte
 *   2    - lead byte of a regular 2 byte character (0x80-0x9f, 0xE0-0xEF)
 *   3-5,0 - remaining high bytes (0xF0-0xFF)
 */
static const unsigned char table_sjis[] = {
	/* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xF0 */ 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 0, 0, 0
};
! 703:
! 704: size_t sjis_input_filter(unsigned char **buf, size_t *length, const unsigned char *sjis, size_t sjis_length TSRMLS_DC)
! 705: {
! 706: const unsigned char *p;
! 707: unsigned char *q;
! 708: unsigned char c1, c2;
! 709:
! 710: *buf = (unsigned char*)emalloc(sjis_length * 3 / 2 + 1);
! 711: if (!*buf)
! 712: return 0;
! 713: *length = 0;
! 714:
! 715: p = sjis;
! 716: q = *buf;
! 717:
! 718: /* convert [SJIS -> EUC-JP] (for lex scan) -- some other better ways? */
! 719: while (*p && (p - sjis) < sjis_length) {
! 720: if (!(*p & 0x80)) {
! 721: *q++ = *p++;
! 722: continue;
! 723: }
! 724:
! 725: /* handling 8 bit code */
! 726: if (table_sjis[*p] == 1) {
! 727: /* 1 byte kana */
! 728: *q++ = 0x8e;
! 729: *q++ = *p++;
! 730: continue;
! 731: }
! 732:
! 733: if (!*(p+1)) {
! 734: *q++ = *p++;
! 735: break;
! 736: }
! 737:
! 738: if (table_sjis[*p] == 2) {
! 739: /* 2 byte kanji code */
! 740: c1 = *p++;
! 741: if (!*p || (p - sjis) >= sjis_length) {
! 742: break;
! 743: }
! 744: c2 = *p++;
! 745: c1 -= (c1 <= 0x9f) ? 0x71 : 0xb1;
! 746: c1 = (c1 << 1) + 1;
! 747: if (c2 >= 0x9e) {
! 748: c2 -= 0x7e;
! 749: c1++;
! 750: } else if (c2 > 0x7f) {
! 751: c2 -= 0x20;
! 752: } else {
! 753: c2 -= 0x1f;
! 754: }
! 755:
! 756: c1 |= 0x80;
! 757: c2 |= 0x80;
! 758:
! 759: *q++ = c1;
! 760: *q++ = c2;
! 761: } else {
! 762: /*
! 763: * for user defined chars (ATTENTION)
! 764: *
! 765: * THESE ARE NOT CODE FOR CONVERSION! :-P
! 766: * (using *ILLEGALLY* 3byte EUC-JP space)
! 767: *
! 768: * we cannot perfectly (== 1 to 1) convert these chars to EUC-JP.
! 769: * so, these code are for perfect RESTORING in sjis_output_filter()
! 770: */
! 771: c1 = *p++;
! 772: if (!*p || (p - sjis) >= sjis_length) {
! 773: break;
! 774: }
! 775: c2 = *p++;
! 776: *q++ = 0x8f;
! 777: /*
! 778: * MAP TO (EUC-JP):
! 779: * type A: 0xeba1 - 0xf4fe
! 780: * type B: 0xf5a1 - 0xfefe
! 781: * type C: 0xa1a1 - 0xa6fe
! 782: */
! 783: c1 -= (c1 > 0xf9) ? (0x79+0x71) : (0x0a+0xb1);
! 784: c1 = (c1 << 1) + 1;
! 785: if (c2 >= 0x9e) {
! 786: c2 -= 0x7e;
! 787: c1++;
! 788: } else if (c2 > 0x7f) {
! 789: c2 -= 0x20;
! 790: } else {
! 791: c2 -= 0x1f;
! 792: }
! 793:
! 794: c1 |= 0x80;
! 795: c2 |= 0x80;
! 796:
! 797: *q++ = c1;
! 798: *q++ = c2;
! 799: }
! 800: }
! 801: *q = '\0';
! 802: *length = q - *buf;
! 803:
! 804: return *length;
! 805: }
! 806:
/*
 * EUC-JP byte classification, indexed directly by the byte value
 * (256 entries):
 *   2 - 0x8e, and 0xA1-0xFE (lead byte of a 2 byte character)
 *   3 - 0x8f
 *   1 - everything else
 */
static const unsigned char table_eucjp[] = {
	/* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
	/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xA0 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xB0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xF0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
! 825:
! 826: size_t sjis_output_filter(unsigned char **sjis, size_t *sjis_length, const unsigned char *buf, size_t length TSRMLS_DC)
! 827: {
! 828: unsigned char c1, c2;
! 829: unsigned char *p;
! 830: const unsigned char *q;
! 831:
! 832: if (!sjis || !sjis_length) {
! 833: return 0;
! 834: }
! 835:
! 836: /* always Shift_JIS <= EUC-JP */
! 837: *sjis = (unsigned char*)emalloc(length+1);
! 838: if (!sjis) {
! 839: return 0;
! 840: }
! 841: p = *sjis;
! 842: q = buf;
! 843:
! 844: /* restore converted strings [EUC-JP -> Shift_JIS] */
! 845: while (*q && (q - buf) < length) {
! 846: if (!(*q & 0x80)) {
! 847: *p++ = *q++;
! 848: continue;
! 849: }
! 850:
! 851: /* hankaku kana */
! 852: if (*q == 0x8e) {
! 853: q++;
! 854: if (*q) {
! 855: *p++ = *q++;
! 856: }
! 857: continue;
! 858: }
! 859:
! 860: /* 2 byte kanji code */
! 861: if (table_eucjp[*q] == 2) {
! 862: c1 = (*q++ & ~0x80) & 0xff;
! 863: if (*q) {
! 864: c2 = (*q++ & ~0x80) & 0xff;
! 865: } else {
! 866: q--;
! 867: break;
! 868: }
! 869:
! 870: c2 += (c1 & 0x01) ? 0x1f : 0x7d;
! 871: if (c2 >= 0x7f) {
! 872: c2++;
! 873: }
! 874: c1 = ((c1 - 0x21) >> 1) + 0x81;
! 875: if (c1 > 0x9f) {
! 876: c1 += 0x40;
! 877: }
! 878:
! 879: *p++ = c1;
! 880: *p++ = c2;
! 881: continue;
! 882: }
! 883:
! 884: if (*q == 0x8f) {
! 885: q++;
! 886: if (*q) {
! 887: c1 = (*q++ & ~0x80) & 0xff;
! 888: } else {
! 889: q--;
! 890: break;
! 891: }
! 892: if (*q) {
! 893: c2 = (*q++ & ~0x80) & 0xff;
! 894: } else {
! 895: q -= 2;
! 896: break;
! 897: }
! 898:
! 899: c2 += (c1 & 0x01) ? 0x1f : 0x7d;
! 900: if (c2 >= 0x7f) {
! 901: c2++;
! 902: }
! 903: c1 = ((c1 - 0x21) >> 1) + 0x81;
! 904: if (c1 > 0x9f) {
! 905: c1 += 0x40;
! 906: }
! 907:
! 908: if (c1 >= 0x81 && c1 <= 0x9f) {
! 909: c1 += 0x79;
! 910: } else {
! 911: c1 += 0x0a;
! 912: }
! 913:
! 914: *p++ = c1;
! 915: *p++ = c2;
! 916: continue;
! 917: }
! 918:
! 919: /* some other chars (may not happen) */
! 920: *p++ = *q++;
! 921: }
! 922: *p = '\0';
! 923: *sjis_length = p - *sjis;
! 924:
! 925: return q-buf; /* return length we actually read */
! 926: }
! 927:
! 928:
! 929: static char *zend_multibyte_assemble_encoding_list(zend_encoding **encoding_list, size_t encoding_list_size)
! 930: {
! 931: int i, list_size = 0;
! 932: const char *name;
! 933: char *list = NULL;
! 934:
! 935: if (!encoding_list || !encoding_list_size) {
! 936: return NULL;
! 937: }
! 938:
! 939: for (i = 0; i < encoding_list_size; i++) {
! 940: name = (*(encoding_list+i))->name;
! 941: if (name) {
! 942: list_size += strlen(name) + 1;
! 943: if (!list) {
! 944: list = (char*)emalloc(list_size);
! 945: if (!list) {
! 946: return NULL;
! 947: }
! 948: *list = '\0';
! 949: } else {
! 950: list = (char*)erealloc(list, list_size);
! 951: if (!list) {
! 952: return NULL;
! 953: }
! 954: strcat(list, ",");
! 955: }
! 956: strcat(list, name);
! 957: }
! 958: }
! 959: return list;
! 960: }
! 961:
! 962:
! 963: static int zend_multibyte_parse_encoding_list(const char *encoding_list,
! 964: size_t encoding_list_size, zend_encoding ***result, size_t *result_size)
! 965: {
! 966: int n, size;
! 967: char *p, *p1, *p2, *endp, *tmpstr;
! 968: zend_encoding **list, **entry, *encoding;
! 969:
! 970: list = NULL;
! 971: if (encoding_list == NULL || encoding_list_size <= 0) {
! 972: return -1;
! 973: } else {
! 974: /* copy the encoding_list string for work */
! 975: tmpstr = (char *)estrndup(encoding_list, encoding_list_size);
! 976: if (tmpstr == NULL) {
! 977: return -1;
! 978: }
! 979: /* count the number of listed encoding names */
! 980: endp = tmpstr + encoding_list_size;
! 981: n = 1;
! 982: p1 = tmpstr;
! 983: while ((p2 = zend_memnstr(p1, ",", 1, endp)) != NULL) {
! 984: p1 = p2 + 1;
! 985: n++;
! 986: }
! 987: size = n;
! 988: /* make list */
! 989: list = (zend_encoding**)ecalloc(size, sizeof(zend_encoding*));
! 990: if (list != NULL) {
! 991: entry = list;
! 992: n = 0;
! 993: p1 = tmpstr;
! 994: do {
! 995: p2 = p = zend_memnstr(p1, ",", 1, endp);
! 996: if (p == NULL) {
! 997: p = endp;
! 998: }
! 999: *p = '\0';
! 1000: /* trim spaces */
! 1001: while (p1 < p && (*p1 == ' ' || *p1 == '\t')) {
! 1002: p1++;
! 1003: }
! 1004: p--;
! 1005: while (p > p1 && (*p == ' ' || *p == '\t')) {
! 1006: *p = '\0';
! 1007: p--;
! 1008: }
! 1009: /* convert to the encoding number and check encoding */
! 1010: encoding = zend_multibyte_fetch_encoding(p1);
! 1011: if (encoding)
! 1012: {
! 1013: *entry++ = encoding;
! 1014: n++;
! 1015: }
! 1016: p1 = p2 + 1;
! 1017: } while (n < size && p2 != NULL);
! 1018: *result = list;
! 1019: *result_size = n;
! 1020: }
! 1021: efree(tmpstr);
! 1022: }
! 1023:
! 1024: if (list == NULL) {
! 1025: return -1;
! 1026: }
! 1027:
! 1028: return 0;
! 1029: }
! 1030:
! 1031:
! 1032: static zend_encoding* zend_multibyte_find_script_encoding(zend_encoding *onetime_encoding TSRMLS_DC)
! 1033: {
! 1034: zend_encoding *script_encoding;
! 1035: char *name, *list;
! 1036:
! 1037: /* onetime_encoding is prior to everything */
! 1038: if (onetime_encoding != NULL) {
! 1039: return onetime_encoding;
! 1040: }
! 1041:
! 1042: if (CG(detect_unicode)) {
! 1043: /* check out bom(byte order mark) and see if containing wchars */
! 1044: script_encoding = zend_multibyte_detect_unicode(TSRMLS_C);
! 1045: if (script_encoding != NULL) {
! 1046: /* bom or wchar detection is prior to 'script_encoding' option */
! 1047: return script_encoding;
! 1048: }
! 1049: }
! 1050:
! 1051: /* if no script_encoding specified, just leave alone */
! 1052: if (!CG(script_encoding_list) || !CG(script_encoding_list_size)) {
! 1053: return NULL;
! 1054: }
! 1055:
! 1056: /* if multiple encodings specified, detect automagically */
! 1057: if (CG(script_encoding_list_size) > 1 && CG(encoding_detector)) {
! 1058: list = zend_multibyte_assemble_encoding_list(CG(script_encoding_list),
! 1059: CG(script_encoding_list_size));
! 1060: name = CG(encoding_detector)(LANG_SCNG(script_org),
! 1061: LANG_SCNG(script_org_size), list TSRMLS_CC);
! 1062: if (list) {
! 1063: efree(list);
! 1064: }
! 1065: if (name) {
! 1066: script_encoding = zend_multibyte_fetch_encoding(name);
! 1067: efree(name);
! 1068: } else {
! 1069: script_encoding = NULL;
! 1070: }
! 1071: return script_encoding;
! 1072: }
! 1073:
! 1074: return *(CG(script_encoding_list));
! 1075: }
! 1076:
! 1077:
! 1078: static zend_encoding* zend_multibyte_detect_unicode(TSRMLS_D)
! 1079: {
! 1080: zend_encoding *script_encoding = NULL;
! 1081: int bom_size;
! 1082: unsigned char *script;
! 1083:
! 1084: if (LANG_SCNG(script_org_size) < sizeof(BOM_UTF32_LE)-1) {
! 1085: return NULL;
! 1086: }
! 1087:
! 1088: /* check out BOM */
! 1089: if (!memcmp(LANG_SCNG(script_org), BOM_UTF32_BE, sizeof(BOM_UTF32_BE)-1)) {
! 1090: script_encoding = &encoding_utf32be;
! 1091: bom_size = sizeof(BOM_UTF32_BE)-1;
! 1092: } else if (!memcmp(LANG_SCNG(script_org), BOM_UTF32_LE, sizeof(BOM_UTF32_LE)-1)) {
! 1093: script_encoding = &encoding_utf32le;
! 1094: bom_size = sizeof(BOM_UTF32_LE)-1;
! 1095: } else if (!memcmp(LANG_SCNG(script_org), BOM_UTF16_BE, sizeof(BOM_UTF16_BE)-1)) {
! 1096: script_encoding = &encoding_utf16be;
! 1097: bom_size = sizeof(BOM_UTF16_BE)-1;
! 1098: } else if (!memcmp(LANG_SCNG(script_org), BOM_UTF16_LE, sizeof(BOM_UTF16_LE)-1)) {
! 1099: script_encoding = &encoding_utf16le;
! 1100: bom_size = sizeof(BOM_UTF16_LE)-1;
! 1101: } else if (!memcmp(LANG_SCNG(script_org), BOM_UTF8, sizeof(BOM_UTF8)-1)) {
! 1102: script_encoding = &encoding_utf8;
! 1103: bom_size = sizeof(BOM_UTF8)-1;
! 1104: }
! 1105:
! 1106: if (script_encoding) {
! 1107: /* remove BOM */
! 1108: script = (unsigned char*)emalloc(LANG_SCNG(script_org_size)+1-bom_size);
! 1109: memcpy(script, LANG_SCNG(script_org)+bom_size, LANG_SCNG(script_org_size)+1-bom_size);
! 1110: efree(LANG_SCNG(script_org));
! 1111: LANG_SCNG(script_org) = script;
! 1112: LANG_SCNG(script_org_size) -= bom_size;
! 1113:
! 1114: return script_encoding;
! 1115: }
! 1116:
! 1117: /* script contains NULL bytes -> auto-detection */
! 1118: if (memchr(LANG_SCNG(script_org), 0, LANG_SCNG(script_org_size))) {
! 1119: /* make best effort if BOM is missing */
! 1120: return zend_multibyte_detect_utf_encoding(LANG_SCNG(script_org), LANG_SCNG(script_org_size) TSRMLS_CC);
! 1121: }
! 1122:
! 1123: return NULL;
! 1124: }
! 1125:
! 1126: static zend_encoding *zend_multibyte_detect_utf_encoding(const unsigned char *script, size_t script_size TSRMLS_DC)
! 1127: {
! 1128: const unsigned char *p;
! 1129: int wchar_size = 2;
! 1130: int le = 0;
! 1131:
! 1132: /* utf-16 or utf-32? */
! 1133: p = script;
! 1134: while ((p-script) < script_size) {
! 1135: p = memchr(p, 0, script_size-(p-script)-2);
! 1136: if (!p) {
! 1137: break;
! 1138: }
! 1139: if (*(p+1) == '\0' && *(p+2) == '\0') {
! 1140: wchar_size = 4;
! 1141: break;
! 1142: }
! 1143:
! 1144: /* searching for UTF-32 specific byte orders, so this will do */
! 1145: p += 4;
! 1146: }
! 1147:
! 1148: /* BE or LE? */
! 1149: p = script;
! 1150: while ((p-script) < script_size) {
! 1151: if (*p == '\0' && *(p+wchar_size-1) != '\0') {
! 1152: /* BE */
! 1153: le = 0;
! 1154: break;
! 1155: } else if (*p != '\0' && *(p+wchar_size-1) == '\0') {
! 1156: /* LE* */
! 1157: le = 1;
! 1158: break;
! 1159: }
! 1160: p += wchar_size;
! 1161: }
! 1162:
! 1163: if (wchar_size == 2) {
! 1164: return le ? &encoding_utf16le : &encoding_utf16be;
! 1165: } else {
! 1166: return le ? &encoding_utf32le : &encoding_utf32be;
! 1167: }
! 1168:
! 1169: return NULL;
! 1170: }
! 1171: #endif /* ZEND_MULTIBYTE */
! 1172:
! 1173: /*
! 1174: * Local variables:
! 1175: * tab-width: 4
! 1176: * c-basic-offset: 4
! 1177: * End:
! 1178: * vim600: sw=4 ts=4 tw=78
! 1179: * vim<600: sw=4 ts=4 tw=78
! 1180: */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>