Annotation of embedaddon/php/Zend/zend_multibyte.c, revision 1.1.1.1
1.1 misho 1: /*
2: +----------------------------------------------------------------------+
3: | Zend Engine |
4: +----------------------------------------------------------------------+
5: | Copyright (c) 1998-2012 Zend Technologies Ltd. (http://www.zend.com) |
6: +----------------------------------------------------------------------+
7: | This source file is subject to version 2.00 of the Zend license, |
8: | that is bundled with this package in the file LICENSE, and is |
9: | available through the world-wide-web at |
10: | http://www.zend.com/license/2_00.txt. |
11: | If you did not receive a copy of the Zend license and are unable to |
12: | obtain it through the world-wide-web, please send a note to |
13: | license@zend.com so we can mail you a copy immediately. |
14: +----------------------------------------------------------------------+
15: | Authors: Masaki Fujimoto <fujimoto@php.net> |
16: | Rui Hirokawa <hirokawa@php.net> |
17: +----------------------------------------------------------------------+
18: */
19:
20: /* $Id: zend_multibyte.c 321634 2012-01-01 13:15:04Z felipe $ */
21:
22: #include "zend.h"
23: #include "zend_compile.h"
24: #include "zend_operators.h"
25: #include "zend_multibyte.h"
26:
27: #ifdef ZEND_MULTIBYTE
/*
 * Forward declarations for helpers defined later in this file.
 */

/* Shared conversion back end used by the script/internal encoding filters. */
static size_t zend_multibyte_encoding_filter(unsigned char **to, size_t *to_length, const char *to_encoding, const unsigned char *from, size_t from_length, const char *from_encoding TSRMLS_DC);
/* Shift_JIS filters; referenced by the encoding_sjis and encoding_sjis_win descriptors below. */
size_t sjis_input_filter(unsigned char **buf, size_t *length, const unsigned char *sjis, size_t sjis_length TSRMLS_DC);
size_t sjis_output_filter(unsigned char **buf, size_t *length, const unsigned char *sjis, size_t sjis_length TSRMLS_DC);
/* Conversion between a comma separated list of encoding names and an array of descriptors. */
static char* zend_multibyte_assemble_encoding_list(zend_encoding **encoding_list, size_t encoding_list_size);
static int zend_multibyte_parse_encoding_list(const char *encoding_list,
	size_t encoding_list_size, zend_encoding ***result, size_t *result_size);
/* Script encoding selection and Unicode (BOM / NUL byte) detection. */
static zend_encoding *zend_multibyte_find_script_encoding(zend_encoding *onetime_encoding TSRMLS_DC);
static zend_encoding *zend_multibyte_detect_unicode(TSRMLS_D);
static zend_encoding *zend_multibyte_detect_utf_encoding(const unsigned char *script, size_t script_size TSRMLS_DC);
37:
38: /*
39: * encodings
40: */
41: static const char *ucs2_aliases[] = {"ISO-10646-UCS-2", "UCS2" , "UNICODE", NULL};
42: static zend_encoding encoding_ucs2 = {
43: NULL,
44: NULL,
45: "UCS-2",
46: (const char *(*)[])&ucs2_aliases,
47: 0
48: };
49:
50: static zend_encoding encoding_ucs2be = {
51: NULL,
52: NULL,
53: "UCS-2BE",
54: NULL,
55: 0
56: };
57:
58: static zend_encoding encoding_ucs2le = {
59: NULL,
60: NULL,
61: "UCS-2LE",
62: NULL,
63: 0
64: };
65:
66: static const char *ucs4_aliases[] = {"ISO-10646-UCS-4", "UCS4", NULL};
67: static zend_encoding encoding_ucs4 = {
68: NULL,
69: NULL,
70: "UCS-4",
71: (const char *(*)[])&ucs4_aliases,
72: 0
73: };
74:
75: static zend_encoding encoding_ucs4be = {
76: NULL,
77: NULL,
78: "UCS-4BE",
79: NULL,
80: 0
81: };
82:
83: static zend_encoding encoding_ucs4le = {
84: NULL,
85: NULL,
86: "UCS-4LE",
87: NULL,
88: 0
89: };
90:
91: static const char *utf32_aliases[] = {"utf32", NULL};
92: static zend_encoding encoding_utf32 = {
93: NULL,
94: NULL,
95: "UTF-32",
96: (const char *(*)[])&utf32_aliases,
97: 0
98: };
99:
100: static zend_encoding encoding_utf32be = {
101: NULL,
102: NULL,
103: "UTF-32BE",
104: NULL,
105: 0
106: };
107:
108: static zend_encoding encoding_utf32le = {
109: NULL,
110: NULL,
111: "UTF-32LE",
112: NULL,
113: 0
114: };
115:
116: static const char *utf16_aliases[] = {"utf16", NULL};
117: static zend_encoding encoding_utf16 = {
118: NULL,
119: NULL,
120: "UTF-16",
121: (const char *(*)[])&utf16_aliases,
122: 0
123: };
124:
125: static zend_encoding encoding_utf16be = {
126: NULL,
127: NULL,
128: "UTF-16BE",
129: NULL,
130: 0
131: };
132:
133: static zend_encoding encoding_utf16le = {
134: NULL,
135: NULL,
136: "UTF-16LE",
137: NULL,
138: 0
139: };
140:
141: static const char *utf8_aliases[] = {"utf8", NULL};
142: static zend_encoding encoding_utf8 = {
143: NULL,
144: NULL,
145: "UTF-8",
146: (const char *(*)[])&utf8_aliases,
147: 1
148: };
149:
150: static const char *ascii_aliases[] = {"ANSI_X3.4-1968", "iso-ir-6", "ANSI_X3.4-1986", "ISO_646.irv:1991", "US-ASCII", "ISO646-US", "us", "IBM367", "cp367", "csASCII", NULL};
151: static zend_encoding encoding_ascii = {
152: NULL,
153: NULL,
154: "ASCII",
155: (const char *(*)[])&ascii_aliases,
156: 1
157: };
158:
159: static const char *euc_jp_aliases[] = {"EUC", "EUC_JP", "eucJP", "x-euc-jp", NULL};
160: static zend_encoding encoding_euc_jp = {
161: NULL,
162: NULL,
163: "EUC-JP",
164: (const char *(*)[])&euc_jp_aliases,
165: 1
166: };
167:
168: static const char *sjis_aliases[] = {"x-sjis", "SJIS", "SHIFT-JIS", NULL};
169: static zend_encoding encoding_sjis = {
170: sjis_input_filter,
171: sjis_output_filter,
172: "Shift_JIS",
173: (const char *(*)[])&sjis_aliases,
174: 0
175: };
176:
177: static const char *eucjp_win_aliases[] = {"eucJP-open", NULL};
178: static zend_encoding encoding_eucjp_win = {
179: NULL,
180: NULL,
181: "eucJP-win",
182: (const char *(*)[])&eucjp_win_aliases,
183: 1
184: };
185:
186: static const char *sjis_win_aliases[] = {"SJIS-open", "MS_Kanji", "Windows-31J", "CP932", NULL};
187: static zend_encoding encoding_sjis_win = {
188: /* sjis-filters does not care about diffs of Shift_JIS and CP932 */
189: sjis_input_filter,
190: sjis_output_filter,
191: "SJIS-win",
192: (const char *(*)[])&sjis_win_aliases,
193: 0
194: };
195:
196: static const char *jis_aliases[] = {"ISO-2022-JP", NULL};
197: static zend_encoding encoding_jis = {
198: NULL,
199: NULL,
200: "JIS",
201: (const char *(*)[])&jis_aliases,
202: 0
203: };
204:
205: static const char *euc_cn_aliases[] = {"CN-GB", "EUC_CN", "eucCN", "x-euc-cn", "gb2312", NULL};
206: static zend_encoding encoding_euc_cn = {
207: NULL,
208: NULL,
209: "EUC-CN",
210: (const char *(*)[])&euc_cn_aliases,
211: 1
212: };
213:
214: static const char *cp936_aliases[] = {"CP-936", NULL};
215: static zend_encoding encoding_cp936 = {
216: NULL,
217: NULL,
218: "CP936",
219: (const char *(*)[])&cp936_aliases,
220: 0
221: };
222:
223: static const char *hz_aliases[] = {"HZ-GB-2312", NULL};
224: static zend_encoding encoding_hz = {
225: NULL,
226: NULL,
227: "HZ",
228: (const char *(*)[])&hz_aliases,
229: 0
230: };
231:
232: static const char *euc_tw_aliases[] = {"EUC_TW", "eucTW", "x-euc-tw", NULL};
233: static zend_encoding encoding_euc_tw = {
234: NULL,
235: NULL,
236: "EUC-TW",
237: (const char *(*)[])&euc_tw_aliases,
238: 1
239: };
240:
241: static const char *big5_aliases[] = {"BIG5", "CN-BIG5", "BIG-FIVE", "BIGFIVE", "CP950", NULL};
242: static zend_encoding encoding_big5 = {
243: NULL,
244: NULL,
245: "BIG-5",
246: (const char *(*)[])&big5_aliases,
247: 0
248: };
249:
250: static const char *euc_kr_aliases[] = {"EUC_KR", "eucKR", "x-euc-kr", NULL};
251: static zend_encoding encoding_euc_kr = {
252: NULL,
253: NULL,
254: "EUC-KR",
255: (const char *(*)[])&euc_kr_aliases,
256: 1
257: };
258:
259: static const char *uhc_aliases[] = {"CP949", NULL};
260: static zend_encoding encoding_uhc = {
261: NULL,
262: NULL,
263: "UHC",
264: (const char *(*)[])&uhc_aliases,
265: 1
266: };
267:
268: static zend_encoding encoding_2022kr = {
269: NULL,
270: NULL,
271: "ISO-2022-KR",
272: NULL,
273: 0
274: };
275:
276: static const char *cp1252_aliases[] = {"cp1252", NULL};
277: static zend_encoding encoding_cp1252 = {
278: NULL,
279: NULL,
280: "Windows-1252",
281: (const char *(*)[])&cp1252_aliases,
282: 1
283: };
284:
285: static const char *iso_8859_1_aliases[] = {"ISO_8859-1", "latin1", NULL};
286: static zend_encoding encoding_8859_1 = {
287: NULL,
288: NULL,
289: "ISO-8859-1",
290: (const char *(*)[])&iso_8859_1_aliases,
291: 1
292: };
293:
294: static const char *iso_8859_2_aliases[] = {"ISO_8859-2", "latin2", NULL};
295: static zend_encoding encoding_8859_2 = {
296: NULL,
297: NULL,
298: "ISO-8859-2",
299: (const char *(*)[])&iso_8859_2_aliases,
300: 1
301: };
302:
303: static const char *iso_8859_3_aliases[] = {"ISO_8859-3", "latin3", NULL};
304: static zend_encoding encoding_8859_3 = {
305: NULL,
306: NULL,
307: "ISO-8859-3",
308: (const char *(*)[])&iso_8859_3_aliases,
309: 1
310: };
311:
312: static const char *iso_8859_4_aliases[] = {"ISO_8859-4", "latin4", NULL};
313: static zend_encoding encoding_8859_4 = {
314: NULL,
315: NULL,
316: "ISO-8859-4",
317: (const char *(*)[])&iso_8859_4_aliases,
318: 1
319: };
320:
321: static const char *iso_8859_5_aliases[] = {"ISO_8859-5", "cyrillic", NULL};
322: static zend_encoding encoding_8859_5 = {
323: NULL,
324: NULL,
325: "ISO-8859-5",
326: (const char *(*)[])&iso_8859_5_aliases,
327: 1
328: };
329:
330: static const char *iso_8859_6_aliases[] = {"ISO_8859-6", "arabic", NULL};
331: static zend_encoding encoding_8859_6 = {
332: NULL,
333: NULL,
334: "ISO-8859-6",
335: (const char *(*)[])&iso_8859_6_aliases,
336: 1
337: };
338:
339: static const char *iso_8859_7_aliases[] = {"ISO_8859-7", "greek", NULL};
340: static zend_encoding encoding_8859_7 = {
341: NULL,
342: NULL,
343: "ISO-8859-7",
344: (const char *(*)[])&iso_8859_7_aliases,
345: 1
346: };
347:
348: static const char *iso_8859_8_aliases[] = {"ISO_8859-8", "hebrew", NULL};
349: static zend_encoding encoding_8859_8 = {
350: NULL,
351: NULL,
352: "ISO-8859-8",
353: (const char *(*)[])&iso_8859_8_aliases,
354: 1
355: };
356:
357: static const char *iso_8859_9_aliases[] = {"ISO_8859-9", "latin5", NULL};
358: static zend_encoding encoding_8859_9 = {
359: NULL,
360: NULL,
361: "ISO-8859-9",
362: (const char *(*)[])&iso_8859_9_aliases,
363: 1
364: };
365:
366: static const char *iso_8859_10_aliases[] = {"ISO_8859-10", "latin6", NULL};
367: static zend_encoding encoding_8859_10 = {
368: NULL,
369: NULL,
370: "ISO-8859-10",
371: (const char *(*)[])&iso_8859_10_aliases,
372: 1
373: };
374:
375: static const char *iso_8859_13_aliases[] = {"ISO_8859-13", NULL};
376: static zend_encoding encoding_8859_13 = {
377: NULL,
378: NULL,
379: "ISO-8859-13",
380: (const char *(*)[])&iso_8859_13_aliases,
381: 1
382: };
383:
384: static const char *iso_8859_14_aliases[] = {"ISO_8859-14", "latin8", NULL};
385: static zend_encoding encoding_8859_14 = {
386: NULL,
387: NULL,
388: "ISO-8859-14",
389: (const char *(*)[])&iso_8859_14_aliases,
390: 1
391: };
392:
393: static const char *iso_8859_15_aliases[] = {"ISO_8859-15", NULL};
394: static zend_encoding encoding_8859_15 = {
395: NULL,
396: NULL,
397: "ISO-8859-15",
398: (const char *(*)[])&iso_8859_15_aliases,
399: 1
400: };
401:
402: static const char *cp1251_aliases[] = {"CP1251", "CP-1251", "WINDOWS-1251", NULL};
403: static zend_encoding encoding_cp1251 = {
404: NULL,
405: NULL,
406: "Windows-1251",
407: (const char *(*)[])&cp1251_aliases,
408: 1
409: };
410:
411: static const char *cp866_aliases[] = {"CP866", "CP-866", "IBM-866", NULL};
412: static zend_encoding encoding_cp866 = {
413: NULL,
414: NULL,
415: "CP866",
416: (const char *(*)[])&cp866_aliases,
417: 1
418: };
419:
420: static const char *koi8r_aliases[] = {"KOI8-R", "KOI8R", NULL};
421: static zend_encoding encoding_koi8r = {
422: NULL,
423: NULL,
424: "KOI8-R",
425: (const char *(*)[])&koi8r_aliases,
426: 1
427: };
428:
429: static const char *koi8u_aliases[] = {"KOI8-U", "KOI8U", NULL};
430: static zend_encoding encoding_koi8u = {
431: NULL,
432: NULL,
433: "KOI8-U",
434: (const char *(*)[])&koi8u_aliases,
435: 1
436: };
437:
438: static const char *cp1254_aliases[] = {"cp1254", NULL};
439: static zend_encoding encoding_cp1254 = {
440: NULL,
441: NULL,
442: "Windows-1254",
443: (const char *(*)[])&cp1254_aliases,
444: 1
445: };
446:
447: static const char *armscii8_aliases[] = { "ArmSCII8", "ARMSCII-8", "ARMSCII8", NULL};
448: static zend_encoding encoding_armscii8 = {
449: NULL,
450: NULL,
451: "ArmSCII-8",
452: (const char *(*)[])&armscii8_aliases,
453: 1
454: };
455:
456: static const char *cp850_aliases[] = {"IBM850", NULL};
457: static zend_encoding encoding_cp850 = {
458: NULL,
459: NULL,
460: "CP850",
461: (const char *(*)[])&cp850_aliases,
462: 1
463: };
464:
/*
 * NULL-terminated registry of every encoding descriptor defined above.
 * zend_multibyte_fetch_encoding() walks it front to back and returns the
 * first match, so the order decides which entry wins when names collide.
 */
static zend_encoding *zend_encoding_table[] = {
	/* Unicode */
	&encoding_ucs4,
	&encoding_ucs4be,
	&encoding_ucs4le,
	&encoding_ucs2,
	&encoding_ucs2be,
	&encoding_ucs2le,
	&encoding_utf32,
	&encoding_utf32be,
	&encoding_utf32le,
	&encoding_utf16,
	&encoding_utf16be,
	&encoding_utf16le,
	&encoding_utf8,
	/* ASCII and Japanese */
	&encoding_ascii,
	&encoding_euc_jp,
	&encoding_sjis,
	&encoding_eucjp_win,
	&encoding_sjis_win,
	&encoding_jis,
	/* Windows-1252 and ISO-8859-x */
	&encoding_cp1252,
	&encoding_8859_1,
	&encoding_8859_2,
	&encoding_8859_3,
	&encoding_8859_4,
	&encoding_8859_5,
	&encoding_8859_6,
	&encoding_8859_7,
	&encoding_8859_8,
	&encoding_8859_9,
	&encoding_8859_10,
	&encoding_8859_13,
	&encoding_8859_14,
	&encoding_8859_15,
	/* Chinese and Korean */
	&encoding_euc_cn,
	&encoding_cp936,
	&encoding_hz,
	&encoding_euc_tw,
	&encoding_big5,
	&encoding_euc_kr,
	&encoding_uhc,
	&encoding_2022kr,
	/* Cyrillic and others */
	&encoding_cp1251,
	&encoding_cp866,
	&encoding_koi8r,
	&encoding_koi8u,
	&encoding_armscii8,
	&encoding_cp1254,
	&encoding_cp850,
	NULL
};
516:
517:
518:
519: ZEND_API int zend_multibyte_set_script_encoding(const char *encoding_list,
520: size_t encoding_list_size TSRMLS_DC)
521: {
522: if (CG(script_encoding_list)) {
523: efree(CG(script_encoding_list));
524: CG(script_encoding_list) = NULL;
525: }
526: CG(script_encoding_list_size) = 0;
527:
528: if (!encoding_list) {
529: return 0;
530: }
531:
532: zend_multibyte_parse_encoding_list(encoding_list, encoding_list_size, &(CG(script_encoding_list)), &(CG(script_encoding_list_size)));
533:
534: return 0;
535: }
536:
537:
538: ZEND_API int zend_multibyte_set_internal_encoding(const char *encoding_name TSRMLS_DC)
539: {
540: CG(internal_encoding) = zend_multibyte_fetch_encoding(encoding_name);
541: return 0;
542: }
543:
544: ZEND_API int zend_multibyte_set_functions(zend_encoding_detector encoding_detector, zend_encoding_converter encoding_converter, zend_encoding_oddlen encoding_oddlen TSRMLS_DC)
545: {
546: CG(encoding_detector) = encoding_detector;
547: CG(encoding_converter) = encoding_converter;
548: CG(encoding_oddlen) = encoding_oddlen;
549: return 0;
550: }
551:
552:
553: ZEND_API int zend_multibyte_set_filter(zend_encoding *onetime_encoding TSRMLS_DC)
554: {
555: LANG_SCNG(script_encoding) = zend_multibyte_find_script_encoding(onetime_encoding TSRMLS_CC);
556: LANG_SCNG(internal_encoding) = CG(internal_encoding);
557:
558: /* judge input/output filter */
559: LANG_SCNG(input_filter) = NULL;
560: LANG_SCNG(output_filter) = NULL;
561:
562: if (!LANG_SCNG(script_encoding)) {
563: return 0;
564: }
565:
566: if (!LANG_SCNG(internal_encoding) || LANG_SCNG(script_encoding) == LANG_SCNG(internal_encoding)) {
567: /* if encoding specfic filters exist, use them */
568: if (LANG_SCNG(script_encoding)->input_filter && LANG_SCNG(script_encoding)->output_filter) {
569: LANG_SCNG(input_filter) = LANG_SCNG(script_encoding)->input_filter;
570: LANG_SCNG(output_filter) = LANG_SCNG(script_encoding)->output_filter;
571: return 0;
572: }
573:
574: if (!LANG_SCNG(script_encoding)->compatible) {
575: /* and if not, work around w/ script_encoding -> utf-8 -> script_encoding conversion */
576: LANG_SCNG(internal_encoding) = LANG_SCNG(script_encoding);
577: LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter;
578: LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter;
579: return 0;
580: } else {
581: /* nothing to do in this case */
582: return 0;
583: }
584: }
585:
586: /* LANG_SCNG(internal_encoding) cannot be NULL here */
587: if (LANG_SCNG(internal_encoding)->compatible) {
588: LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter;
589: return 0;
590: } else if (LANG_SCNG(script_encoding)->compatible) {
591: LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter;
592: return 0;
593: }
594:
595: /* both script and internal encodings are incompatible w/ flex */
596: LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter;
597: LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter;
598:
599: return 0;
600: }
601:
602:
603: ZEND_API zend_encoding* zend_multibyte_fetch_encoding(const char *encoding_name)
604: {
605: int i, j;
606: zend_encoding *encoding;
607:
608: if (!encoding_name) {
609: return NULL;
610: }
611:
612: for (i = 0; (encoding = zend_encoding_table[i]) != NULL; i++) {
613: if (zend_binary_strcasecmp(encoding->name, strlen(encoding->name), encoding_name, strlen(encoding_name)) == 0) {
614: return encoding;
615: }
616: }
617:
618: for (i = 0; (encoding = zend_encoding_table[i]) != NULL; i++) {
619: if (encoding->aliases != NULL) {
620: for (j = 0; (*encoding->aliases)[j] != NULL; j++) {
621: if (zend_binary_strcasecmp((*encoding->aliases)[j], strlen((*encoding->aliases)[j]), encoding_name, strlen(encoding_name)) == 0) {
622: return encoding;
623: }
624: }
625: }
626: }
627:
628: return NULL;
629: }
630:
631:
632: ZEND_API size_t zend_multibyte_script_encoding_filter(unsigned char **to, size_t
633: *to_length, const unsigned char *from, size_t from_length TSRMLS_DC)
634: {
635: const char *name;
636:
637: if (LANG_SCNG(internal_encoding) == NULL || LANG_SCNG(internal_encoding)->compatible == 0) {
638: name = "UTF-8";
639: } else {
640: name = LANG_SCNG(internal_encoding)->name;
641: }
642:
643: return zend_multibyte_encoding_filter(to, to_length, name, from, from_length, LANG_SCNG(script_encoding)->name TSRMLS_CC);
644: }
645:
646: ZEND_API size_t zend_multibyte_internal_encoding_filter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length TSRMLS_DC)
647: {
648: const char *name;
649:
650: if (LANG_SCNG(script_encoding)->compatible == 0) {
651: name = "UTF-8";
652: } else {
653: name = LANG_SCNG(script_encoding)->name;
654: }
655:
656: return zend_multibyte_encoding_filter(to, to_length, LANG_SCNG(internal_encoding)->name, from, from_length, name TSRMLS_CC);
657: }
658:
659: static size_t zend_multibyte_encoding_filter(unsigned char **to, size_t *to_length, const char *to_encoding, const unsigned char *from, size_t from_length, const char *from_encoding TSRMLS_DC)
660: {
661: size_t oddlen;
662:
663: if (!CG(encoding_converter)) {
664: return 0;
665: }
666:
667: if (CG(encoding_oddlen)) {
668: oddlen = CG(encoding_oddlen)(from, from_length, from_encoding TSRMLS_CC);
669: if (oddlen > 0) {
670: from_length -= oddlen;
671: }
672: }
673:
674: if (CG(encoding_converter)(to, to_length, from, from_length, to_encoding, from_encoding TSRMLS_CC) != 0) {
675: return 0;
676: }
677:
678: return from_length;
679: }
680:
681:
682: /*
683: * Shift_JIS Input/Output Filter
684: */
/*
 * Byte classification used by sjis_input_filter(), indexed by the byte
 * value (one row per high nibble, 0x00 first).  As consumed by that filter:
 *   1       - emitted as a single-byte kana (only consulted for bytes >= 0x80)
 *   2       - lead byte of a 2 byte kanji code
 *   other   - (3, 4, 5 and 0 in the last row) handled by the "user defined
 *             chars" branch
 */
static const unsigned char table_sjis[] = { /* 0x80-0x9f,0xE0-0xEF */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 0, 0, 0
};
703:
704: size_t sjis_input_filter(unsigned char **buf, size_t *length, const unsigned char *sjis, size_t sjis_length TSRMLS_DC)
705: {
706: const unsigned char *p;
707: unsigned char *q;
708: unsigned char c1, c2;
709:
710: *buf = (unsigned char*)emalloc(sjis_length * 3 / 2 + 1);
711: if (!*buf)
712: return 0;
713: *length = 0;
714:
715: p = sjis;
716: q = *buf;
717:
718: /* convert [SJIS -> EUC-JP] (for lex scan) -- some other better ways? */
719: while (*p && (p - sjis) < sjis_length) {
720: if (!(*p & 0x80)) {
721: *q++ = *p++;
722: continue;
723: }
724:
725: /* handling 8 bit code */
726: if (table_sjis[*p] == 1) {
727: /* 1 byte kana */
728: *q++ = 0x8e;
729: *q++ = *p++;
730: continue;
731: }
732:
733: if (!*(p+1)) {
734: *q++ = *p++;
735: break;
736: }
737:
738: if (table_sjis[*p] == 2) {
739: /* 2 byte kanji code */
740: c1 = *p++;
741: if (!*p || (p - sjis) >= sjis_length) {
742: break;
743: }
744: c2 = *p++;
745: c1 -= (c1 <= 0x9f) ? 0x71 : 0xb1;
746: c1 = (c1 << 1) + 1;
747: if (c2 >= 0x9e) {
748: c2 -= 0x7e;
749: c1++;
750: } else if (c2 > 0x7f) {
751: c2 -= 0x20;
752: } else {
753: c2 -= 0x1f;
754: }
755:
756: c1 |= 0x80;
757: c2 |= 0x80;
758:
759: *q++ = c1;
760: *q++ = c2;
761: } else {
762: /*
763: * for user defined chars (ATTENTION)
764: *
765: * THESE ARE NOT CODE FOR CONVERSION! :-P
766: * (using *ILLEGALLY* 3byte EUC-JP space)
767: *
768: * we cannot perfectly (== 1 to 1) convert these chars to EUC-JP.
769: * so, these code are for perfect RESTORING in sjis_output_filter()
770: */
771: c1 = *p++;
772: if (!*p || (p - sjis) >= sjis_length) {
773: break;
774: }
775: c2 = *p++;
776: *q++ = 0x8f;
777: /*
778: * MAP TO (EUC-JP):
779: * type A: 0xeba1 - 0xf4fe
780: * type B: 0xf5a1 - 0xfefe
781: * type C: 0xa1a1 - 0xa6fe
782: */
783: c1 -= (c1 > 0xf9) ? (0x79+0x71) : (0x0a+0xb1);
784: c1 = (c1 << 1) + 1;
785: if (c2 >= 0x9e) {
786: c2 -= 0x7e;
787: c1++;
788: } else if (c2 > 0x7f) {
789: c2 -= 0x20;
790: } else {
791: c2 -= 0x1f;
792: }
793:
794: c1 |= 0x80;
795: c2 |= 0x80;
796:
797: *q++ = c1;
798: *q++ = c2;
799: }
800: }
801: *q = '\0';
802: *length = q - *buf;
803:
804: return *length;
805: }
806:
/*
 * Byte classification used by sjis_output_filter(), indexed by the byte
 * value (one row per high nibble, 0x00 first).  The filter only tests for
 * the value 2, which marks the first byte of a 2 byte kanji code; 0x8e and
 * 0x8f are recognised by explicit comparisons in the filter itself.
 */
static const unsigned char table_eucjp[] = { /* 0xA1-0xFE */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
825:
826: size_t sjis_output_filter(unsigned char **sjis, size_t *sjis_length, const unsigned char *buf, size_t length TSRMLS_DC)
827: {
828: unsigned char c1, c2;
829: unsigned char *p;
830: const unsigned char *q;
831:
832: if (!sjis || !sjis_length) {
833: return 0;
834: }
835:
836: /* always Shift_JIS <= EUC-JP */
837: *sjis = (unsigned char*)emalloc(length+1);
838: if (!sjis) {
839: return 0;
840: }
841: p = *sjis;
842: q = buf;
843:
844: /* restore converted strings [EUC-JP -> Shift_JIS] */
845: while (*q && (q - buf) < length) {
846: if (!(*q & 0x80)) {
847: *p++ = *q++;
848: continue;
849: }
850:
851: /* hankaku kana */
852: if (*q == 0x8e) {
853: q++;
854: if (*q) {
855: *p++ = *q++;
856: }
857: continue;
858: }
859:
860: /* 2 byte kanji code */
861: if (table_eucjp[*q] == 2) {
862: c1 = (*q++ & ~0x80) & 0xff;
863: if (*q) {
864: c2 = (*q++ & ~0x80) & 0xff;
865: } else {
866: q--;
867: break;
868: }
869:
870: c2 += (c1 & 0x01) ? 0x1f : 0x7d;
871: if (c2 >= 0x7f) {
872: c2++;
873: }
874: c1 = ((c1 - 0x21) >> 1) + 0x81;
875: if (c1 > 0x9f) {
876: c1 += 0x40;
877: }
878:
879: *p++ = c1;
880: *p++ = c2;
881: continue;
882: }
883:
884: if (*q == 0x8f) {
885: q++;
886: if (*q) {
887: c1 = (*q++ & ~0x80) & 0xff;
888: } else {
889: q--;
890: break;
891: }
892: if (*q) {
893: c2 = (*q++ & ~0x80) & 0xff;
894: } else {
895: q -= 2;
896: break;
897: }
898:
899: c2 += (c1 & 0x01) ? 0x1f : 0x7d;
900: if (c2 >= 0x7f) {
901: c2++;
902: }
903: c1 = ((c1 - 0x21) >> 1) + 0x81;
904: if (c1 > 0x9f) {
905: c1 += 0x40;
906: }
907:
908: if (c1 >= 0x81 && c1 <= 0x9f) {
909: c1 += 0x79;
910: } else {
911: c1 += 0x0a;
912: }
913:
914: *p++ = c1;
915: *p++ = c2;
916: continue;
917: }
918:
919: /* some other chars (may not happen) */
920: *p++ = *q++;
921: }
922: *p = '\0';
923: *sjis_length = p - *sjis;
924:
925: return q-buf; /* return length we actually read */
926: }
927:
928:
929: static char *zend_multibyte_assemble_encoding_list(zend_encoding **encoding_list, size_t encoding_list_size)
930: {
931: int i, list_size = 0;
932: const char *name;
933: char *list = NULL;
934:
935: if (!encoding_list || !encoding_list_size) {
936: return NULL;
937: }
938:
939: for (i = 0; i < encoding_list_size; i++) {
940: name = (*(encoding_list+i))->name;
941: if (name) {
942: list_size += strlen(name) + 1;
943: if (!list) {
944: list = (char*)emalloc(list_size);
945: if (!list) {
946: return NULL;
947: }
948: *list = '\0';
949: } else {
950: list = (char*)erealloc(list, list_size);
951: if (!list) {
952: return NULL;
953: }
954: strcat(list, ",");
955: }
956: strcat(list, name);
957: }
958: }
959: return list;
960: }
961:
962:
963: static int zend_multibyte_parse_encoding_list(const char *encoding_list,
964: size_t encoding_list_size, zend_encoding ***result, size_t *result_size)
965: {
966: int n, size;
967: char *p, *p1, *p2, *endp, *tmpstr;
968: zend_encoding **list, **entry, *encoding;
969:
970: list = NULL;
971: if (encoding_list == NULL || encoding_list_size <= 0) {
972: return -1;
973: } else {
974: /* copy the encoding_list string for work */
975: tmpstr = (char *)estrndup(encoding_list, encoding_list_size);
976: if (tmpstr == NULL) {
977: return -1;
978: }
979: /* count the number of listed encoding names */
980: endp = tmpstr + encoding_list_size;
981: n = 1;
982: p1 = tmpstr;
983: while ((p2 = zend_memnstr(p1, ",", 1, endp)) != NULL) {
984: p1 = p2 + 1;
985: n++;
986: }
987: size = n;
988: /* make list */
989: list = (zend_encoding**)ecalloc(size, sizeof(zend_encoding*));
990: if (list != NULL) {
991: entry = list;
992: n = 0;
993: p1 = tmpstr;
994: do {
995: p2 = p = zend_memnstr(p1, ",", 1, endp);
996: if (p == NULL) {
997: p = endp;
998: }
999: *p = '\0';
1000: /* trim spaces */
1001: while (p1 < p && (*p1 == ' ' || *p1 == '\t')) {
1002: p1++;
1003: }
1004: p--;
1005: while (p > p1 && (*p == ' ' || *p == '\t')) {
1006: *p = '\0';
1007: p--;
1008: }
1009: /* convert to the encoding number and check encoding */
1010: encoding = zend_multibyte_fetch_encoding(p1);
1011: if (encoding)
1012: {
1013: *entry++ = encoding;
1014: n++;
1015: }
1016: p1 = p2 + 1;
1017: } while (n < size && p2 != NULL);
1018: *result = list;
1019: *result_size = n;
1020: }
1021: efree(tmpstr);
1022: }
1023:
1024: if (list == NULL) {
1025: return -1;
1026: }
1027:
1028: return 0;
1029: }
1030:
1031:
1032: static zend_encoding* zend_multibyte_find_script_encoding(zend_encoding *onetime_encoding TSRMLS_DC)
1033: {
1034: zend_encoding *script_encoding;
1035: char *name, *list;
1036:
1037: /* onetime_encoding is prior to everything */
1038: if (onetime_encoding != NULL) {
1039: return onetime_encoding;
1040: }
1041:
1042: if (CG(detect_unicode)) {
1043: /* check out bom(byte order mark) and see if containing wchars */
1044: script_encoding = zend_multibyte_detect_unicode(TSRMLS_C);
1045: if (script_encoding != NULL) {
1046: /* bom or wchar detection is prior to 'script_encoding' option */
1047: return script_encoding;
1048: }
1049: }
1050:
1051: /* if no script_encoding specified, just leave alone */
1052: if (!CG(script_encoding_list) || !CG(script_encoding_list_size)) {
1053: return NULL;
1054: }
1055:
1056: /* if multiple encodings specified, detect automagically */
1057: if (CG(script_encoding_list_size) > 1 && CG(encoding_detector)) {
1058: list = zend_multibyte_assemble_encoding_list(CG(script_encoding_list),
1059: CG(script_encoding_list_size));
1060: name = CG(encoding_detector)(LANG_SCNG(script_org),
1061: LANG_SCNG(script_org_size), list TSRMLS_CC);
1062: if (list) {
1063: efree(list);
1064: }
1065: if (name) {
1066: script_encoding = zend_multibyte_fetch_encoding(name);
1067: efree(name);
1068: } else {
1069: script_encoding = NULL;
1070: }
1071: return script_encoding;
1072: }
1073:
1074: return *(CG(script_encoding_list));
1075: }
1076:
1077:
1078: static zend_encoding* zend_multibyte_detect_unicode(TSRMLS_D)
1079: {
1080: zend_encoding *script_encoding = NULL;
1081: int bom_size;
1082: unsigned char *script;
1083:
1084: if (LANG_SCNG(script_org_size) < sizeof(BOM_UTF32_LE)-1) {
1085: return NULL;
1086: }
1087:
1088: /* check out BOM */
1089: if (!memcmp(LANG_SCNG(script_org), BOM_UTF32_BE, sizeof(BOM_UTF32_BE)-1)) {
1090: script_encoding = &encoding_utf32be;
1091: bom_size = sizeof(BOM_UTF32_BE)-1;
1092: } else if (!memcmp(LANG_SCNG(script_org), BOM_UTF32_LE, sizeof(BOM_UTF32_LE)-1)) {
1093: script_encoding = &encoding_utf32le;
1094: bom_size = sizeof(BOM_UTF32_LE)-1;
1095: } else if (!memcmp(LANG_SCNG(script_org), BOM_UTF16_BE, sizeof(BOM_UTF16_BE)-1)) {
1096: script_encoding = &encoding_utf16be;
1097: bom_size = sizeof(BOM_UTF16_BE)-1;
1098: } else if (!memcmp(LANG_SCNG(script_org), BOM_UTF16_LE, sizeof(BOM_UTF16_LE)-1)) {
1099: script_encoding = &encoding_utf16le;
1100: bom_size = sizeof(BOM_UTF16_LE)-1;
1101: } else if (!memcmp(LANG_SCNG(script_org), BOM_UTF8, sizeof(BOM_UTF8)-1)) {
1102: script_encoding = &encoding_utf8;
1103: bom_size = sizeof(BOM_UTF8)-1;
1104: }
1105:
1106: if (script_encoding) {
1107: /* remove BOM */
1108: script = (unsigned char*)emalloc(LANG_SCNG(script_org_size)+1-bom_size);
1109: memcpy(script, LANG_SCNG(script_org)+bom_size, LANG_SCNG(script_org_size)+1-bom_size);
1110: efree(LANG_SCNG(script_org));
1111: LANG_SCNG(script_org) = script;
1112: LANG_SCNG(script_org_size) -= bom_size;
1113:
1114: return script_encoding;
1115: }
1116:
1117: /* script contains NULL bytes -> auto-detection */
1118: if (memchr(LANG_SCNG(script_org), 0, LANG_SCNG(script_org_size))) {
1119: /* make best effort if BOM is missing */
1120: return zend_multibyte_detect_utf_encoding(LANG_SCNG(script_org), LANG_SCNG(script_org_size) TSRMLS_CC);
1121: }
1122:
1123: return NULL;
1124: }
1125:
1126: static zend_encoding *zend_multibyte_detect_utf_encoding(const unsigned char *script, size_t script_size TSRMLS_DC)
1127: {
1128: const unsigned char *p;
1129: int wchar_size = 2;
1130: int le = 0;
1131:
1132: /* utf-16 or utf-32? */
1133: p = script;
1134: while ((p-script) < script_size) {
1135: p = memchr(p, 0, script_size-(p-script)-2);
1136: if (!p) {
1137: break;
1138: }
1139: if (*(p+1) == '\0' && *(p+2) == '\0') {
1140: wchar_size = 4;
1141: break;
1142: }
1143:
1144: /* searching for UTF-32 specific byte orders, so this will do */
1145: p += 4;
1146: }
1147:
1148: /* BE or LE? */
1149: p = script;
1150: while ((p-script) < script_size) {
1151: if (*p == '\0' && *(p+wchar_size-1) != '\0') {
1152: /* BE */
1153: le = 0;
1154: break;
1155: } else if (*p != '\0' && *(p+wchar_size-1) == '\0') {
1156: /* LE* */
1157: le = 1;
1158: break;
1159: }
1160: p += wchar_size;
1161: }
1162:
1163: if (wchar_size == 2) {
1164: return le ? &encoding_utf16le : &encoding_utf16be;
1165: } else {
1166: return le ? &encoding_utf32le : &encoding_utf32be;
1167: }
1168:
1169: return NULL;
1170: }
1171: #endif /* ZEND_MULTIBYTE */
1172:
1173: /*
1174: * Local variables:
1175: * tab-width: 4
1176: * c-basic-offset: 4
1177: * End:
1178: * vim600: sw=4 ts=4 tw=78
1179: * vim<600: sw=4 ts=4 tw=78
1180: */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>