Annotation of embedaddon/php/ext/mbstring/oniguruma/enc/gb18030.c, revision 1.1.1.1
1.1 misho 1: /**********************************************************************
2: gb18030.c - Oniguruma (regular expression library)
3: **********************************************************************/
4: /*-
5: * Copyright (c) 2005 KUBO Takehiro <kubo AT jiubao DOT org>
6: * K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
7: * All rights reserved.
8: *
9: * Redistribution and use in source and binary forms, with or without
10: * modification, are permitted provided that the following conditions
11: * are met:
12: * 1. Redistributions of source code must retain the above copyright
13: * notice, this list of conditions and the following disclaimer.
14: * 2. Redistributions in binary form must reproduce the above copyright
15: * notice, this list of conditions and the following disclaimer in the
16: * documentation and/or other materials provided with the distribution.
17: *
18: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28: * SUCH DAMAGE.
29: */
30:
31: #include "regenc.h"
32:
/*
 * Trace hook for the GB18030 scanner.  It takes ONE parenthesised argument
 * list, e.g. DEBUG_GB18030(("state %d\n", state)).  Tracing is compiled out;
 * change the condition below to 1 to forward the argument list to printf.
 */
#if 0
#define DEBUG_GB18030(arg) printf arg
#else
#define DEBUG_GB18030(arg)
#endif
38:
/* Byte classes stored in GB18030_MAP: the roles a single byte value can play. */
enum {
  C1 = 0, /* one-byte char */
  C2 = 1, /* one-byte or second of two-byte char */
  C4 = 2, /* one-byte or second or fourth of four-byte char */
  CM = 3  /* first of two- or four-byte char or second of two-byte char */
};
45:
46: static const char GB18030_MAP[] = {
47: C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
48: C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
49: C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
50: C4, C4, C4, C4, C4, C4, C4, C4, C4, C4, C1, C1, C1, C1, C1, C1,
51: C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
52: C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
53: C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
54: C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C1,
55: C2, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
56: CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
57: CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
58: CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
59: CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
60: CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
61: CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
62: CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, C1
63: };
64:
65: static int
66: gb18030_mbc_enc_len(const UChar* p)
67: {
68: if (GB18030_MAP[*p] != CM)
69: return 1;
70: p++;
71: if (GB18030_MAP[*p] == C4)
72: return 4;
73: if (GB18030_MAP[*p] == C1)
74: return 1; /* illegal sequence */
75: return 2;
76: }
77:
78: static OnigCodePoint
79: gb18030_mbc_to_code(const UChar* p, const UChar* end)
80: {
81: return onigenc_mbn_mbc_to_code(ONIG_ENCODING_GB18030, p, end);
82: }
83:
84: static int
85: gb18030_code_to_mbc(OnigCodePoint code, UChar *buf)
86: {
87: return onigenc_mb4_code_to_mbc(ONIG_ENCODING_GB18030, code, buf);
88: }
89:
90: static int
91: gb18030_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end,
92: UChar* lower)
93: {
94: return onigenc_mbn_mbc_to_normalize(ONIG_ENCODING_GB18030, flag,
95: pp, end, lower);
96: }
97:
98: static int
99: gb18030_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end)
100: {
101: return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_GB18030, flag, pp, end);
102: }
103:
104: static int
105: gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype)
106: {
107: return onigenc_mb4_is_code_ctype(ONIG_ENCODING_GB18030, code, ctype);
108: }
109:
/*
 * States of the backward scan in gb18030_left_adjust_char_head().
 * The comment beside each state lists the byte-class pattern(s) it stands
 * for, written in memory order (the rightmost class is the byte at s).
 */
enum state {
  S_START,              /* nothing examined yet */
  S_one_C2,             /* C2 */
  S_one_C4,             /* C4 */
  S_one_CM,             /* CM */

  S_odd_CM_one_CX,      /* CM C2 */ /* CM CM */ /* CM CM CM C4 */
  S_even_CM_one_CX,     /* CM CM C2 */ /* CM CM CM */ /* CM CM C4 */

  /* CMC4 : pair of "CM C4" */
  S_one_CMC4,           /* CM C4 */
  S_odd_CMC4,           /* CM C4 CM C4 CM C4 */
  S_one_C4_odd_CMC4,    /* C4 CM C4 */
  S_even_CMC4,          /* CM C4 CM C4 */
  S_one_C4_even_CMC4,   /* C4 CM C4 CM C4 */

  S_odd_CM_odd_CMC4,    /* CM CM C4 CM C4 CM C4 */
  S_even_CM_odd_CMC4,   /* CM CM CM C4 CM C4 CM C4 */

  S_odd_CM_even_CMC4,   /* CM CM C4 CM C4 */
  S_even_CM_even_CMC4,  /* CM CM CM C4 CM C4 */

  /* C4CM : pair of "C4 CM" */
  S_odd_C4CM,           /* C4 CM */ /* C4 CM C4 CM C4 CM */
  S_one_CM_odd_C4CM,    /* CM C4 CM */ /* CM C4 CM C4 CM C4 CM */
  S_even_C4CM,          /* C4 CM C4 CM */
  S_one_CM_even_C4CM,   /* CM C4 CM C4 CM */

  S_even_CM_odd_C4CM,   /* CM CM C4 CM */
  S_odd_CM_odd_C4CM,    /* CM CM CM C4 CM */
  S_even_CM_even_C4CM,  /* CM CM C4 CM C4 CM */
  S_odd_CM_even_C4CM    /* CM CM CM C4 CM C4 CM */
};
143:
144: static UChar*
145: gb18030_left_adjust_char_head(const UChar* start, const UChar* s)
146: {
147: const UChar *p;
148: enum state state = S_START;
149:
150: DEBUG_GB18030(("----------------\n"));
151: for (p = s; p >= start; p--) {
152: DEBUG_GB18030(("state %d --(%02x)-->\n", state, *p));
153: switch (state) {
154: case S_START:
155: switch (GB18030_MAP[*p]) {
156: case C1:
157: return (UChar *)s;
158: case C2:
159: state = S_one_C2; /* C2 */
160: break;
161: case C4:
162: state = S_one_C4; /* C4 */
163: break;
164: case CM:
165: state = S_one_CM; /* CM */
166: break;
167: }
168: break;
169: case S_one_C2: /* C2 */
170: switch (GB18030_MAP[*p]) {
171: case C1:
172: case C2:
173: case C4:
174: return (UChar *)s;
175: case CM:
176: state = S_odd_CM_one_CX; /* CM C2 */
177: break;
178: }
179: break;
180: case S_one_C4: /* C4 */
181: switch (GB18030_MAP[*p]) {
182: case C1:
183: case C2:
184: case C4:
185: return (UChar *)s;
186: case CM:
187: state = S_one_CMC4;
188: break;
189: }
190: break;
191: case S_one_CM: /* CM */
192: switch (GB18030_MAP[*p]) {
193: case C1:
194: case C2:
195: return (UChar *)s;
196: case C4:
197: state = S_odd_C4CM;
198: break;
199: case CM:
200: state = S_odd_CM_one_CX; /* CM CM */
201: break;
202: }
203: break;
204:
205: case S_odd_CM_one_CX: /* CM C2 */ /* CM CM */ /* CM CM CM C4 */
206: switch (GB18030_MAP[*p]) {
207: case C1:
208: case C2:
209: case C4:
210: return (UChar *)(s - 1);
211: case CM:
212: state = S_even_CM_one_CX;
213: break;
214: }
215: break;
216: case S_even_CM_one_CX: /* CM CM C2 */ /* CM CM CM */ /* CM CM C4 */
217: switch (GB18030_MAP[*p]) {
218: case C1:
219: case C2:
220: case C4:
221: return (UChar *)s;
222: case CM:
223: state = S_odd_CM_one_CX;
224: break;
225: }
226: break;
227:
228: case S_one_CMC4: /* CM C4 */
229: switch (GB18030_MAP[*p]) {
230: case C1:
231: case C2:
232: return (UChar *)(s - 1);
233: case C4:
234: state = S_one_C4_odd_CMC4; /* C4 CM C4 */
235: break;
236: case CM:
237: state = S_even_CM_one_CX; /* CM CM C4 */
238: break;
239: }
240: break;
241: case S_odd_CMC4: /* CM C4 CM C4 CM C4 */
242: switch (GB18030_MAP[*p]) {
243: case C1:
244: case C2:
245: return (UChar *)(s - 1);
246: case C4:
247: state = S_one_C4_odd_CMC4;
248: break;
249: case CM:
250: state = S_odd_CM_odd_CMC4;
251: break;
252: }
253: break;
254: case S_one_C4_odd_CMC4: /* C4 CM C4 */
255: switch (GB18030_MAP[*p]) {
256: case C1:
257: case C2:
258: case C4:
259: return (UChar *)(s - 1);
260: case CM:
261: state = S_even_CMC4; /* CM C4 CM C4 */
262: break;
263: }
264: break;
265: case S_even_CMC4: /* CM C4 CM C4 */
266: switch (GB18030_MAP[*p]) {
267: case C1:
268: case C2:
269: return (UChar *)(s - 3);
270: case C4:
271: state = S_one_C4_even_CMC4;
272: break;
273: case CM:
274: state = S_odd_CM_even_CMC4;
275: break;
276: }
277: break;
278: case S_one_C4_even_CMC4: /* C4 CM C4 CM C4 */
279: switch (GB18030_MAP[*p]) {
280: case C1:
281: case C2:
282: case C4:
283: return (UChar *)(s - 3);
284: case CM:
285: state = S_odd_CMC4;
286: break;
287: }
288: break;
289:
290: case S_odd_CM_odd_CMC4: /* CM CM C4 CM C4 CM C4 */
291: switch (GB18030_MAP[*p]) {
292: case C1:
293: case C2:
294: case C4:
295: return (UChar *)(s - 3);
296: case CM:
297: state = S_even_CM_odd_CMC4;
298: break;
299: }
300: break;
301: case S_even_CM_odd_CMC4: /* CM CM CM C4 CM C4 CM C4 */
302: switch (GB18030_MAP[*p]) {
303: case C1:
304: case C2:
305: case C4:
306: return (UChar *)(s - 1);
307: case CM:
308: state = S_odd_CM_odd_CMC4;
309: break;
310: }
311: break;
312:
313: case S_odd_CM_even_CMC4: /* CM CM C4 CM C4 */
314: switch (GB18030_MAP[*p]) {
315: case C1:
316: case C2:
317: case C4:
318: return (UChar *)(s - 1);
319: case CM:
320: state = S_even_CM_even_CMC4;
321: break;
322: }
323: break;
324: case S_even_CM_even_CMC4: /* CM CM CM C4 CM C4 */
325: switch (GB18030_MAP[*p]) {
326: case C1:
327: case C2:
328: case C4:
329: return (UChar *)(s - 3);
330: case CM:
331: state = S_odd_CM_even_CMC4;
332: break;
333: }
334: break;
335:
336: case S_odd_C4CM: /* C4 CM */ /* C4 CM C4 CM C4 CM*/
337: switch (GB18030_MAP[*p]) {
338: case C1:
339: case C2:
340: case C4:
341: return (UChar *)s;
342: case CM:
343: state = S_one_CM_odd_C4CM; /* CM C4 CM */
344: break;
345: }
346: break;
347: case S_one_CM_odd_C4CM: /* CM C4 CM */ /* CM C4 CM C4 CM C4 CM */
348: switch (GB18030_MAP[*p]) {
349: case C1:
350: case C2:
351: return (UChar *)(s - 2); /* |CM C4 CM */
352: case C4:
353: state = S_even_C4CM;
354: break;
355: case CM:
356: state = S_even_CM_odd_C4CM;
357: break;
358: }
359: break;
360: case S_even_C4CM: /* C4 CM C4 CM */
361: switch (GB18030_MAP[*p]) {
362: case C1:
363: case C2:
364: case C4:
365: return (UChar *)(s - 2); /* C4|CM C4 CM */
366: case CM:
367: state = S_one_CM_even_C4CM;
368: break;
369: }
370: break;
371: case S_one_CM_even_C4CM: /* CM C4 CM C4 CM */
372: switch (GB18030_MAP[*p]) {
373: case C1:
374: case C2:
375: return (UChar *)(s - 0); /*|CM C4 CM C4|CM */
376: case C4:
377: state = S_odd_C4CM;
378: break;
379: case CM:
380: state = S_even_CM_even_C4CM;
381: break;
382: }
383: break;
384:
385: case S_even_CM_odd_C4CM: /* CM CM C4 CM */
386: switch (GB18030_MAP[*p]) {
387: case C1:
388: case C2:
389: case C4:
390: return (UChar *)(s - 0); /* |CM CM|C4|CM */
391: case CM:
392: state = S_odd_CM_odd_C4CM;
393: break;
394: }
395: break;
396: case S_odd_CM_odd_C4CM: /* CM CM CM C4 CM */
397: switch (GB18030_MAP[*p]) {
398: case C1:
399: case C2:
400: case C4:
401: return (UChar *)(s - 2); /* |CM CM|CM C4 CM */
402: case CM:
403: state = S_even_CM_odd_C4CM;
404: break;
405: }
406: break;
407:
408: case S_even_CM_even_C4CM: /* CM CM C4 CM C4 CM */
409: switch (GB18030_MAP[*p]) {
410: case C1:
411: case C2:
412: case C4:
413: return (UChar *)(s - 2); /* |CM CM|C4|CM C4 CM */
414: case CM:
415: state = S_odd_CM_even_C4CM;
416: break;
417: }
418: break;
419: case S_odd_CM_even_C4CM: /* CM CM CM C4 CM C4 CM */
420: switch (GB18030_MAP[*p]) {
421: case C1:
422: case C2:
423: case C4:
424: return (UChar *)(s - 0); /* |CM CM|CM C4 CM C4|CM */
425: case CM:
426: state = S_even_CM_even_C4CM;
427: break;
428: }
429: break;
430: }
431: }
432:
433: DEBUG_GB18030(("state %d\n", state));
434: switch (state) {
435: case S_START: return (UChar *)(s - 0);
436: case S_one_C2: return (UChar *)(s - 0);
437: case S_one_C4: return (UChar *)(s - 0);
438: case S_one_CM: return (UChar *)(s - 0);
439:
440: case S_odd_CM_one_CX: return (UChar *)(s - 1);
441: case S_even_CM_one_CX: return (UChar *)(s - 0);
442:
443: case S_one_CMC4: return (UChar *)(s - 1);
444: case S_odd_CMC4: return (UChar *)(s - 1);
445: case S_one_C4_odd_CMC4: return (UChar *)(s - 1);
446: case S_even_CMC4: return (UChar *)(s - 3);
447: case S_one_C4_even_CMC4: return (UChar *)(s - 3);
448:
449: case S_odd_CM_odd_CMC4: return (UChar *)(s - 3);
450: case S_even_CM_odd_CMC4: return (UChar *)(s - 1);
451:
452: case S_odd_CM_even_CMC4: return (UChar *)(s - 1);
453: case S_even_CM_even_CMC4: return (UChar *)(s - 3);
454:
455: case S_odd_C4CM: return (UChar *)(s - 0);
456: case S_one_CM_odd_C4CM: return (UChar *)(s - 2);
457: case S_even_C4CM: return (UChar *)(s - 2);
458: case S_one_CM_even_C4CM: return (UChar *)(s - 0);
459:
460: case S_even_CM_odd_C4CM: return (UChar *)(s - 0);
461: case S_odd_CM_odd_C4CM: return (UChar *)(s - 2);
462: case S_even_CM_even_C4CM: return (UChar *)(s - 2);
463: case S_odd_CM_even_C4CM: return (UChar *)(s - 0);
464: }
465:
466: return (UChar* )s; /* never come here. (escape warning) */
467: }
468:
469: static int
470: gb18030_is_allowed_reverse_match(const UChar* s, const UChar* end)
471: {
472: return GB18030_MAP[*s] == C1 ? TRUE : FALSE;
473: }
474:
475: OnigEncodingType OnigEncodingGB18030 = {
476: gb18030_mbc_enc_len,
477: "GB18030", /* name */
478: 4, /* max enc length */
479: 1, /* min enc length */
480: ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE,
481: {
482: (OnigCodePoint )'\\' /* esc */
483: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
484: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
485: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
486: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
487: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
488: },
489: onigenc_is_mbc_newline_0x0a,
490: gb18030_mbc_to_code,
491: onigenc_mb4_code_to_mbclen,
492: gb18030_code_to_mbc,
493: gb18030_mbc_to_normalize,
494: gb18030_is_mbc_ambiguous,
495: onigenc_ascii_get_all_pair_ambig_codes,
496: onigenc_nothing_get_all_comp_ambig_codes,
497: gb18030_is_code_ctype,
498: onigenc_not_support_get_ctype_code_range,
499: gb18030_left_adjust_char_head,
500: gb18030_is_allowed_reverse_match
501: };
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>