Annotation of embedaddon/php/ext/mbstring/oniguruma/enc/gb18030.c, revision 1.1
1.1 ! misho 1: /**********************************************************************
! 2: gb18030.c - Oniguruma (regular expression library)
! 3: **********************************************************************/
! 4: /*-
! 5: * Copyright (c) 2005 KUBO Takehiro <kubo AT jiubao DOT org>
! 6: * K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
! 7: * All rights reserved.
! 8: *
! 9: * Redistribution and use in source and binary forms, with or without
! 10: * modification, are permitted provided that the following conditions
! 11: * are met:
! 12: * 1. Redistributions of source code must retain the above copyright
! 13: * notice, this list of conditions and the following disclaimer.
! 14: * 2. Redistributions in binary form must reproduce the above copyright
! 15: * notice, this list of conditions and the following disclaimer in the
! 16: * documentation and/or other materials provided with the distribution.
! 17: *
! 18: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
! 19: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
! 20: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
! 21: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
! 22: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
! 23: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
! 24: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
! 25: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
! 26: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
! 27: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
! 28: * SUCH DAMAGE.
! 29: */
! 30:
! 31: #include "regenc.h"
! 32:
/*
 * Debug tracing hook.  Call sites pass a fully parenthesised printf
 * argument list, e.g. DEBUG_GB18030(("state %d\n", st)).  Flip the
 * condition below to 1 to turn the trace into real printf calls; as
 * shipped the macro expands to nothing and its argument is never evaluated.
 */
#if 0
#define DEBUG_GB18030(arg) printf arg
#else
#define DEBUG_GB18030(arg)
#endif
! 38:
/*
 * Byte classes: which positions inside a GB18030 character a given
 * byte value may occupy.
 */
enum {
  C1 = 0,  /* one-byte char only */
  C2 = 1,  /* one-byte char, or second byte of a two-byte char */
  C4 = 2,  /* one-byte char, or second or fourth byte of a four-byte char */
  CM = 3   /* first byte of a two- or four-byte char, or second byte of a
              two-byte char */
};
! 45:
! 46: static const char GB18030_MAP[] = {
! 47: C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
! 48: C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
! 49: C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
! 50: C4, C4, C4, C4, C4, C4, C4, C4, C4, C4, C1, C1, C1, C1, C1, C1,
! 51: C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
! 52: C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
! 53: C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
! 54: C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C1,
! 55: C2, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
! 56: CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
! 57: CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
! 58: CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
! 59: CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
! 60: CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
! 61: CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
! 62: CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, C1
! 63: };
! 64:
! 65: static int
! 66: gb18030_mbc_enc_len(const UChar* p)
! 67: {
! 68: if (GB18030_MAP[*p] != CM)
! 69: return 1;
! 70: p++;
! 71: if (GB18030_MAP[*p] == C4)
! 72: return 4;
! 73: if (GB18030_MAP[*p] == C1)
! 74: return 1; /* illegal sequence */
! 75: return 2;
! 76: }
! 77:
! 78: static OnigCodePoint
! 79: gb18030_mbc_to_code(const UChar* p, const UChar* end)
! 80: {
! 81: return onigenc_mbn_mbc_to_code(ONIG_ENCODING_GB18030, p, end);
! 82: }
! 83:
! 84: static int
! 85: gb18030_code_to_mbc(OnigCodePoint code, UChar *buf)
! 86: {
! 87: return onigenc_mb4_code_to_mbc(ONIG_ENCODING_GB18030, code, buf);
! 88: }
! 89:
! 90: static int
! 91: gb18030_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end,
! 92: UChar* lower)
! 93: {
! 94: return onigenc_mbn_mbc_to_normalize(ONIG_ENCODING_GB18030, flag,
! 95: pp, end, lower);
! 96: }
! 97:
! 98: static int
! 99: gb18030_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end)
! 100: {
! 101: return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_GB18030, flag, pp, end);
! 102: }
! 103:
! 104: static int
! 105: gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype)
! 106: {
! 107: return onigenc_mb4_is_code_ctype(ONIG_ENCODING_GB18030, code, ctype);
! 108: }
! 109:
/*
 * States of the backward scan that locates a character head.
 *
 * A state name spells out the byte classes read so far, leftmost first
 * (the scan itself walks right to left).  "CMC4" stands for a "CM C4"
 * pair and "C4CM" for a "C4 CM" pair; "odd"/"even" give the parity of
 * the number of such pairs, or of the run of CM bytes, seen so far.
 *
 * The enumerators keep their implicit values 0..22.
 */
enum state {
  S_START,                /* nothing read yet */
  S_one_C2,               /* C2 */
  S_one_C4,               /* C4 */
  S_one_CM,               /* CM */

  /* a run of CM bytes in front of a single C2, CM or "CM C4" tail */
  S_odd_CM_one_CX,
  S_even_CM_one_CX,

  /* CMC4 : pair of "CM C4" */
  S_one_CMC4,             /* CM C4 */
  S_odd_CMC4,             /* CM C4 CM C4 CM C4 */
  S_one_C4_odd_CMC4,      /* C4 CM C4 */
  S_even_CMC4,            /* CM C4 CM C4 */
  S_one_C4_even_CMC4,     /* C4 CM C4 CM C4 */

  S_odd_CM_odd_CMC4,      /* CM CM C4 CM C4 CM C4 */
  S_even_CM_odd_CMC4,     /* CM CM CM C4 CM C4 CM C4 */

  S_odd_CM_even_CMC4,     /* CM CM C4 CM C4 */
  S_even_CM_even_CMC4,    /* CM CM CM C4 CM C4 */

  /* C4CM : pair of "C4 CM" */
  S_odd_C4CM,             /* C4 CM */
  S_one_CM_odd_C4CM,      /* CM C4 CM */
  S_even_C4CM,            /* C4 CM C4 CM */
  S_one_CM_even_C4CM,     /* CM C4 CM C4 CM */

  S_even_CM_odd_C4CM,     /* CM CM C4 CM */
  S_odd_CM_odd_C4CM,      /* CM CM CM C4 CM */
  S_even_CM_even_C4CM,    /* CM CM C4 CM C4 CM */
  S_odd_CM_even_C4CM      /* CM CM CM C4 CM C4 CM */
};
! 143:
! 144: static UChar*
! 145: gb18030_left_adjust_char_head(const UChar* start, const UChar* s)
! 146: {
! 147: const UChar *p;
! 148: enum state state = S_START;
! 149:
! 150: DEBUG_GB18030(("----------------\n"));
! 151: for (p = s; p >= start; p--) {
! 152: DEBUG_GB18030(("state %d --(%02x)-->\n", state, *p));
! 153: switch (state) {
! 154: case S_START:
! 155: switch (GB18030_MAP[*p]) {
! 156: case C1:
! 157: return (UChar *)s;
! 158: case C2:
! 159: state = S_one_C2; /* C2 */
! 160: break;
! 161: case C4:
! 162: state = S_one_C4; /* C4 */
! 163: break;
! 164: case CM:
! 165: state = S_one_CM; /* CM */
! 166: break;
! 167: }
! 168: break;
! 169: case S_one_C2: /* C2 */
! 170: switch (GB18030_MAP[*p]) {
! 171: case C1:
! 172: case C2:
! 173: case C4:
! 174: return (UChar *)s;
! 175: case CM:
! 176: state = S_odd_CM_one_CX; /* CM C2 */
! 177: break;
! 178: }
! 179: break;
! 180: case S_one_C4: /* C4 */
! 181: switch (GB18030_MAP[*p]) {
! 182: case C1:
! 183: case C2:
! 184: case C4:
! 185: return (UChar *)s;
! 186: case CM:
! 187: state = S_one_CMC4;
! 188: break;
! 189: }
! 190: break;
! 191: case S_one_CM: /* CM */
! 192: switch (GB18030_MAP[*p]) {
! 193: case C1:
! 194: case C2:
! 195: return (UChar *)s;
! 196: case C4:
! 197: state = S_odd_C4CM;
! 198: break;
! 199: case CM:
! 200: state = S_odd_CM_one_CX; /* CM CM */
! 201: break;
! 202: }
! 203: break;
! 204:
! 205: case S_odd_CM_one_CX: /* CM C2 */ /* CM CM */ /* CM CM CM C4 */
! 206: switch (GB18030_MAP[*p]) {
! 207: case C1:
! 208: case C2:
! 209: case C4:
! 210: return (UChar *)(s - 1);
! 211: case CM:
! 212: state = S_even_CM_one_CX;
! 213: break;
! 214: }
! 215: break;
! 216: case S_even_CM_one_CX: /* CM CM C2 */ /* CM CM CM */ /* CM CM C4 */
! 217: switch (GB18030_MAP[*p]) {
! 218: case C1:
! 219: case C2:
! 220: case C4:
! 221: return (UChar *)s;
! 222: case CM:
! 223: state = S_odd_CM_one_CX;
! 224: break;
! 225: }
! 226: break;
! 227:
! 228: case S_one_CMC4: /* CM C4 */
! 229: switch (GB18030_MAP[*p]) {
! 230: case C1:
! 231: case C2:
! 232: return (UChar *)(s - 1);
! 233: case C4:
! 234: state = S_one_C4_odd_CMC4; /* C4 CM C4 */
! 235: break;
! 236: case CM:
! 237: state = S_even_CM_one_CX; /* CM CM C4 */
! 238: break;
! 239: }
! 240: break;
! 241: case S_odd_CMC4: /* CM C4 CM C4 CM C4 */
! 242: switch (GB18030_MAP[*p]) {
! 243: case C1:
! 244: case C2:
! 245: return (UChar *)(s - 1);
! 246: case C4:
! 247: state = S_one_C4_odd_CMC4;
! 248: break;
! 249: case CM:
! 250: state = S_odd_CM_odd_CMC4;
! 251: break;
! 252: }
! 253: break;
! 254: case S_one_C4_odd_CMC4: /* C4 CM C4 */
! 255: switch (GB18030_MAP[*p]) {
! 256: case C1:
! 257: case C2:
! 258: case C4:
! 259: return (UChar *)(s - 1);
! 260: case CM:
! 261: state = S_even_CMC4; /* CM C4 CM C4 */
! 262: break;
! 263: }
! 264: break;
! 265: case S_even_CMC4: /* CM C4 CM C4 */
! 266: switch (GB18030_MAP[*p]) {
! 267: case C1:
! 268: case C2:
! 269: return (UChar *)(s - 3);
! 270: case C4:
! 271: state = S_one_C4_even_CMC4;
! 272: break;
! 273: case CM:
! 274: state = S_odd_CM_even_CMC4;
! 275: break;
! 276: }
! 277: break;
! 278: case S_one_C4_even_CMC4: /* C4 CM C4 CM C4 */
! 279: switch (GB18030_MAP[*p]) {
! 280: case C1:
! 281: case C2:
! 282: case C4:
! 283: return (UChar *)(s - 3);
! 284: case CM:
! 285: state = S_odd_CMC4;
! 286: break;
! 287: }
! 288: break;
! 289:
! 290: case S_odd_CM_odd_CMC4: /* CM CM C4 CM C4 CM C4 */
! 291: switch (GB18030_MAP[*p]) {
! 292: case C1:
! 293: case C2:
! 294: case C4:
! 295: return (UChar *)(s - 3);
! 296: case CM:
! 297: state = S_even_CM_odd_CMC4;
! 298: break;
! 299: }
! 300: break;
! 301: case S_even_CM_odd_CMC4: /* CM CM CM C4 CM C4 CM C4 */
! 302: switch (GB18030_MAP[*p]) {
! 303: case C1:
! 304: case C2:
! 305: case C4:
! 306: return (UChar *)(s - 1);
! 307: case CM:
! 308: state = S_odd_CM_odd_CMC4;
! 309: break;
! 310: }
! 311: break;
! 312:
! 313: case S_odd_CM_even_CMC4: /* CM CM C4 CM C4 */
! 314: switch (GB18030_MAP[*p]) {
! 315: case C1:
! 316: case C2:
! 317: case C4:
! 318: return (UChar *)(s - 1);
! 319: case CM:
! 320: state = S_even_CM_even_CMC4;
! 321: break;
! 322: }
! 323: break;
! 324: case S_even_CM_even_CMC4: /* CM CM CM C4 CM C4 */
! 325: switch (GB18030_MAP[*p]) {
! 326: case C1:
! 327: case C2:
! 328: case C4:
! 329: return (UChar *)(s - 3);
! 330: case CM:
! 331: state = S_odd_CM_even_CMC4;
! 332: break;
! 333: }
! 334: break;
! 335:
! 336: case S_odd_C4CM: /* C4 CM */ /* C4 CM C4 CM C4 CM*/
! 337: switch (GB18030_MAP[*p]) {
! 338: case C1:
! 339: case C2:
! 340: case C4:
! 341: return (UChar *)s;
! 342: case CM:
! 343: state = S_one_CM_odd_C4CM; /* CM C4 CM */
! 344: break;
! 345: }
! 346: break;
! 347: case S_one_CM_odd_C4CM: /* CM C4 CM */ /* CM C4 CM C4 CM C4 CM */
! 348: switch (GB18030_MAP[*p]) {
! 349: case C1:
! 350: case C2:
! 351: return (UChar *)(s - 2); /* |CM C4 CM */
! 352: case C4:
! 353: state = S_even_C4CM;
! 354: break;
! 355: case CM:
! 356: state = S_even_CM_odd_C4CM;
! 357: break;
! 358: }
! 359: break;
! 360: case S_even_C4CM: /* C4 CM C4 CM */
! 361: switch (GB18030_MAP[*p]) {
! 362: case C1:
! 363: case C2:
! 364: case C4:
! 365: return (UChar *)(s - 2); /* C4|CM C4 CM */
! 366: case CM:
! 367: state = S_one_CM_even_C4CM;
! 368: break;
! 369: }
! 370: break;
! 371: case S_one_CM_even_C4CM: /* CM C4 CM C4 CM */
! 372: switch (GB18030_MAP[*p]) {
! 373: case C1:
! 374: case C2:
! 375: return (UChar *)(s - 0); /*|CM C4 CM C4|CM */
! 376: case C4:
! 377: state = S_odd_C4CM;
! 378: break;
! 379: case CM:
! 380: state = S_even_CM_even_C4CM;
! 381: break;
! 382: }
! 383: break;
! 384:
! 385: case S_even_CM_odd_C4CM: /* CM CM C4 CM */
! 386: switch (GB18030_MAP[*p]) {
! 387: case C1:
! 388: case C2:
! 389: case C4:
! 390: return (UChar *)(s - 0); /* |CM CM|C4|CM */
! 391: case CM:
! 392: state = S_odd_CM_odd_C4CM;
! 393: break;
! 394: }
! 395: break;
! 396: case S_odd_CM_odd_C4CM: /* CM CM CM C4 CM */
! 397: switch (GB18030_MAP[*p]) {
! 398: case C1:
! 399: case C2:
! 400: case C4:
! 401: return (UChar *)(s - 2); /* |CM CM|CM C4 CM */
! 402: case CM:
! 403: state = S_even_CM_odd_C4CM;
! 404: break;
! 405: }
! 406: break;
! 407:
! 408: case S_even_CM_even_C4CM: /* CM CM C4 CM C4 CM */
! 409: switch (GB18030_MAP[*p]) {
! 410: case C1:
! 411: case C2:
! 412: case C4:
! 413: return (UChar *)(s - 2); /* |CM CM|C4|CM C4 CM */
! 414: case CM:
! 415: state = S_odd_CM_even_C4CM;
! 416: break;
! 417: }
! 418: break;
! 419: case S_odd_CM_even_C4CM: /* CM CM CM C4 CM C4 CM */
! 420: switch (GB18030_MAP[*p]) {
! 421: case C1:
! 422: case C2:
! 423: case C4:
! 424: return (UChar *)(s - 0); /* |CM CM|CM C4 CM C4|CM */
! 425: case CM:
! 426: state = S_even_CM_even_C4CM;
! 427: break;
! 428: }
! 429: break;
! 430: }
! 431: }
! 432:
! 433: DEBUG_GB18030(("state %d\n", state));
! 434: switch (state) {
! 435: case S_START: return (UChar *)(s - 0);
! 436: case S_one_C2: return (UChar *)(s - 0);
! 437: case S_one_C4: return (UChar *)(s - 0);
! 438: case S_one_CM: return (UChar *)(s - 0);
! 439:
! 440: case S_odd_CM_one_CX: return (UChar *)(s - 1);
! 441: case S_even_CM_one_CX: return (UChar *)(s - 0);
! 442:
! 443: case S_one_CMC4: return (UChar *)(s - 1);
! 444: case S_odd_CMC4: return (UChar *)(s - 1);
! 445: case S_one_C4_odd_CMC4: return (UChar *)(s - 1);
! 446: case S_even_CMC4: return (UChar *)(s - 3);
! 447: case S_one_C4_even_CMC4: return (UChar *)(s - 3);
! 448:
! 449: case S_odd_CM_odd_CMC4: return (UChar *)(s - 3);
! 450: case S_even_CM_odd_CMC4: return (UChar *)(s - 1);
! 451:
! 452: case S_odd_CM_even_CMC4: return (UChar *)(s - 1);
! 453: case S_even_CM_even_CMC4: return (UChar *)(s - 3);
! 454:
! 455: case S_odd_C4CM: return (UChar *)(s - 0);
! 456: case S_one_CM_odd_C4CM: return (UChar *)(s - 2);
! 457: case S_even_C4CM: return (UChar *)(s - 2);
! 458: case S_one_CM_even_C4CM: return (UChar *)(s - 0);
! 459:
! 460: case S_even_CM_odd_C4CM: return (UChar *)(s - 0);
! 461: case S_odd_CM_odd_C4CM: return (UChar *)(s - 2);
! 462: case S_even_CM_even_C4CM: return (UChar *)(s - 2);
! 463: case S_odd_CM_even_C4CM: return (UChar *)(s - 0);
! 464: }
! 465:
! 466: return (UChar* )s; /* never come here. (escape warning) */
! 467: }
! 468:
! 469: static int
! 470: gb18030_is_allowed_reverse_match(const UChar* s, const UChar* end)
! 471: {
! 472: return GB18030_MAP[*s] == C1 ? TRUE : FALSE;
! 473: }
! 474:
! 475: OnigEncodingType OnigEncodingGB18030 = {
! 476: gb18030_mbc_enc_len,
! 477: "GB18030", /* name */
! 478: 4, /* max enc length */
! 479: 1, /* min enc length */
! 480: ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE,
! 481: {
! 482: (OnigCodePoint )'\\' /* esc */
! 483: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
! 484: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
! 485: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
! 486: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
! 487: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
! 488: },
! 489: onigenc_is_mbc_newline_0x0a,
! 490: gb18030_mbc_to_code,
! 491: onigenc_mb4_code_to_mbclen,
! 492: gb18030_code_to_mbc,
! 493: gb18030_mbc_to_normalize,
! 494: gb18030_is_mbc_ambiguous,
! 495: onigenc_ascii_get_all_pair_ambig_codes,
! 496: onigenc_nothing_get_all_comp_ambig_codes,
! 497: gb18030_is_code_ctype,
! 498: onigenc_not_support_get_ctype_code_range,
! 499: gb18030_left_adjust_char_head,
! 500: gb18030_is_allowed_reverse_match
! 501: };
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>