Return to gb18030.c CVS log | Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / php / ext / mbstring / oniguruma / enc |
1.1 misho 1: /********************************************************************** 2: gb18030.c - Oniguruma (regular expression library) 3: **********************************************************************/ 4: /*- 5: * Copyright (c) 2005 KUBO Takehiro <kubo AT jiubao DOT org> 6: * K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 7: * All rights reserved. 8: * 9: * Redistribution and use in source and binary forms, with or without 10: * modification, are permitted provided that the following conditions 11: * are met: 12: * 1. Redistributions of source code must retain the above copyright 13: * notice, this list of conditions and the following disclaimer. 14: * 2. Redistributions in binary form must reproduce the above copyright 15: * notice, this list of conditions and the following disclaimer in the 16: * documentation and/or other materials provided with the distribution. 17: * 18: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28: * SUCH DAMAGE. 29: */ 30: 31: #include "regenc.h" 32: 33: #if 1 34: #define DEBUG_GB18030(arg) 35: #else 36: #define DEBUG_GB18030(arg) printf arg 37: #endif 38: 39: enum { 40: C1, /* one-byte char */ 41: C2, /* one-byte or second of two-byte char */ 42: C4, /* one-byte or second or fourth of four-byte char */ 43: CM /* first of two- or four-byte char or second of two-byte char */ 44: }; 45: 46: static const char GB18030_MAP[] = { 47: C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, 48: C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, 49: C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, 50: C4, C4, C4, C4, C4, C4, C4, C4, C4, C4, C1, C1, C1, C1, C1, C1, 51: C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, 52: C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, 53: C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, 54: C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C1, 55: C2, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, 56: CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, 57: CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, 58: CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, 59: CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, 60: CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, 61: CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, 62: CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, C1 63: }; 64: 65: static int 66: gb18030_mbc_enc_len(const UChar* p) 67: { 68: if (GB18030_MAP[*p] != CM) 69: return 1; 70: p++; 71: if (GB18030_MAP[*p] == C4) 72: return 4; 73: if (GB18030_MAP[*p] == C1) 74: return 1; /* illegal sequence */ 75: return 2; 76: } 77: 78: static OnigCodePoint 79: gb18030_mbc_to_code(const UChar* p, const UChar* end) 80: { 81: return onigenc_mbn_mbc_to_code(ONIG_ENCODING_GB18030, p, end); 82: } 83: 84: static int 85: gb18030_code_to_mbc(OnigCodePoint code, UChar *buf) 86: { 87: return onigenc_mb4_code_to_mbc(ONIG_ENCODING_GB18030, code, buf); 88: } 89: 90: static int 91: gb18030_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end, 92: UChar* lower) 93: { 94: return onigenc_mbn_mbc_to_normalize(ONIG_ENCODING_GB18030, flag, 95: pp, end, lower); 96: } 97: 98: static int 99: gb18030_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end) 100: { 101: return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_GB18030, flag, pp, end); 102: } 103: 104: static int 105: gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype) 106: { 107: return onigenc_mb4_is_code_ctype(ONIG_ENCODING_GB18030, code, ctype); 108: } 109: 110: enum state { 111: S_START, 112: S_one_C2, 113: S_one_C4, 114: S_one_CM, 115: 116: S_odd_CM_one_CX, 117: S_even_CM_one_CX, 118: 119: /* CMC4 : pair of "CM C4" */ 120: S_one_CMC4, 121: S_odd_CMC4, 122: S_one_C4_odd_CMC4, 123: S_even_CMC4, 124: S_one_C4_even_CMC4, 125: 126: S_odd_CM_odd_CMC4, 127: S_even_CM_odd_CMC4, 128: 129: S_odd_CM_even_CMC4, 130: S_even_CM_even_CMC4, 131: 132: /* C4CM : pair of "C4 CM" */ 133: S_odd_C4CM, 134: S_one_CM_odd_C4CM, 135: S_even_C4CM, 136: S_one_CM_even_C4CM, 137: 138: S_even_CM_odd_C4CM, 139: S_odd_CM_odd_C4CM, 140: S_even_CM_even_C4CM, 141: S_odd_CM_even_C4CM, 142: }; 143: 144: static UChar* 145: gb18030_left_adjust_char_head(const UChar* start, const UChar* s) 146: { 147: const UChar *p; 148: enum state state = S_START; 149: 150: DEBUG_GB18030(("----------------\n")); 151: for (p = s; p >= start; p--) { 152: DEBUG_GB18030(("state %d --(%02x)-->\n", state, *p)); 153: switch (state) { 154: case S_START: 155: switch (GB18030_MAP[*p]) { 156: case C1: 157: return (UChar *)s; 158: case C2: 159: state = S_one_C2; /* C2 */ 160: break; 161: case C4: 162: state = S_one_C4; /* C4 */ 163: break; 164: case CM: 165: state = S_one_CM; /* CM */ 166: break; 167: } 168: break; 169: case S_one_C2: /* C2 */ 170: switch (GB18030_MAP[*p]) { 171: case C1: 172: case C2: 173: case C4: 174: return (UChar *)s; 175: case CM: 176: state = S_odd_CM_one_CX; /* CM C2 */ 177: break; 178: } 179: break; 180: case S_one_C4: /* C4 */ 181: switch (GB18030_MAP[*p]) { 182: case C1: 183: case C2: 184: case C4: 185: return (UChar *)s; 186: case CM: 187: state = S_one_CMC4; 188: break; 189: } 190: break; 191: case S_one_CM: /* CM */ 192: switch (GB18030_MAP[*p]) { 193: case C1: 194: case C2: 195: return (UChar *)s; 196: case C4: 197: state = S_odd_C4CM; 198: break; 199: case CM: 200: state = S_odd_CM_one_CX; /* CM CM */ 201: break; 202: } 203: break; 204: 205: case S_odd_CM_one_CX: /* CM C2 */ /* CM CM */ /* CM CM CM C4 */ 206: switch (GB18030_MAP[*p]) { 207: case C1: 208: case C2: 209: case C4: 210: return (UChar *)(s - 1); 211: case CM: 212: state = S_even_CM_one_CX; 213: break; 214: } 215: break; 216: case S_even_CM_one_CX: /* CM CM C2 */ /* CM CM CM */ /* CM CM C4 */ 217: switch (GB18030_MAP[*p]) { 218: case C1: 219: case C2: 220: case C4: 221: return (UChar *)s; 222: case CM: 223: state = S_odd_CM_one_CX; 224: break; 225: } 226: break; 227: 228: case S_one_CMC4: /* CM C4 */ 229: switch (GB18030_MAP[*p]) { 230: case C1: 231: case C2: 232: return (UChar *)(s - 1); 233: case C4: 234: state = S_one_C4_odd_CMC4; /* C4 CM C4 */ 235: break; 236: case CM: 237: state = S_even_CM_one_CX; /* CM CM C4 */ 238: break; 239: } 240: break; 241: case S_odd_CMC4: /* CM C4 CM C4 CM C4 */ 242: switch (GB18030_MAP[*p]) { 243: case C1: 244: case C2: 245: return (UChar *)(s - 1); 246: case C4: 247: state = S_one_C4_odd_CMC4; 248: break; 249: case CM: 250: state = S_odd_CM_odd_CMC4; 251: break; 252: } 253: break; 254: case S_one_C4_odd_CMC4: /* C4 CM C4 */ 255: switch (GB18030_MAP[*p]) { 256: case C1: 257: case C2: 258: case C4: 259: return (UChar *)(s - 1); 260: case CM: 261: state = S_even_CMC4; /* CM C4 CM C4 */ 262: break; 263: } 264: break; 265: case S_even_CMC4: /* CM C4 CM C4 */ 266: switch (GB18030_MAP[*p]) { 267: case C1: 268: case C2: 269: return (UChar *)(s - 3); 270: case C4: 271: state = S_one_C4_even_CMC4; 272: break; 273: case CM: 274: state = S_odd_CM_even_CMC4; 275: break; 276: } 277: break; 278: case S_one_C4_even_CMC4: /* C4 CM C4 CM C4 */ 279: switch (GB18030_MAP[*p]) { 280: case C1: 281: case C2: 282: case C4: 283: return (UChar *)(s - 3); 284: case CM: 285: state = S_odd_CMC4; 286: break; 287: } 288: break; 289: 290: case S_odd_CM_odd_CMC4: /* CM CM C4 CM C4 CM C4 */ 291: switch (GB18030_MAP[*p]) { 292: case C1: 293: case C2: 294: case C4: 295: return (UChar *)(s - 3); 296: case CM: 297: state = S_even_CM_odd_CMC4; 298: break; 299: } 300: break; 301: case S_even_CM_odd_CMC4: /* CM CM CM C4 CM C4 CM C4 */ 302: switch (GB18030_MAP[*p]) { 303: case C1: 304: case C2: 305: case C4: 306: return (UChar *)(s - 1); 307: case CM: 308: state = S_odd_CM_odd_CMC4; 309: break; 310: } 311: break; 312: 313: case S_odd_CM_even_CMC4: /* CM CM C4 CM C4 */ 314: switch (GB18030_MAP[*p]) { 315: case C1: 316: case C2: 317: case C4: 318: return (UChar *)(s - 1); 319: case CM: 320: state = S_even_CM_even_CMC4; 321: break; 322: } 323: break; 324: case S_even_CM_even_CMC4: /* CM CM CM C4 CM C4 */ 325: switch (GB18030_MAP[*p]) { 326: case C1: 327: case C2: 328: case C4: 329: return (UChar *)(s - 3); 330: case CM: 331: state = S_odd_CM_even_CMC4; 332: break; 333: } 334: break; 335: 336: case S_odd_C4CM: /* C4 CM */ /* C4 CM C4 CM C4 CM*/ 337: switch (GB18030_MAP[*p]) { 338: case C1: 339: case C2: 340: case C4: 341: return (UChar *)s; 342: case CM: 343: state = S_one_CM_odd_C4CM; /* CM C4 CM */ 344: break; 345: } 346: break; 347: case S_one_CM_odd_C4CM: /* CM C4 CM */ /* CM C4 CM C4 CM C4 CM */ 348: switch (GB18030_MAP[*p]) { 349: case C1: 350: case C2: 351: return (UChar *)(s - 2); /* |CM C4 CM */ 352: case C4: 353: state = S_even_C4CM; 354: break; 355: case CM: 356: state = S_even_CM_odd_C4CM; 357: break; 358: } 359: break; 360: case S_even_C4CM: /* C4 CM C4 CM */ 361: switch (GB18030_MAP[*p]) { 362: case C1: 363: case C2: 364: case C4: 365: return (UChar *)(s - 2); /* C4|CM C4 CM */ 366: case CM: 367: state = S_one_CM_even_C4CM; 368: break; 369: } 370: break; 371: case S_one_CM_even_C4CM: /* CM C4 CM C4 CM */ 372: switch (GB18030_MAP[*p]) { 373: case C1: 374: case C2: 375: return (UChar *)(s - 0); /*|CM C4 CM C4|CM */ 376: case C4: 377: state = S_odd_C4CM; 378: break; 379: case CM: 380: state = S_even_CM_even_C4CM; 381: break; 382: } 383: break; 384: 385: case S_even_CM_odd_C4CM: /* CM CM C4 CM */ 386: switch (GB18030_MAP[*p]) { 387: case C1: 388: case C2: 389: case C4: 390: return (UChar *)(s - 0); /* |CM CM|C4|CM */ 391: case CM: 392: state = S_odd_CM_odd_C4CM; 393: break; 394: } 395: break; 396: case S_odd_CM_odd_C4CM: /* CM CM CM C4 CM */ 397: switch (GB18030_MAP[*p]) { 398: case C1: 399: case C2: 400: case C4: 401: return (UChar *)(s - 2); /* |CM CM|CM C4 CM */ 402: case CM: 403: state = S_even_CM_odd_C4CM; 404: break; 405: } 406: break; 407: 408: case S_even_CM_even_C4CM: /* CM CM C4 CM C4 CM */ 409: switch (GB18030_MAP[*p]) { 410: case C1: 411: case C2: 412: case C4: 413: return (UChar *)(s - 2); /* |CM CM|C4|CM C4 CM */ 414: case CM: 415: state = S_odd_CM_even_C4CM; 416: break; 417: } 418: break; 419: case S_odd_CM_even_C4CM: /* CM CM CM C4 CM C4 CM */ 420: switch (GB18030_MAP[*p]) { 421: case C1: 422: case C2: 423: case C4: 424: return (UChar *)(s - 0); /* |CM CM|CM C4 CM C4|CM */ 425: case CM: 426: state = S_even_CM_even_C4CM; 427: break; 428: } 429: break; 430: } 431: } 432: 433: DEBUG_GB18030(("state %d\n", state)); 434: switch (state) { 435: case S_START: return (UChar *)(s - 0); 436: case S_one_C2: return (UChar *)(s - 0); 437: case S_one_C4: return (UChar *)(s - 0); 438: case S_one_CM: return (UChar *)(s - 0); 439: 440: case S_odd_CM_one_CX: return (UChar *)(s - 1); 441: case S_even_CM_one_CX: return (UChar *)(s - 0); 442: 443: case S_one_CMC4: return (UChar *)(s - 1); 444: case S_odd_CMC4: return (UChar *)(s - 1); 445: case S_one_C4_odd_CMC4: return (UChar *)(s - 1); 446: case S_even_CMC4: return (UChar *)(s - 3); 447: case S_one_C4_even_CMC4: return (UChar *)(s - 3); 448: 449: case S_odd_CM_odd_CMC4: return (UChar *)(s - 3); 450: case S_even_CM_odd_CMC4: return (UChar *)(s - 1); 451: 452: case S_odd_CM_even_CMC4: return (UChar *)(s - 1); 453: case S_even_CM_even_CMC4: return (UChar *)(s - 3); 454: 455: case S_odd_C4CM: return (UChar *)(s - 0); 456: case S_one_CM_odd_C4CM: return (UChar *)(s - 2); 457: case S_even_C4CM: return (UChar *)(s - 2); 458: case S_one_CM_even_C4CM: return (UChar *)(s - 0); 459: 460: case S_even_CM_odd_C4CM: return (UChar *)(s - 0); 461: case S_odd_CM_odd_C4CM: return (UChar *)(s - 2); 462: case S_even_CM_even_C4CM: return (UChar *)(s - 2); 463: case S_odd_CM_even_C4CM: return (UChar *)(s - 0); 464: } 465: 466: return (UChar* )s; /* never come here. (escape warning) */ 467: } 468: 469: static int 470: gb18030_is_allowed_reverse_match(const UChar* s, const UChar* end) 471: { 472: return GB18030_MAP[*s] == C1 ? TRUE : FALSE; 473: } 474: 475: OnigEncodingType OnigEncodingGB18030 = { 476: gb18030_mbc_enc_len, 477: "GB18030", /* name */ 478: 4, /* max enc length */ 479: 1, /* min enc length */ 480: ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE, 481: { 482: (OnigCodePoint )'\\' /* esc */ 483: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ 484: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ 485: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ 486: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ 487: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ 488: }, 489: onigenc_is_mbc_newline_0x0a, 490: gb18030_mbc_to_code, 491: onigenc_mb4_code_to_mbclen, 492: gb18030_code_to_mbc, 493: gb18030_mbc_to_normalize, 494: gb18030_is_mbc_ambiguous, 495: onigenc_ascii_get_all_pair_ambig_codes, 496: onigenc_nothing_get_all_comp_ambig_codes, 497: gb18030_is_code_ctype, 498: onigenc_not_support_get_ctype_code_range, 499: gb18030_left_adjust_char_head, 500: gb18030_is_allowed_reverse_match 501: };