Return to sjis.c CVS log | Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / php / ext / mbstring / oniguruma / enc |
1.1 misho 1: /********************************************************************** 2: sjis.c - Oniguruma (regular expression library) 3: **********************************************************************/ 4: /*- 5: * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 6: * All rights reserved. 7: * 8: * Redistribution and use in source and binary forms, with or without 9: * modification, are permitted provided that the following conditions 10: * are met: 11: * 1. Redistributions of source code must retain the above copyright 12: * notice, this list of conditions and the following disclaimer. 13: * 2. Redistributions in binary form must reproduce the above copyright 14: * notice, this list of conditions and the following disclaimer in the 15: * documentation and/or other materials provided with the distribution. 16: * 17: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27: * SUCH DAMAGE. 28: */ 29: 30: #include "regenc.h" 31: 32: static const int EncLen_SJIS[] = { 33: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 34: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 35: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 36: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 37: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 38: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 39: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 40: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 41: 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 42: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 43: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 44: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 45: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 46: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 47: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 48: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1 49: }; 50: 51: static const char SJIS_CAN_BE_TRAIL_TABLE[256] = { 52: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 53: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 54: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 56: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 57: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 58: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 59: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 60: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 61: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 62: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 63: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 64: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 65: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 66: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 67: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 68: }; 69: 70: #define SJIS_ISMB_FIRST(byte) (EncLen_SJIS[byte] > 1) 71: #define SJIS_ISMB_TRAIL(byte) SJIS_CAN_BE_TRAIL_TABLE[(byte)] 72: 73: static int 74: sjis_mbc_enc_len(const UChar* p) 75: { 76: return EncLen_SJIS[*p]; 77: } 78: 79: static int 80: sjis_code_to_mbclen(OnigCodePoint code) 81: { 82: if (code < 256) { 83: if (EncLen_SJIS[(int )code] == 1) 84: return 1; 85: else 86: return 0; 87: } 88: else if (code <= 0xffff) { 89: return 2; 90: } 91: else 92: return 0; 93: } 94: 95: static OnigCodePoint 96: sjis_mbc_to_code(const UChar* p, const UChar* end) 97: { 98: int c, i, len; 99: OnigCodePoint n; 100: 101: len = enc_len(ONIG_ENCODING_SJIS, p); 102: c = *p++; 103: n = c; 104: if (len == 1) return n; 105: 106: for (i = 1; i < len; i++) { 107: if (p >= end) break; 108: c = *p++; 109: n <<= 8; n += c; 110: } 111: return n; 112: } 113: 114: static int 115: sjis_code_to_mbc(OnigCodePoint code, UChar *buf) 116: { 117: UChar *p = buf; 118: 119: if ((code & 0xff00) != 0) *p++ = (UChar )(((code >> 8) & 0xff)); 120: *p++ = (UChar )(code & 0xff); 121: 122: #if 0 123: if (enc_len(ONIG_ENCODING_SJIS, buf) != (p - buf)) 124: return REGERR_INVALID_WIDE_CHAR_VALUE; 125: #endif 126: return p - buf; 127: } 128: 129: static int 130: sjis_mbc_to_normalize(OnigAmbigType flag, 131: const UChar** pp, const UChar* end, UChar* lower) 132: { 133: const UChar* p = *pp; 134: 135: if (ONIGENC_IS_MBC_ASCII(p)) { 136: if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { 137: *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); 138: } 139: else { 140: *lower = *p; 141: } 142: 143: (*pp)++; 144: return 1; 145: } 146: else { 147: int len = enc_len(ONIG_ENCODING_SJIS, p); 148: 149: if (lower != p) { 150: int i; 151: for (i = 0; i < len; i++) { 152: *lower++ = *p++; 153: } 154: } 155: (*pp) += len; 156: return len; /* return byte length of converted char to lower */ 157: } 158: } 159: 160: static int 161: sjis_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end) 162: { 163: return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_SJIS, flag, pp, end); 164: 165: } 166: 167: static int 168: sjis_is_code_ctype(OnigCodePoint code, unsigned int ctype) 169: { 170: if (code < 128) 171: return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); 172: else { 173: if ((ctype & (ONIGENC_CTYPE_WORD | 174: ONIGENC_CTYPE_GRAPH | ONIGENC_CTYPE_PRINT)) != 0) { 175: return (sjis_code_to_mbclen(code) > 1 ? TRUE : FALSE); 176: } 177: } 178: 179: return FALSE; 180: } 181: 182: static UChar* 183: sjis_left_adjust_char_head(const UChar* start, const UChar* s) 184: { 185: const UChar *p; 186: int len; 187: 188: if (s <= start) return (UChar* )s; 189: p = s; 190: 191: if (SJIS_ISMB_TRAIL(*p)) { 192: while (p > start) { 193: if (! SJIS_ISMB_FIRST(*--p)) { 194: p++; 195: break; 196: } 197: } 198: } 199: len = enc_len(ONIG_ENCODING_SJIS, p); 200: if (p + len > s) return (UChar* )p; 201: p += len; 202: return (UChar* )(p + ((s - p) & ~1)); 203: } 204: 205: static int 206: sjis_is_allowed_reverse_match(const UChar* s, const UChar* end) 207: { 208: const UChar c = *s; 209: return (SJIS_ISMB_TRAIL(c) ? FALSE : TRUE); 210: } 211: 212: OnigEncodingType OnigEncodingSJIS = { 213: sjis_mbc_enc_len, 214: "Shift_JIS", /* name */ 215: 2, /* max byte length */ 216: 1, /* min byte length */ 217: ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE, 218: { 219: (OnigCodePoint )'\\' /* esc */ 220: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ 221: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ 222: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ 223: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ 224: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ 225: }, 226: onigenc_is_mbc_newline_0x0a, 227: sjis_mbc_to_code, 228: sjis_code_to_mbclen, 229: sjis_code_to_mbc, 230: sjis_mbc_to_normalize, 231: sjis_is_mbc_ambiguous, 232: onigenc_ascii_get_all_pair_ambig_codes, 233: onigenc_nothing_get_all_comp_ambig_codes, 234: sjis_is_code_ctype, 235: onigenc_not_support_get_ctype_code_range, 236: sjis_left_adjust_char_head, 237: sjis_is_allowed_reverse_match 238: };