Return to utf16_le.c CVS log | Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / php / ext / mbstring / oniguruma / enc |
1.1 misho 1: /********************************************************************** 2: utf16_le.c - Oniguruma (regular expression library) 3: **********************************************************************/ 4: /*- 5: * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 6: * All rights reserved. 7: * 8: * Redistribution and use in source and binary forms, with or without 9: * modification, are permitted provided that the following conditions 10: * are met: 11: * 1. Redistributions of source code must retain the above copyright 12: * notice, this list of conditions and the following disclaimer. 13: * 2. Redistributions in binary form must reproduce the above copyright 14: * notice, this list of conditions and the following disclaimer in the 15: * documentation and/or other materials provided with the distribution. 16: * 17: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27: * SUCH DAMAGE. 28: */ 29: 30: #include "regenc.h" 31: 32: #define UTF16_IS_SURROGATE_FIRST(c) (c >= 0xd8 && c <= 0xdb) 33: #define UTF16_IS_SURROGATE_SECOND(c) (c >= 0xdc && c <= 0xdf) 34: 35: static const int EncLen_UTF16[] = { 36: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 37: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 38: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 39: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 40: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 41: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 42: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 43: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 44: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 45: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 46: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 47: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 48: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 49: 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 50: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 51: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 52: }; 53: 54: static int 55: utf16le_code_to_mbclen(OnigCodePoint code) 56: { 57: return (code > 0xffff ? 4 : 2); 58: } 59: 60: static int 61: utf16le_mbc_enc_len(const UChar* p) 62: { 63: return EncLen_UTF16[*(p+1)]; 64: } 65: 66: static int 67: utf16le_is_mbc_newline(const UChar* p, const UChar* end) 68: { 69: if (p + 1 < end) { 70: if (*p == 0x0a && *(p+1) == 0x00) 71: return 1; 72: #ifdef USE_UNICODE_ALL_LINE_TERMINATORS 73: if ((*p == 0x0d || *p == 0x85) && *(p+1) == 0x00) 74: return 1; 75: if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28)) 76: return 1; 77: #endif 78: } 79: return 0; 80: } 81: 82: static OnigCodePoint 83: utf16le_mbc_to_code(const UChar* p, const UChar* end) 84: { 85: OnigCodePoint code; 86: UChar c0 = *p; 87: UChar c1 = *(p+1); 88: 89: if (UTF16_IS_SURROGATE_FIRST(c1)) { 90: code = ((((c1 - 0xd8) << 2) + ((c0 & 0xc0) >> 6) + 1) << 16) 91: + ((((c0 & 0x3f) << 2) + (p[3] - 0xdc)) << 8) 92: + p[2]; 93: } 94: else { 95: code = c1 * 256 + p[0]; 96: } 97: return code; 98: } 99: 100: static int 101: utf16le_code_to_mbc(OnigCodePoint code, UChar *buf) 102: { 103: UChar* p = buf; 104: 105: if (code > 0xffff) { 106: unsigned int plane, high; 107: 108: plane = code >> 16; 109: high = (code & 0xff00) >> 8; 110: 111: *p++ = ((plane & 0x03) << 6) + (high >> 2); 112: *p++ = (plane >> 2) + 0xd8; 113: *p++ = (UChar )(code & 0xff); 114: *p = (high & 0x02) + 0xdc; 115: return 4; 116: } 117: else { 118: *p++ = (UChar )(code & 0xff); 119: *p++ = (UChar )((code & 0xff00) >> 8); 120: return 2; 121: } 122: } 123: 124: static int 125: utf16le_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end, 126: UChar* lower) 127: { 128: const UChar* p = *pp; 129: 130: if (*(p+1) == 0) { 131: *(lower+1) = '\0'; 132: if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && 133: ONIGENC_IS_MBC_ASCII(p)) || 134: ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && 135: !ONIGENC_IS_MBC_ASCII(p))) { 136: *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p); 137: } 138: else { 139: *lower = *p; 140: } 141: (*pp) += 2; 142: return 2; /* return byte length of converted char to lower */ 143: } 144: else { 145: int len = EncLen_UTF16[*(p+1)]; 146: if (lower != p) { 147: int i; 148: for (i = 0; i < len; i++) { 149: *lower++ = *p++; 150: } 151: } 152: (*pp) += len; 153: return len; /* return byte length of converted char to lower */ 154: } 155: } 156: 157: static int 158: utf16le_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end) 159: { 160: const UChar* p = *pp; 161: 162: (*pp) += EncLen_UTF16[*(p+1)]; 163: 164: if (*(p+1) == 0) { 165: int c, v; 166: 167: if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && 168: ONIGENC_IS_MBC_ASCII(p)) || 169: ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && 170: !ONIGENC_IS_MBC_ASCII(p))) { 171: c = *p; 172: v = ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(c, 173: (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); 174: if ((v | ONIGENC_CTYPE_LOWER) != 0) { 175: /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ 176: if (c >= 0xaa && c <= 0xba) 177: return FALSE; 178: else 179: return TRUE; 180: } 181: return (v != 0 ? TRUE : FALSE); 182: } 183: } 184: 185: return FALSE; 186: } 187: 188: static UChar* 189: utf16le_left_adjust_char_head(const UChar* start, const UChar* s) 190: { 191: if (s <= start) return (UChar* )s; 192: 193: if ((s - start) % 2 == 1) { 194: s--; 195: } 196: 197: if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1) 198: s -= 2; 199: 200: return (UChar* )s; 201: } 202: 203: OnigEncodingType OnigEncodingUTF16_LE = { 204: utf16le_mbc_enc_len, 205: "UTF-16LE", /* name */ 206: 4, /* max byte length */ 207: 2, /* min byte length */ 208: (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | 209: ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE ), 210: { 211: (OnigCodePoint )'\\' /* esc */ 212: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ 213: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ 214: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ 215: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ 216: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ 217: }, 218: utf16le_is_mbc_newline, 219: utf16le_mbc_to_code, 220: utf16le_code_to_mbclen, 221: utf16le_code_to_mbc, 222: utf16le_mbc_to_normalize, 223: utf16le_is_mbc_ambiguous, 224: onigenc_iso_8859_1_get_all_pair_ambig_codes, 225: onigenc_ess_tsett_get_all_comp_ambig_codes, 226: onigenc_unicode_is_code_ctype, 227: onigenc_unicode_get_ctype_code_range, 228: utf16le_left_adjust_char_head, 229: onigenc_always_false_is_allowed_reverse_match 230: };