Return to utf16_be.c CVS log | Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / php / ext / mbstring / oniguruma / enc |
1.1 misho 1: /********************************************************************** 2: utf16_be.c - Oniguruma (regular expression library) 3: **********************************************************************/ 4: /*- 5: * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 6: * All rights reserved. 7: * 8: * Redistribution and use in source and binary forms, with or without 9: * modification, are permitted provided that the following conditions 10: * are met: 11: * 1. Redistributions of source code must retain the above copyright 12: * notice, this list of conditions and the following disclaimer. 13: * 2. Redistributions in binary form must reproduce the above copyright 14: * notice, this list of conditions and the following disclaimer in the 15: * documentation and/or other materials provided with the distribution. 16: * 17: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27: * SUCH DAMAGE. 28: */ 29: 30: #include "regenc.h" 31: 32: #define UTF16_IS_SURROGATE_FIRST(c) (c >= 0xd8 && c <= 0xdb) 33: #define UTF16_IS_SURROGATE_SECOND(c) (c >= 0xdc && c <= 0xdf) 34: 35: static const int EncLen_UTF16[] = { 36: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 37: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 38: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 39: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 40: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 41: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 42: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 43: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 44: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 45: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 46: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 47: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 48: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 49: 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 50: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 51: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 52: }; 53: 54: static int 55: utf16be_mbc_enc_len(const UChar* p) 56: { 57: return EncLen_UTF16[*p]; 58: } 59: 60: static int 61: utf16be_is_mbc_newline(const UChar* p, const UChar* end) 62: { 63: if (p + 1 < end) { 64: if (*(p+1) == 0x0a && *p == 0x00) 65: return 1; 66: #ifdef USE_UNICODE_ALL_LINE_TERMINATORS 67: if ((*(p+1) == 0x0d || *(p+1) == 0x85) && *p == 0x00) 68: return 1; 69: if (*p == 0x20 && (*(p+1) == 0x29 || *(p+1) == 0x28)) 70: return 1; 71: #endif 72: } 73: return 0; 74: } 75: 76: static OnigCodePoint 77: utf16be_mbc_to_code(const UChar* p, const UChar* end) 78: { 79: OnigCodePoint code; 80: 81: if (UTF16_IS_SURROGATE_FIRST(*p)) { 82: code = ((((p[0] - 0xd8) << 2) + ((p[1] & 0xc0) >> 6) + 1) << 16) 83: + ((((p[1] & 0x3f) << 2) + (p[2] - 0xdc)) << 8) 84: + p[3]; 85: } 86: else { 87: code = p[0] * 256 + p[1]; 88: } 89: return code; 90: } 91: 92: static int 93: utf16be_code_to_mbclen(OnigCodePoint code) 94: { 95: return (code > 0xffff ? 4 : 2); 96: } 97: 98: static int 99: utf16be_code_to_mbc(OnigCodePoint code, UChar *buf) 100: { 101: UChar* p = buf; 102: 103: if (code > 0xffff) { 104: unsigned int plane, high; 105: 106: plane = code >> 16; 107: *p++ = (plane >> 2) + 0xd8; 108: high = (code & 0xff00) >> 8; 109: *p++ = ((plane & 0x03) << 6) + (high >> 2); 110: *p++ = (high & 0x02) + 0xdc; 111: *p = (UChar )(code & 0xff); 112: return 4; 113: } 114: else { 115: *p++ = (UChar )((code & 0xff00) >> 8); 116: *p++ = (UChar )(code & 0xff); 117: return 2; 118: } 119: } 120: 121: static int 122: utf16be_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end, 123: UChar* lower) 124: { 125: const UChar* p = *pp; 126: 127: if (*p == 0) { 128: p++; 129: *lower++ = '\0'; 130: if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && 131: ONIGENC_IS_MBC_ASCII(p)) || 132: ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && 133: !ONIGENC_IS_MBC_ASCII(p))) { 134: *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p); 135: } 136: else { 137: *lower = *p; 138: } 139: 140: (*pp) += 2; 141: return 2; /* return byte length of converted char to lower */ 142: } 143: else { 144: int len; 145: len = EncLen_UTF16[*p]; 146: if (lower != p) { 147: int i; 148: for (i = 0; i < len; i++) { 149: *lower++ = *p++; 150: } 151: } 152: (*pp) += len; 153: return len; /* return byte length of converted char to lower */ 154: } 155: } 156: 157: static int 158: utf16be_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end) 159: { 160: const UChar* p = *pp; 161: 162: (*pp) += EncLen_UTF16[*p]; 163: 164: if (*p == 0) { 165: int c, v; 166: 167: p++; 168: if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && 169: ONIGENC_IS_MBC_ASCII(p)) || 170: ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && 171: !ONIGENC_IS_MBC_ASCII(p))) { 172: c = *p; 173: v = ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(c, 174: (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); 175: 176: if ((v | ONIGENC_CTYPE_LOWER) != 0) { 177: /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ 178: if (c >= 0xaa && c <= 0xba) 179: return FALSE; 180: else 181: return TRUE; 182: } 183: return (v != 0 ? TRUE : FALSE); 184: } 185: } 186: 187: return FALSE; 188: } 189: 190: static UChar* 191: utf16be_left_adjust_char_head(const UChar* start, const UChar* s) 192: { 193: if (s <= start) return (UChar* )s; 194: 195: if ((s - start) % 2 == 1) { 196: s--; 197: } 198: 199: if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1) 200: s -= 2; 201: 202: return (UChar* )s; 203: } 204: 205: OnigEncodingType OnigEncodingUTF16_BE = { 206: utf16be_mbc_enc_len, 207: "UTF-16BE", /* name */ 208: 4, /* max byte length */ 209: 2, /* min byte length */ 210: (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | 211: ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE ), 212: { 213: (OnigCodePoint )'\\' /* esc */ 214: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ 215: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ 216: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ 217: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ 218: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ 219: }, 220: utf16be_is_mbc_newline, 221: utf16be_mbc_to_code, 222: utf16be_code_to_mbclen, 223: utf16be_code_to_mbc, 224: utf16be_mbc_to_normalize, 225: utf16be_is_mbc_ambiguous, 226: onigenc_iso_8859_1_get_all_pair_ambig_codes, 227: onigenc_ess_tsett_get_all_comp_ambig_codes, 228: onigenc_unicode_is_code_ctype, 229: onigenc_unicode_get_ctype_code_range, 230: utf16be_left_adjust_char_head, 231: onigenc_always_false_is_allowed_reverse_match 232: };