Annotation of embedaddon/php/ext/mbstring/oniguruma/enc/utf16_le.c, revision 1.1
1.1 ! misho 1: /**********************************************************************
! 2: utf16_le.c - Oniguruma (regular expression library)
! 3: **********************************************************************/
! 4: /*-
! 5: * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
! 6: * All rights reserved.
! 7: *
! 8: * Redistribution and use in source and binary forms, with or without
! 9: * modification, are permitted provided that the following conditions
! 10: * are met:
! 11: * 1. Redistributions of source code must retain the above copyright
! 12: * notice, this list of conditions and the following disclaimer.
! 13: * 2. Redistributions in binary form must reproduce the above copyright
! 14: * notice, this list of conditions and the following disclaimer in the
! 15: * documentation and/or other materials provided with the distribution.
! 16: *
! 17: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
! 18: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
! 19: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
! 20: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
! 21: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
! 22: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
! 23: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
! 24: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
! 25: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
! 26: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
! 27: * SUCH DAMAGE.
! 28: */
! 29:
! 30: #include "regenc.h"
! 31:
/*
 * Surrogate tests on the HIGH-ORDER byte of a 16-bit code unit (in this
 * little-endian encoding that is the second byte of the unit).
 *   0xd8..0xdb : first (lead) unit of a surrogate pair
 *   0xdc..0xdf : second (trail) unit of a surrogate pair
 * The argument is fully parenthesized so that expressions such as
 * `x & 0xff` are classified correctly.  It is still evaluated twice, so
 * do not pass an expression with side effects.
 */
#define UTF16_IS_SURROGATE_FIRST(c)    ((c) >= 0xd8 && (c) <= 0xdb)
#define UTF16_IS_SURROGATE_SECOND(c)   ((c) >= 0xdc && (c) <= 0xdf)
! 34:
/*
 * Encoded character length in bytes, indexed by the high-order byte of the
 * first 16-bit code unit.  Entries 0xd8..0xdb (lead surrogates) are 4;
 * every other entry is 2.
 */
static const int EncLen_UTF16[] = {
  2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,   /* 0x00 - 0x0f */
  2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,   /* 0x10 - 0x1f */
  2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,   /* 0x20 - 0x2f */
  2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,   /* 0x30 - 0x3f */
  2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,   /* 0x40 - 0x4f */
  2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,   /* 0x50 - 0x5f */
  2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,   /* 0x60 - 0x6f */
  2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,   /* 0x70 - 0x7f */
  2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,   /* 0x80 - 0x8f */
  2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,   /* 0x90 - 0x9f */
  2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,   /* 0xa0 - 0xaf */
  2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,   /* 0xb0 - 0xbf */
  2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,   /* 0xc0 - 0xcf */
  2,2,2,2, 2,2,2,2, 4,4,4,4, 2,2,2,2,   /* 0xd0 - 0xdf */
  2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,   /* 0xe0 - 0xef */
  2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2    /* 0xf0 - 0xff */
};
! 53:
! 54: static int
! 55: utf16le_code_to_mbclen(OnigCodePoint code)
! 56: {
! 57: return (code > 0xffff ? 4 : 2);
! 58: }
! 59:
! 60: static int
! 61: utf16le_mbc_enc_len(const UChar* p)
! 62: {
! 63: return EncLen_UTF16[*(p+1)];
! 64: }
! 65:
! 66: static int
! 67: utf16le_is_mbc_newline(const UChar* p, const UChar* end)
! 68: {
! 69: if (p + 1 < end) {
! 70: if (*p == 0x0a && *(p+1) == 0x00)
! 71: return 1;
! 72: #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
! 73: if ((*p == 0x0d || *p == 0x85) && *(p+1) == 0x00)
! 74: return 1;
! 75: if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28))
! 76: return 1;
! 77: #endif
! 78: }
! 79: return 0;
! 80: }
! 81:
! 82: static OnigCodePoint
! 83: utf16le_mbc_to_code(const UChar* p, const UChar* end)
! 84: {
! 85: OnigCodePoint code;
! 86: UChar c0 = *p;
! 87: UChar c1 = *(p+1);
! 88:
! 89: if (UTF16_IS_SURROGATE_FIRST(c1)) {
! 90: code = ((((c1 - 0xd8) << 2) + ((c0 & 0xc0) >> 6) + 1) << 16)
! 91: + ((((c0 & 0x3f) << 2) + (p[3] - 0xdc)) << 8)
! 92: + p[2];
! 93: }
! 94: else {
! 95: code = c1 * 256 + p[0];
! 96: }
! 97: return code;
! 98: }
! 99:
! 100: static int
! 101: utf16le_code_to_mbc(OnigCodePoint code, UChar *buf)
! 102: {
! 103: UChar* p = buf;
! 104:
! 105: if (code > 0xffff) {
! 106: unsigned int plane, high;
! 107:
! 108: plane = code >> 16;
! 109: high = (code & 0xff00) >> 8;
! 110:
! 111: *p++ = ((plane & 0x03) << 6) + (high >> 2);
! 112: *p++ = (plane >> 2) + 0xd8;
! 113: *p++ = (UChar )(code & 0xff);
! 114: *p = (high & 0x02) + 0xdc;
! 115: return 4;
! 116: }
! 117: else {
! 118: *p++ = (UChar )(code & 0xff);
! 119: *p++ = (UChar )((code & 0xff00) >> 8);
! 120: return 2;
! 121: }
! 122: }
! 123:
! 124: static int
! 125: utf16le_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end,
! 126: UChar* lower)
! 127: {
! 128: const UChar* p = *pp;
! 129:
! 130: if (*(p+1) == 0) {
! 131: *(lower+1) = '\0';
! 132: if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 &&
! 133: ONIGENC_IS_MBC_ASCII(p)) ||
! 134: ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 &&
! 135: !ONIGENC_IS_MBC_ASCII(p))) {
! 136: *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p);
! 137: }
! 138: else {
! 139: *lower = *p;
! 140: }
! 141: (*pp) += 2;
! 142: return 2; /* return byte length of converted char to lower */
! 143: }
! 144: else {
! 145: int len = EncLen_UTF16[*(p+1)];
! 146: if (lower != p) {
! 147: int i;
! 148: for (i = 0; i < len; i++) {
! 149: *lower++ = *p++;
! 150: }
! 151: }
! 152: (*pp) += len;
! 153: return len; /* return byte length of converted char to lower */
! 154: }
! 155: }
! 156:
! 157: static int
! 158: utf16le_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end)
! 159: {
! 160: const UChar* p = *pp;
! 161:
! 162: (*pp) += EncLen_UTF16[*(p+1)];
! 163:
! 164: if (*(p+1) == 0) {
! 165: int c, v;
! 166:
! 167: if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 &&
! 168: ONIGENC_IS_MBC_ASCII(p)) ||
! 169: ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 &&
! 170: !ONIGENC_IS_MBC_ASCII(p))) {
! 171: c = *p;
! 172: v = ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(c,
! 173: (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER));
! 174: if ((v | ONIGENC_CTYPE_LOWER) != 0) {
! 175: /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
! 176: if (c >= 0xaa && c <= 0xba)
! 177: return FALSE;
! 178: else
! 179: return TRUE;
! 180: }
! 181: return (v != 0 ? TRUE : FALSE);
! 182: }
! 183: }
! 184:
! 185: return FALSE;
! 186: }
! 187:
! 188: static UChar*
! 189: utf16le_left_adjust_char_head(const UChar* start, const UChar* s)
! 190: {
! 191: if (s <= start) return (UChar* )s;
! 192:
! 193: if ((s - start) % 2 == 1) {
! 194: s--;
! 195: }
! 196:
! 197: if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1)
! 198: s -= 2;
! 199:
! 200: return (UChar* )s;
! 201: }
! 202:
! 203: OnigEncodingType OnigEncodingUTF16_LE = {
! 204: utf16le_mbc_enc_len,
! 205: "UTF-16LE", /* name */
! 206: 4, /* max byte length */
! 207: 2, /* min byte length */
! 208: (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE |
! 209: ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE ),
! 210: {
! 211: (OnigCodePoint )'\\' /* esc */
! 212: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
! 213: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
! 214: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
! 215: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
! 216: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
! 217: },
! 218: utf16le_is_mbc_newline,
! 219: utf16le_mbc_to_code,
! 220: utf16le_code_to_mbclen,
! 221: utf16le_code_to_mbc,
! 222: utf16le_mbc_to_normalize,
! 223: utf16le_is_mbc_ambiguous,
! 224: onigenc_iso_8859_1_get_all_pair_ambig_codes,
! 225: onigenc_ess_tsett_get_all_comp_ambig_codes,
! 226: onigenc_unicode_is_code_ctype,
! 227: onigenc_unicode_get_ctype_code_range,
! 228: utf16le_left_adjust_char_head,
! 229: onigenc_always_false_is_allowed_reverse_match
! 230: };
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>