/* Return to utf32_le.c CVS log | Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / php / ext / mbstring / oniguruma / enc | */
1.1 misho 1: /********************************************************************** 2: utf32_le.c - Oniguruma (regular expression library) 3: **********************************************************************/ 4: /*- 5: * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 6: * All rights reserved. 7: * 8: * Redistribution and use in source and binary forms, with or without 9: * modification, are permitted provided that the following conditions 10: * are met: 11: * 1. Redistributions of source code must retain the above copyright 12: * notice, this list of conditions and the following disclaimer. 13: * 2. Redistributions in binary form must reproduce the above copyright 14: * notice, this list of conditions and the following disclaimer in the 15: * documentation and/or other materials provided with the distribution. 16: * 17: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27: * SUCH DAMAGE. 
28: */ 29: 30: #include "regenc.h" 31: 32: static int 33: utf32le_mbc_enc_len(const UChar* p) 34: { 35: return 4; 36: } 37: 38: static int 39: utf32le_is_mbc_newline(const UChar* p, const UChar* end) 40: { 41: if (p + 3 < end) { 42: if (*p == 0x0a && *(p+1) == 0 && *(p+2) == 0 && *(p+3) == 0) 43: return 1; 44: #ifdef USE_UNICODE_ALL_LINE_TERMINATORS 45: if ((*p == 0x0d || *p == 0x85) && *(p+1) == 0x00 46: && (p+2) == 0x00 && *(p+3) == 0x00) 47: return 1; 48: if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28) 49: && *(p+2) == 0x00 && *(p+3) == 0x00) 50: return 1; 51: #endif 52: } 53: return 0; 54: } 55: 56: static OnigCodePoint 57: utf32le_mbc_to_code(const UChar* p, const UChar* end) 58: { 59: return (OnigCodePoint )(((p[3] * 256 + p[2]) * 256 + p[1]) * 256 + p[0]); 60: } 61: 62: static int 63: utf32le_code_to_mbclen(OnigCodePoint code) 64: { 65: return 4; 66: } 67: 68: static int 69: utf32le_code_to_mbc(OnigCodePoint code, UChar *buf) 70: { 71: UChar* p = buf; 72: 73: *p++ = (UChar ) (code & 0xff); 74: *p++ = (UChar )((code & 0xff00) >> 8); 75: *p++ = (UChar )((code & 0xff0000) >>16); 76: *p++ = (UChar )((code & 0xff000000) >>24); 77: return 4; 78: } 79: 80: static int 81: utf32le_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end, 82: UChar* lower) 83: { 84: const UChar* p = *pp; 85: 86: if (*(p+1) == 0 && *(p+2) == 0 && *(p+3) == 0) { 87: if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && 88: ONIGENC_IS_MBC_ASCII(p)) || 89: ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && 90: !ONIGENC_IS_MBC_ASCII(p))) { 91: *lower++ = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p); 92: } 93: else { 94: *lower++ = *p; 95: } 96: *lower++ = '\0'; 97: *lower++ = '\0'; 98: *lower = '\0'; 99: 100: (*pp) += 4; 101: return 4; /* return byte length of converted char to lower */ 102: } 103: else { 104: int len = 4; 105: if (lower != p) { 106: int i; 107: for (i = 0; i < len; i++) { 108: *lower++ = *p++; 109: } 110: } 111: (*pp) += len; 112: return len; /* return 
byte length of converted char to lower */ 113: } 114: } 115: 116: static int 117: utf32le_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end) 118: { 119: const UChar* p = *pp; 120: 121: (*pp) += 4; 122: 123: if (*(p+1) == 0 && *(p+2) == 0 && *(p+3) == 0) { 124: int c, v; 125: 126: if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && 127: ONIGENC_IS_MBC_ASCII(p)) || 128: ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 && 129: !ONIGENC_IS_MBC_ASCII(p))) { 130: c = *p; 131: v = ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(c, 132: (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)); 133: if ((v | ONIGENC_CTYPE_LOWER) != 0) { 134: /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ 135: if (c >= 0xaa && c <= 0xba) 136: return FALSE; 137: else 138: return TRUE; 139: } 140: return (v != 0 ? TRUE : FALSE); 141: } 142: } 143: 144: return FALSE; 145: } 146: 147: static UChar* 148: utf32le_left_adjust_char_head(const UChar* start, const UChar* s) 149: { 150: int rem; 151: 152: if (s <= start) return (UChar* )s; 153: 154: rem = (s - start) % 4; 155: return (UChar* )(s - rem); 156: } 157: 158: OnigEncodingType OnigEncodingUTF32_LE = { 159: utf32le_mbc_enc_len, 160: "UTF-32LE", /* name */ 161: 4, /* max byte length */ 162: 4, /* min byte length */ 163: (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | 164: ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE ), 165: { 166: (OnigCodePoint )'\\' /* esc */ 167: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ 168: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ 169: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' 
*/ 170: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ 171: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ 172: }, 173: utf32le_is_mbc_newline, 174: utf32le_mbc_to_code, 175: utf32le_code_to_mbclen, 176: utf32le_code_to_mbc, 177: utf32le_mbc_to_normalize, 178: utf32le_is_mbc_ambiguous, 179: onigenc_iso_8859_1_get_all_pair_ambig_codes, 180: onigenc_ess_tsett_get_all_comp_ambig_codes, 181: onigenc_unicode_is_code_ctype, 182: onigenc_unicode_get_ctype_code_range, 183: utf32le_left_adjust_char_head, 184: onigenc_always_false_is_allowed_reverse_match 185: };