Return to euc_kr.c CVS log | Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / php / ext / mbstring / oniguruma / enc |
1.1 misho 1: /********************************************************************** 2: euc_kr.c - Oniguruma (regular expression library) 3: **********************************************************************/ 4: /*- 5: * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 6: * All rights reserved. 7: * 8: * Redistribution and use in source and binary forms, with or without 9: * modification, are permitted provided that the following conditions 10: * are met: 11: * 1. Redistributions of source code must retain the above copyright 12: * notice, this list of conditions and the following disclaimer. 13: * 2. Redistributions in binary form must reproduce the above copyright 14: * notice, this list of conditions and the following disclaimer in the 15: * documentation and/or other materials provided with the distribution. 16: * 17: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27: * SUCH DAMAGE. 28: */ 29: 30: #include "regenc.h" 31: 32: static const int EncLen_EUCKR[] = { 33: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 34: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 35: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 36: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 37: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 38: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 39: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 40: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 41: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 42: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 43: 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 44: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 45: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 46: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 47: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 48: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 49: }; 50: 51: static int 52: euckr_mbc_enc_len(const UChar* p) 53: { 54: return EncLen_EUCKR[*p]; 55: } 56: 57: static OnigCodePoint 58: euckr_mbc_to_code(const UChar* p, const UChar* end) 59: { 60: return onigenc_mbn_mbc_to_code(ONIG_ENCODING_EUC_KR, p, end); 61: } 62: 63: static int 64: euckr_code_to_mbc(OnigCodePoint code, UChar *buf) 65: { 66: return onigenc_mb2_code_to_mbc(ONIG_ENCODING_EUC_KR, code, buf); 67: } 68: 69: static int 70: euckr_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end, 71: UChar* lower) 72: { 73: return onigenc_mbn_mbc_to_normalize(ONIG_ENCODING_EUC_KR, flag, 74: pp, end, lower); 75: } 76: 77: static int 78: euckr_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end) 79: { 80: return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_EUC_KR, flag, pp, end); 81: } 82: 83: static int 84: euckr_is_code_ctype(OnigCodePoint code, unsigned int ctype) 85: { 86: return onigenc_mb2_is_code_ctype(ONIG_ENCODING_EUC_KR, code, ctype); 87: } 88: 89: #define euckr_islead(c) ((c) < 0xa1 || (c) == 0xff) 90: 91: static UChar* 92: euckr_left_adjust_char_head(const UChar* start, const UChar* s) 93: { 94: /* Assumed in this encoding, 95: mb-trail bytes don't mix with single bytes. 96: */ 97: const UChar *p; 98: int len; 99: 100: if (s <= start) return (UChar* )s; 101: p = s; 102: 103: while (!euckr_islead(*p) && p > start) p--; 104: len = enc_len(ONIG_ENCODING_EUC_KR, p); 105: if (p + len > s) return (UChar* )p; 106: p += len; 107: return (UChar* )(p + ((s - p) & ~1)); 108: } 109: 110: static int 111: euckr_is_allowed_reverse_match(const UChar* s, const UChar* end) 112: { 113: const UChar c = *s; 114: if (c <= 0x7e) return TRUE; 115: else return FALSE; 116: } 117: 118: OnigEncodingType OnigEncodingEUC_KR = { 119: euckr_mbc_enc_len, 120: "EUC-KR", /* name */ 121: 2, /* max enc length */ 122: 1, /* min enc length */ 123: ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE, 124: { 125: (OnigCodePoint )'\\' /* esc */ 126: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ 127: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ 128: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ 129: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ 130: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ 131: }, 132: onigenc_is_mbc_newline_0x0a, 133: euckr_mbc_to_code, 134: onigenc_mb2_code_to_mbclen, 135: euckr_code_to_mbc, 136: euckr_mbc_to_normalize, 137: euckr_is_mbc_ambiguous, 138: onigenc_ascii_get_all_pair_ambig_codes, 139: onigenc_nothing_get_all_comp_ambig_codes, 140: euckr_is_code_ctype, 141: onigenc_not_support_get_ctype_code_range, 142: euckr_left_adjust_char_head, 143: euckr_is_allowed_reverse_match 144: }; 145: 146: /* Same with OnigEncodingEUC_KR except the name */ 147: OnigEncodingType OnigEncodingEUC_CN = { 148: euckr_mbc_enc_len, 149: "EUC-CN", /* name */ 150: 2, /* max enc length */ 151: 1, /* min enc length */ 152: ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE, 153: { 154: (OnigCodePoint )'\\' /* esc */ 155: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ 156: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ 157: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ 158: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ 159: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ 160: }, 161: onigenc_is_mbc_newline_0x0a, 162: euckr_mbc_to_code, 163: onigenc_mb2_code_to_mbclen, 164: euckr_code_to_mbc, 165: euckr_mbc_to_normalize, 166: euckr_is_mbc_ambiguous, 167: onigenc_ascii_get_all_pair_ambig_codes, 168: onigenc_nothing_get_all_comp_ambig_codes, 169: euckr_is_code_ctype, 170: onigenc_not_support_get_ctype_code_range, 171: euckr_left_adjust_char_head, 172: euckr_is_allowed_reverse_match 173: };