Annotation of embedaddon/php/ext/mbstring/oniguruma/enc/euc_jp.c, revision 1.1.1.1

1.1       misho       1: /**********************************************************************
                      2:   euc_jp.c -  Oniguruma (regular expression library)
                      3: **********************************************************************/
                      4: /*-
                      5:  * Copyright (c) 2002-2005  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
                      6:  * All rights reserved.
                      7:  *
                      8:  * Redistribution and use in source and binary forms, with or without
                      9:  * modification, are permitted provided that the following conditions
                     10:  * are met:
                     11:  * 1. Redistributions of source code must retain the above copyright
                     12:  *    notice, this list of conditions and the following disclaimer.
                     13:  * 2. Redistributions in binary form must reproduce the above copyright
                     14:  *    notice, this list of conditions and the following disclaimer in the
                     15:  *    documentation and/or other materials provided with the distribution.
                     16:  *
                     17:  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
                     18:  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
                     19:  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
                     20:  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
                     21:  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
                     22:  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
                     23:  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
                     24:  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
                     25:  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
                     26:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
                     27:  * SUCH DAMAGE.
                     28:  */
                     29: 
                     30: #include "regenc.h"
                     31: 
                     /*
                      * Evaluates to true when byte c lies OUTSIDE the range 0xa1..0xfe.
                      * For c < 0xa1 the subtraction wraps to a large value once cast to UChar
                      * (assumes UChar is an unsigned 8-bit type -- TODO confirm in regenc.h),
                      * so a single comparison covers both ends of the range.
                      * Note: bytes 0xa1..0xfe yield false here even though EncLen_EUCJP gives
                      * them length 2 as a first byte, so the macro really means "cannot be a
                      * 0xa1..0xfe byte", not "is the first byte of a character".
                      */
                     32: #define eucjp_islead(c)    ((UChar )((c) - 0xa1) > 0xfe - 0xa1)
                     33: 
                     /*
                      * Character byte length indexed by the value of the first byte
                      * (256 entries, 16 per row).  Entries are 1 except:
                      *   0x8e         -> 2
                      *   0x8f         -> 3
                      *   0xa1 .. 0xfe -> 2
                      */
                     34: static const int EncLen_EUCJP[] = {
                     35:   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                     36:   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                     37:   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                     38:   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                     39:   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                     40:   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                     41:   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                     42:   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                     43:   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, /* 0x80-0x8f: 0x8e -> 2, 0x8f -> 3 */
                     44:   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                     45:   1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xa0-0xaf: 0xa0 stays 1 */
                     46:   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                     47:   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                     48:   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                     49:   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                     50:   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1  /* 0xf0-0xff: 0xff stays 1 */
                     51: };
                     52: 
                     /*
                      * Return the byte length of the character whose first byte is *p,
                      * as listed in EncLen_EUCJP.  Only the single byte at p is read.
                      */
                     53: static int
                     54: eucjp_mbc_enc_len(const UChar* p)
                     55: {
                     56:   return EncLen_EUCJP[*p];
                     57: }
                     58: 
                     /*
                      * Decode the character starting at p into a single code point.
                      *
                      * The first byte becomes the initial value; when the length reported by
                      * enc_len() is greater than 1, every following byte is shifted in as the
                      * next lower 8 bits (most significant byte first).
                      *
                      * p:   first byte of the character.
                      * end: bound for the bytes after the first one.
                      *
                      * Returns the assembled value.  If end is reached before all bytes of the
                      * character were consumed, the value built so far is returned.
                      */
                     59: static OnigCodePoint
                     60: eucjp_mbc_to_code(const UChar* p, const UChar* end)
                     61: {
                     62:   int c, i, len;
                     63:   OnigCodePoint n;
                     64: 
                     65:   len = enc_len(ONIG_ENCODING_EUC_JP, p);
                     66:   n = (OnigCodePoint )*p++; /* first byte is read without a check against end */
                     67:   if (len == 1) return n;
                     68: 
                     69:   for (i = 1; i < len; i++) {
                     70:     if (p >= end) break; /* truncated character: keep the partial value */
                     71:     c = *p++;
                     72:     n <<= 8;  n += c;
                     73:   }
                     74:   return n;
                     75: }
                     76: 
                     /*
                      * Return how many bytes the code point occupies when encoded:
                      *   1 when ONIGENC_IS_CODE_ASCII(code) holds,
                      *   3 when any of bits 16..23 is set,
                      *   2 when any of bits 8..15 is set,
                      *   0 otherwise (a non-ASCII value with nothing set in bits 8..23).
                      * Bits above 23 are not examined.
                      */
                     77: static int
                     78: eucjp_code_to_mbclen(OnigCodePoint code)
                     79: {
                     80:   if (ONIGENC_IS_CODE_ASCII(code)) return 1;
                     81:   else if ((code & 0xff0000) != 0) return 3;
                     82:   else if ((code &   0xff00) != 0) return 2;
                     83:   else return 0;
                     84: }
                     85: 
                     /*
                      * Disabled code: everything between #if 0 and #endif is never compiled.
                      * As written, the function returns bits 16..23 of code when they are
                      * non-zero, otherwise bits 8..15 when those are non-zero, otherwise code
                      * itself converted to int.
                      */
                     86: #if 0
                     87: static int
                     88: eucjp_code_to_mbc_first(OnigCodePoint code)
                     89: {
                     90:   int first;
                     91: 
                     92:   if ((code & 0xff0000) != 0) {
                     93:     first = (code >> 16) & 0xff;
                     94:   }
                     95:   else if ((code & 0xff00) != 0) {
                     96:     first = (code >> 8) & 0xff;
                     97:   }
                     98:   else {
                     99:     return (int )code;
                    100:   }
                    101:   return first;
                    102: }
                    103: #endif
                    104: 
                    /*
                     * Encode a code point into buf, most significant byte first.
                     *
                     * Bits 16..23 and bits 8..15 are each written only when non-zero; the low
                     * byte is always written, so between 1 and 3 bytes are stored.
                     *
                     * code: value to encode.
                     * buf:  output buffer; up to 3 bytes are written, the caller must
                     *       provide the space.
                     *
                     * Returns the number of bytes written, or
                     * ONIGENCERR_INVALID_WIDE_CHAR_VALUE when that count differs from the
                     * length enc_len() reports for the first byte written.  buf has already
                     * been modified when the error is returned.
                     */
                    105: static int
                    106: eucjp_code_to_mbc(OnigCodePoint code, UChar *buf)
                    107: {
                    108:   UChar *p = buf;
                    109: 
                    110:   if ((code & 0xff0000) != 0) *p++ = (UChar )(((code >> 16) & 0xff));
                    111:   if ((code &   0xff00) != 0) *p++ = (UChar )(((code >>  8) & 0xff));
                    112:   *p++ = (UChar )(code & 0xff);
                    113: 
                    114: #if 1
                          /* consistency check: the first byte must announce exactly the
                             number of bytes that were just produced */
                    115:   if (enc_len(ONIG_ENCODING_EUC_JP, buf) != (p - buf))
                    116:     return ONIGENCERR_INVALID_WIDE_CHAR_VALUE;
                    117: #endif  
                    118:   return p - buf;
                    119: }
                    120: 
                    /*
                     * Write the normalized form of the character at *pp into lower and
                     * advance *pp past that character.
                     *
                     * A character for which ONIGENC_IS_MBC_ASCII() holds is lower-cased via
                     * ONIGENC_ASCII_CODE_TO_LOWER_CASE when flag contains
                     * ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE, and copied unchanged otherwise.
                     * Any other character is copied byte for byte.
                     *
                     * flag:  ambiguity options; only ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE is
                     *        tested here.
                     * pp:    in/out pointer to the current character.
                     * end:   not consulted by this function; the multi-byte copy reads
                     *        enc_len() bytes without comparing against it.
                     * lower: output buffer; may be the same address as *pp.
                     *
                     * Returns the number of bytes written to lower (equal to the number of
                     * bytes consumed from *pp).
                     */
                    121: static int
                    122: eucjp_mbc_to_normalize(OnigAmbigType flag,
                    123:                       const UChar** pp, const UChar* end, UChar* lower)
                    124: {
                    125:   int len;
                    126:   const UChar* p = *pp;
                    127: 
                    128:   if (ONIGENC_IS_MBC_ASCII(p)) {
                    129:     if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) {
                    130:       *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
                    131:     }
                    132:     else {
                    133:       *lower = *p;
                    134:     }
                    135: 
                    136:     (*pp)++;
                    137:     return 1;
                    138:   }
                    139:   else {
                    140:     len = enc_len(ONIG_ENCODING_EUC_JP, p);
                    141:     if (lower != p) { /* skip the copy when output and input are the same buffer */
                    142:       int i;
                    143:       for (i = 0; i < len; i++) {
                    144:        *lower++ = *p++;
                    145:       }
                    146:     }
                    147:     (*pp) += len;
                    148:     return len; /* return byte length of converted char to lower */
                    149:   }
                    150: }
                    151: 
                    /*
                     * Thin wrapper: forwards flag, pp and end to
                     * onigenc_mbn_is_mbc_ambiguous() with ONIG_ENCODING_EUC_JP as the encoding
                     * and returns its result unchanged.  Whether *pp is advanced is decided
                     * by that helper (not visible in this file).
                     */
                    152: static int
                    153: eucjp_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end)
                    154: {
                    155:   return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_EUC_JP, flag, pp, end);
                    156: }
                    157: 
                    /*
                     * Test whether a code point belongs to the character type(s) in ctype.
                     *
                     * Values below 128 are answered by ONIGENC_IS_ASCII_CODE_CTYPE.
                     * For larger values only the WORD, GRAPH and PRINT types can match, and
                     * they match exactly when eucjp_code_to_mbclen() reports more than one
                     * byte for the code point.  Everything else yields FALSE.
                     */
                    158: static int
                    159: eucjp_is_code_ctype(OnigCodePoint code, unsigned int ctype)
                    160: {
                    161:   if (code < 128)
                    162:     return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
                    163:   else {
                    164:     if ((ctype & (ONIGENC_CTYPE_WORD |
                    165:                   ONIGENC_CTYPE_GRAPH | ONIGENC_CTYPE_PRINT)) != 0) {
                    166:       return (eucjp_code_to_mbclen(code) > 1 ? TRUE : FALSE);
                    167:     }
                    168:   }
                    169: 
                    170:   return FALSE;
                    171: }
                    172: 
                    /*
                     * Given a position s inside the buffer beginning at start, return the
                     * address of the first byte of the character that contains s.
                     *
                     * start: beginning of the buffer; the backward scan never goes below it.
                     * s:     position to adjust; *s is read, so it must be dereferenceable.
                     *
                     * Returns s itself when s <= start, otherwise a pointer <= s.
                     */
                    173: static UChar*
                    174: eucjp_left_adjust_char_head(const UChar* start, const UChar* s)
                    175: {
                    176:   /* In this encoding
                    177:      mb-trail bytes don't mix with single bytes.
                    178:   */
                    179:   const UChar *p;
                    180:   int len;
                    181: 
                    182:   if (s <= start) return (UChar* )s;
                    183:   p = s;
                    184: 
                          /* walk back over bytes in 0xa1..0xfe until a byte outside that
                             range (or the start of the buffer) is found */
                    185:   while (!eucjp_islead(*p) && p > start) p--;
                    186:   len = enc_len(ONIG_ENCODING_EUC_JP, p);
                    187:   if (p + len > s) return (UChar* )p; /* the character at p covers s */
                    188:   p += len;
                          /* the bytes from p up to s were all skipped by the scan above, i.e.
                             they are in 0xa1..0xfe; they are treated as 2-byte units, so the
                             distance to s is rounded down to an even number */
                    189:   return (UChar* )(p + ((s - p) & ~1));
                    190: }
                    191: 
                    /*
                     * Return TRUE when the byte at s is <= 0x7e, or equals 0x8e or 0x8f;
                     * FALSE for every other value.  Only *s is examined; end is unused.
                     */
                    192: static int
                    193: eucjp_is_allowed_reverse_match(const UChar* s, const UChar* end)
                    194: {
                    195:   const UChar c = *s;
                    196:   if (c <= 0x7e || c == 0x8e || c == 0x8f)
                    197:     return TRUE;
                    198:   else
                    199:     return FALSE;
                    200: }
                    201: 
                    /*
                     * Encoding descriptor for EUC-JP, the only symbol in this file with
                     * external linkage.  It combines the static eucjp_* functions of this
                     * file with shared onigenc_* helpers.
                     * NOTE(review): the initializer is positional, so the entry order must
                     * match the member order of OnigEncodingType, which is declared outside
                     * this file -- confirm against the header before reordering anything.
                     */
                    202: OnigEncodingType OnigEncodingEUC_JP = {
                    203:   eucjp_mbc_enc_len,
                    204:   "EUC-JP",   /* name */
                    205:   3,          /* max enc length */
                    206:   1,          /* min enc length */
                    207:   ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE,
                    208:   {
                    209:       (OnigCodePoint )'\\'                       /* esc */
                    210:     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
                    211:     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
                    212:     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
                    213:     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
                    214:     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
                    215:   },
                    216:   onigenc_is_mbc_newline_0x0a,
                    217:   eucjp_mbc_to_code,
                    218:   eucjp_code_to_mbclen,
                    219:   eucjp_code_to_mbc,
                    220:   eucjp_mbc_to_normalize,
                    221:   eucjp_is_mbc_ambiguous,
                    222:   onigenc_ascii_get_all_pair_ambig_codes,
                    223:   onigenc_nothing_get_all_comp_ambig_codes,
                    224:   eucjp_is_code_ctype,
                    225:   onigenc_not_support_get_ctype_code_range,
                    226:   eucjp_left_adjust_char_head,
                    227:   eucjp_is_allowed_reverse_match
                    228: };

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>