Annotation of embedaddon/php/ext/mbstring/oniguruma/enc/sjis.c, revision 1.1.1.1

1.1       misho       1: /**********************************************************************
                      2:   sjis.c -  Oniguruma (regular expression library)
                      3: **********************************************************************/
                      4: /*-
                      5:  * Copyright (c) 2002-2005  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
                      6:  * All rights reserved.
                      7:  *
                      8:  * Redistribution and use in source and binary forms, with or without
                      9:  * modification, are permitted provided that the following conditions
                     10:  * are met:
                     11:  * 1. Redistributions of source code must retain the above copyright
                     12:  *    notice, this list of conditions and the following disclaimer.
                     13:  * 2. Redistributions in binary form must reproduce the above copyright
                     14:  *    notice, this list of conditions and the following disclaimer in the
                     15:  *    documentation and/or other materials provided with the distribution.
                     16:  *
                     17:  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
                     18:  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
                     19:  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
                     20:  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
                     21:  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
                     22:  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
                     23:  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
                     24:  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
                     25:  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
                     26:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
                     27:  * SUCH DAMAGE.
                     28:  */
                     29: 
                     30: #include "regenc.h"
                     31: 
                     /*
                      * Byte length of a Shift_JIS character, indexed by the value of its
                      * first byte (0x00-0xff).  Entries are 2 for 0x81-0x9f and 0xe0-0xfc
                      * and 1 for every other byte value.
                      */
                     static const int EncLen_SJIS[] = {
                       /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                       /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                       /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                       /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                       /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                       /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                       /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                       /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                       /* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                       /* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                       /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                       /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                       /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                       /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                       /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                       /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1
                     };
                     50: 
                     /*
                      * Lookup table behind SJIS_ISMB_TRAIL, indexed by byte value.
                      * Entries are 1 for 0x40-0x7e and 0x80-0xfc (bytes that may be the
                      * second byte of a two-byte character) and 0 for everything else.
                      */
                     static const char SJIS_CAN_BE_TRAIL_TABLE[256] = {
                       /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                       /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                       /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                       /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                       /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                       /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                       /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                       /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
                       /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                       /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                       /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                       /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                       /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                       /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                       /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                       /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
                     };
                     69: 
                     /*
                      * Byte classification helpers backed by the lookup tables in this file.
                      *
                      *   SJIS_ISMB_FIRST(byte)  non-zero when the EncLen_SJIS entry for `byte`
                      *                          is greater than 1, i.e. `byte` starts a
                      *                          multi-byte character.
                      *   SJIS_ISMB_TRAIL(byte)  the SJIS_CAN_BE_TRAIL_TABLE entry for `byte`
                      *                          (non-zero when it may be a second byte).
                      *
                      * `byte` is used directly as an array index, so it must be in 0..255.
                      * The parameter and each expansion are parenthesized so the macros can
                      * be used with arbitrary argument expressions and inside larger
                      * expressions; the argument is evaluated exactly once.
                      */
                     #define SJIS_ISMB_FIRST(byte)  (EncLen_SJIS[(byte)] > 1)
                     #define SJIS_ISMB_TRAIL(byte)  (SJIS_CAN_BE_TRAIL_TABLE[(byte)])
                     72: 
                     73: static int
                     74: sjis_mbc_enc_len(const UChar* p)
                     75: {
                     76:   return EncLen_SJIS[*p];
                     77: }
                     78: 
                     79: static int
                     80: sjis_code_to_mbclen(OnigCodePoint code)
                     81: {
                     82:   if (code < 256) {
                     83:     if (EncLen_SJIS[(int )code] == 1)
                     84:       return 1;
                     85:     else
                     86:       return 0;
                     87:   }
                     88:   else if (code <= 0xffff) {
                     89:     return 2;
                     90:   }
                     91:   else
                     92:     return 0;
                     93: }
                     94: 
                     95: static OnigCodePoint
                     96: sjis_mbc_to_code(const UChar* p, const UChar* end)
                     97: {
                     98:   int c, i, len;
                     99:   OnigCodePoint n;
                    100: 
                    101:   len = enc_len(ONIG_ENCODING_SJIS, p);
                    102:   c = *p++;
                    103:   n = c;
                    104:   if (len == 1) return n;
                    105: 
                    106:   for (i = 1; i < len; i++) {
                    107:     if (p >= end) break;
                    108:     c = *p++;
                    109:     n <<= 8;  n += c;
                    110:   }
                    111:   return n;
                    112: }
                    113: 
                    114: static int
                    115: sjis_code_to_mbc(OnigCodePoint code, UChar *buf)
                    116: {
                    117:   UChar *p = buf;
                    118: 
                    119:   if ((code & 0xff00) != 0) *p++ = (UChar )(((code >>  8) & 0xff));
                    120:   *p++ = (UChar )(code & 0xff);
                    121: 
                    122: #if 0
                    123:   if (enc_len(ONIG_ENCODING_SJIS, buf) != (p - buf))
                    124:     return REGERR_INVALID_WIDE_CHAR_VALUE;
                    125: #endif
                    126:   return p - buf;
                    127: }
                    128: 
                    129: static int
                    130: sjis_mbc_to_normalize(OnigAmbigType flag,
                    131:                      const UChar** pp, const UChar* end, UChar* lower)
                    132: {
                    133:   const UChar* p = *pp;
                    134: 
                    135:   if (ONIGENC_IS_MBC_ASCII(p)) {
                    136:     if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) {
                    137:       *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
                    138:     }
                    139:     else {
                    140:       *lower = *p;
                    141:     }
                    142: 
                    143:     (*pp)++;
                    144:     return 1;
                    145:   }
                    146:   else {
                    147:     int len = enc_len(ONIG_ENCODING_SJIS, p);
                    148: 
                    149:     if (lower != p) {
                    150:       int i;
                    151:       for (i = 0; i < len; i++) {
                    152:        *lower++ = *p++;
                    153:       }
                    154:     }
                    155:     (*pp) += len;
                    156:     return len; /* return byte length of converted char to lower */
                    157:   }
                    158: }
                    159: 
                    160: static int
                    161: sjis_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end)
                    162: {
                    163:   return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_SJIS, flag, pp, end);
                    164:                                       
                    165: }
                    166: 
                    167: static int
                    168: sjis_is_code_ctype(OnigCodePoint code, unsigned int ctype)
                    169: {
                    170:   if (code < 128)
                    171:     return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
                    172:   else {
                    173:     if ((ctype & (ONIGENC_CTYPE_WORD |
                    174:                   ONIGENC_CTYPE_GRAPH | ONIGENC_CTYPE_PRINT)) != 0) {
                    175:       return (sjis_code_to_mbclen(code) > 1 ? TRUE : FALSE);
                    176:     }
                    177:   }
                    178: 
                    179:   return FALSE;
                    180: }
                    181: 
                    182: static UChar*
                    183: sjis_left_adjust_char_head(const UChar* start, const UChar* s)
                    184: {
                    185:   const UChar *p;
                    186:   int len;
                    187: 
                    188:   if (s <= start) return (UChar* )s;
                    189:   p = s;
                    190: 
                    191:   if (SJIS_ISMB_TRAIL(*p)) {
                    192:     while (p > start) {
                    193:       if (! SJIS_ISMB_FIRST(*--p)) {
                    194:        p++;
                    195:        break;
                    196:       }
                    197:     } 
                    198:   }
                    199:   len = enc_len(ONIG_ENCODING_SJIS, p);
                    200:   if (p + len > s) return (UChar* )p;
                    201:   p += len;
                    202:   return (UChar* )(p + ((s - p) & ~1));
                    203: }
                    204: 
                    205: static int
                    206: sjis_is_allowed_reverse_match(const UChar* s, const UChar* end)
                    207: {
                    208:   const UChar c = *s;
                    209:   return (SJIS_ISMB_TRAIL(c) ? FALSE : TRUE);
                    210: }
                    211: 
                    212: OnigEncodingType OnigEncodingSJIS = {
                    213:   sjis_mbc_enc_len,
                    214:   "Shift_JIS",   /* name */
                    215:   2,             /* max byte length */
                    216:   1,             /* min byte length */
                    217:   ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE,
                    218:   {
                    219:       (OnigCodePoint )'\\'                       /* esc */
                    220:     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
                    221:     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
                    222:     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
                    223:     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
                    224:     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
                    225:   },
                    226:   onigenc_is_mbc_newline_0x0a,
                    227:   sjis_mbc_to_code,
                    228:   sjis_code_to_mbclen,
                    229:   sjis_code_to_mbc,
                    230:   sjis_mbc_to_normalize,
                    231:   sjis_is_mbc_ambiguous,
                    232:   onigenc_ascii_get_all_pair_ambig_codes,
                    233:   onigenc_nothing_get_all_comp_ambig_codes,
                    234:   sjis_is_code_ctype,
                    235:   onigenc_not_support_get_ctype_code_range,
                    236:   sjis_left_adjust_char_head,
                    237:   sjis_is_allowed_reverse_match
                    238: };

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>