Annotation of embedaddon/php/ext/mbstring/oniguruma/enc/utf32_be.c, revision 1.1.1.1

1.1       misho       1: /**********************************************************************
                      2:   utf32_be.c -  Oniguruma (regular expression library)
                      3: **********************************************************************/
                      4: /*-
                      5:  * Copyright (c) 2002-2006  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
                      6:  * All rights reserved.
                      7:  *
                      8:  * Redistribution and use in source and binary forms, with or without
                      9:  * modification, are permitted provided that the following conditions
                     10:  * are met:
                     11:  * 1. Redistributions of source code must retain the above copyright
                     12:  *    notice, this list of conditions and the following disclaimer.
                     13:  * 2. Redistributions in binary form must reproduce the above copyright
                     14:  *    notice, this list of conditions and the following disclaimer in the
                     15:  *    documentation and/or other materials provided with the distribution.
                     16:  *
                     17:  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
                     18:  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
                     19:  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
                     20:  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
                     21:  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
                     22:  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
                     23:  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
                     24:  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
                     25:  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
                     26:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
                     27:  * SUCH DAMAGE.
                     28:  */
                     29: 
                     30: #include "regenc.h"
                     31: 
                     32: static int
                     33: utf32be_mbc_enc_len(const UChar* p)
                     34: {
                     35:   return 4;
                     36: }
                     37: 
                     38: static int
                     39: utf32be_is_mbc_newline(const UChar* p, const UChar* end)
                     40: {
                     41:   if (p + 3 < end) {
                     42:     if (*(p+3) == 0x0a && *(p+2) == 0 && *(p+1) == 0 && *p == 0)
                     43:       return 1;
                     44: #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
                     45:     if ((*(p+3) == 0x0d || *(p+3) == 0x85)
                     46:        && *(p+2) == 0 && *(p+1) == 0 && *p == 0x00)
                     47:       return 1;
                     48:     if (*(p+2) == 0x20 && (*(p+3) == 0x29 || *(p+3) == 0x28)
                     49:        && *(p+1) == 0 && *p == 0)
                     50:       return 1;
                     51: #endif
                     52:   }
                     53:   return 0;
                     54: }
                     55: 
                     56: static OnigCodePoint
                     57: utf32be_mbc_to_code(const UChar* p, const UChar* end)
                     58: {
                     59:   return (OnigCodePoint )(((p[0] * 256 + p[1]) * 256 + p[2]) * 256 + p[3]);
                     60: }
                     61: 
                     62: static int
                     63: utf32be_code_to_mbclen(OnigCodePoint code)
                     64: {
                     65:   return 4;
                     66: }
                     67: 
                     68: static int
                     69: utf32be_code_to_mbc(OnigCodePoint code, UChar *buf)
                     70: {
                     71:   UChar* p = buf;
                     72: 
                     73:   *p++ = (UChar )((code & 0xff000000) >>24);
                     74:   *p++ = (UChar )((code & 0xff0000)   >>16);
                     75:   *p++ = (UChar )((code & 0xff00)     >> 8);
                     76:   *p++ = (UChar ) (code & 0xff);
                     77:   return 4;
                     78: }
                     79: 
                     80: static int
                     81: utf32be_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end,
                     82:                          UChar* lower)
                     83: {
                     84:   const UChar* p = *pp;
                     85: 
                     86:   if (*(p+2) == 0 && *(p+1) == 0 && *p == 0) {
                     87:     p += 3;
                     88:     *lower++ = '\0';
                     89:     *lower++ = '\0';
                     90:     *lower++ = '\0';
                     91:     if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 &&
                     92:         ONIGENC_IS_MBC_ASCII(p)) ||
                     93:        ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 &&
                     94:         !ONIGENC_IS_MBC_ASCII(p))) {
                     95:       *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p);
                     96:     }
                     97:     else {
                     98:       *lower = *p;
                     99:     }
                    100: 
                    101:     (*pp) += 4;
                    102:     return 4;  /* return byte length of converted char to lower */
                    103:   }
                    104:   else {
                    105:     int len = 4;
                    106:     if (lower != p) {
                    107:       int i;
                    108:       for (i = 0; i < len; i++) {
                    109:        *lower++ = *p++;
                    110:       }
                    111:     }
                    112:     (*pp) += len;
                    113:     return len; /* return byte length of converted char to lower */
                    114:   }
                    115: }
                    116: 
                    117: static int
                    118: utf32be_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end)
                    119: {
                    120:   const UChar* p = *pp;
                    121: 
                    122:   (*pp) += 4;
                    123: 
                    124:   if (*(p+2) == 0 && *(p+1) == 0 && *p == 0) {
                    125:     int c, v;
                    126: 
                    127:     p += 3;
                    128:     if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 &&
                    129:         ONIGENC_IS_MBC_ASCII(p)) ||
                    130:        ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 &&
                    131:         !ONIGENC_IS_MBC_ASCII(p))) {
                    132:       c = *p;
                    133:       v = ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(c,
                    134:                        (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER));
                    135:       if ((v | ONIGENC_CTYPE_LOWER) != 0) {
                    136:         /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
                    137:         if (c >= 0xaa && c <= 0xba)
                    138:           return FALSE;
                    139:         else
                    140:           return TRUE;
                    141:       }
                    142:       return (v != 0 ? TRUE : FALSE);
                    143:     }
                    144:   }
                    145: 
                    146:   return FALSE;
                    147: }
                    148: 
                    149: static UChar*
                    150: utf32be_left_adjust_char_head(const UChar* start, const UChar* s)
                    151: {
                    152:   int rem;
                    153: 
                    154:   if (s <= start) return (UChar* )s;
                    155: 
                    156:   rem = (s - start) % 4;
                    157:   return (UChar* )(s - rem);
                    158: }
                    159: 
                    /* Encoding descriptor for UTF-32BE.  The initializer is positional, so the */
                    /* order of entries must match the field order of OnigEncodingType.         */
                    /* NOTE(review): that type is declared outside this file -- confirm the     */
                    /* field order there before inserting or reordering entries.                */
                    160: OnigEncodingType OnigEncodingUTF32_BE = {
                    161:   utf32be_mbc_enc_len,   /* character length from its bytes: always 4 */
                    162:   "UTF-32BE",   /* name */
                    163:   4,            /* max byte length */
                    164:   4,            /* min byte length */
                    165:   (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE |     /* both the ASCII and the non-ASCII */
                    166:    ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE ), /* case-ambiguity flags are set     */
                    167:   {                      /* meta characters: only the escape is given a value */
                    168:       (OnigCodePoint )'\\'                       /* esc */
                    169:     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
                    170:     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
                    171:     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
                    172:     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
                    173:     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
                    174:   },
                    175:   utf32be_is_mbc_newline,     /* line-terminator test on one 4-byte unit */
                    176:   utf32be_mbc_to_code,        /* four big-endian bytes -> code point */
                    177:   utf32be_code_to_mbclen,     /* encoded length of a code point: always 4 */
                    178:   utf32be_code_to_mbc,        /* code point -> four big-endian bytes */
                    179:   utf32be_mbc_to_normalize,   /* copy one character, lower-casing via the ISO-8859-1 macro */
                    180:   utf32be_is_mbc_ambiguous,   /* case-ambiguity test; only characters with three zero lead bytes */
                    181:   onigenc_iso_8859_1_get_all_pair_ambig_codes,  /* defined outside this file */
                    182:   onigenc_ess_tsett_get_all_comp_ambig_codes,   /* defined outside this file */
                    183:   onigenc_unicode_is_code_ctype,                /* defined outside this file */
                    184:   onigenc_unicode_get_ctype_code_range,         /* defined outside this file */
                    185:   utf32be_left_adjust_char_head,  /* round a pointer down to a 4-byte character start */
                    186:   onigenc_always_false_is_allowed_reverse_match /* defined outside this file */
                    187: };

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>