Annotation of embedaddon/php/ext/mbstring/oniguruma/enc/utf32_le.c, revision 1.1.1.1

1.1       misho       1: /**********************************************************************
                      2:   utf32_le.c -  Oniguruma (regular expression library)
                      3: **********************************************************************/
                      4: /*-
                      5:  * Copyright (c) 2002-2006  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
                      6:  * All rights reserved.
                      7:  *
                      8:  * Redistribution and use in source and binary forms, with or without
                      9:  * modification, are permitted provided that the following conditions
                     10:  * are met:
                     11:  * 1. Redistributions of source code must retain the above copyright
                     12:  *    notice, this list of conditions and the following disclaimer.
                     13:  * 2. Redistributions in binary form must reproduce the above copyright
                     14:  *    notice, this list of conditions and the following disclaimer in the
                     15:  *    documentation and/or other materials provided with the distribution.
                     16:  *
                     17:  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
                     18:  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
                     19:  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
                     20:  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
                     21:  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
                     22:  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
                     23:  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
                     24:  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
                     25:  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
                     26:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
                     27:  * SUCH DAMAGE.
                     28:  */
                     29: 
                     30: #include "regenc.h"
                     31: 
                     32: static int
                     33: utf32le_mbc_enc_len(const UChar* p)
                     34: {
                     35:   return 4;
                     36: }
                     37: 
                     38: static int
                     39: utf32le_is_mbc_newline(const UChar* p, const UChar* end)
                     40: {
                     41:   if (p + 3 < end) {
                     42:     if (*p == 0x0a && *(p+1) == 0 && *(p+2) == 0 && *(p+3) == 0)
                     43:       return 1;
                     44: #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
                     45:     if ((*p == 0x0d || *p == 0x85) && *(p+1) == 0x00
                     46:        && (p+2) == 0x00 && *(p+3) == 0x00)
                     47:       return 1;
                     48:     if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28)
                     49:        && *(p+2) == 0x00 && *(p+3) == 0x00)
                     50:       return 1;
                     51: #endif
                     52:   }
                     53:   return 0;
                     54: }
                     55: 
                     56: static OnigCodePoint
                     57: utf32le_mbc_to_code(const UChar* p, const UChar* end)
                     58: {
                     59:   return (OnigCodePoint )(((p[3] * 256 + p[2]) * 256 + p[1]) * 256 + p[0]);
                     60: }
                     61: 
                     62: static int
                     63: utf32le_code_to_mbclen(OnigCodePoint code)
                     64: {
                     65:   return 4;
                     66: }
                     67: 
                     68: static int
                     69: utf32le_code_to_mbc(OnigCodePoint code, UChar *buf)
                     70: {
                     71:   UChar* p = buf;
                     72: 
                     73:   *p++ = (UChar ) (code & 0xff);
                     74:   *p++ = (UChar )((code & 0xff00)     >> 8);
                     75:   *p++ = (UChar )((code & 0xff0000)   >>16);
                     76:   *p++ = (UChar )((code & 0xff000000) >>24);
                     77:   return 4;
                     78: }
                     79: 
                     80: static int
                     81: utf32le_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end,
                     82:                          UChar* lower)
                     83: {
                     84:   const UChar* p = *pp;
                     85: 
                     86:   if (*(p+1) == 0 && *(p+2) == 0 && *(p+3) == 0) {
                     87:     if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 &&
                     88:         ONIGENC_IS_MBC_ASCII(p)) ||
                     89:        ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 &&
                     90:         !ONIGENC_IS_MBC_ASCII(p))) {
                     91:       *lower++ = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p);
                     92:     }
                     93:     else {
                     94:       *lower++ = *p;
                     95:     }
                     96:     *lower++ = '\0';
                     97:     *lower++ = '\0';
                     98:     *lower   = '\0';
                     99: 
                    100:     (*pp) += 4;
                    101:     return 4;  /* return byte length of converted char to lower */
                    102:   }
                    103:   else {
                    104:     int len = 4;
                    105:     if (lower != p) {
                    106:       int i;
                    107:       for (i = 0; i < len; i++) {
                    108:        *lower++ = *p++;
                    109:       }
                    110:     }
                    111:     (*pp) += len;
                    112:     return len; /* return byte length of converted char to lower */
                    113:   }
                    114: }
                    115: 
                    116: static int
                    117: utf32le_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end)
                    118: {
                    119:   const UChar* p = *pp;
                    120: 
                    121:   (*pp) += 4;
                    122: 
                    123:   if (*(p+1) == 0 && *(p+2) == 0 && *(p+3) == 0) {
                    124:     int c, v;
                    125: 
                    126:     if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 &&
                    127:         ONIGENC_IS_MBC_ASCII(p)) ||
                    128:        ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 &&
                    129:         !ONIGENC_IS_MBC_ASCII(p))) {
                    130:       c = *p;
                    131:       v = ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(c,
                    132:                        (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER));
                    133:       if ((v | ONIGENC_CTYPE_LOWER) != 0) {
                    134:         /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
                    135:         if (c >= 0xaa && c <= 0xba)
                    136:           return FALSE;
                    137:         else
                    138:           return TRUE;
                    139:       }
                    140:       return (v != 0 ? TRUE : FALSE);
                    141:     }
                    142:   }
                    143: 
                    144:   return FALSE;
                    145: }
                    146: 
                    147: static UChar*
                    148: utf32le_left_adjust_char_head(const UChar* start, const UChar* s)
                    149: {
                    150:   int rem;
                    151: 
                    152:   if (s <= start) return (UChar* )s;
                    153: 
                    154:   rem = (s - start) % 4;
                    155:   return (UChar* )(s - rem);
                    156: }
                    157: 
                    158: OnigEncodingType OnigEncodingUTF32_LE = {
                    159:   utf32le_mbc_enc_len,
                    160:   "UTF-32LE",   /* name */
                    161:   4,            /* max byte length */
                    162:   4,            /* min byte length */
                    163:   (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE |
                    164:    ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE ),
                    165:   {
                    166:       (OnigCodePoint )'\\'                       /* esc */
                    167:     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
                    168:     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
                    169:     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
                    170:     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
                    171:     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
                    172:   },
                    173:   utf32le_is_mbc_newline,
                    174:   utf32le_mbc_to_code,
                    175:   utf32le_code_to_mbclen,
                    176:   utf32le_code_to_mbc,
                    177:   utf32le_mbc_to_normalize,
                    178:   utf32le_is_mbc_ambiguous,
                    179:   onigenc_iso_8859_1_get_all_pair_ambig_codes,
                    180:   onigenc_ess_tsett_get_all_comp_ambig_codes,
                    181:   onigenc_unicode_is_code_ctype,
                    182:   onigenc_unicode_get_ctype_code_range,
                    183:   utf32le_left_adjust_char_head,
                    184:   onigenc_always_false_is_allowed_reverse_match
                    185: };

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>