Annotation of embedaddon/pcre/pcre_xclass.c, revision 1.1.1.4

1.1       misho       1: /*************************************************
                      2: *      Perl-Compatible Regular Expressions       *
                      3: *************************************************/
                      4: 
                      5: /* PCRE is a library of functions to support regular expressions whose syntax
                      6: and semantics are as close as possible to those of the Perl 5 language.
                      7: 
                      8:                        Written by Philip Hazel
1.1.1.3   misho       9:            Copyright (c) 1997-2013 University of Cambridge
1.1       misho      10: 
                     11: -----------------------------------------------------------------------------
                     12: Redistribution and use in source and binary forms, with or without
                     13: modification, are permitted provided that the following conditions are met:
                     14: 
                     15:     * Redistributions of source code must retain the above copyright notice,
                     16:       this list of conditions and the following disclaimer.
                     17: 
                     18:     * Redistributions in binary form must reproduce the above copyright
                     19:       notice, this list of conditions and the following disclaimer in the
                     20:       documentation and/or other materials provided with the distribution.
                     21: 
                     22:     * Neither the name of the University of Cambridge nor the names of its
                     23:       contributors may be used to endorse or promote products derived from
                     24:       this software without specific prior written permission.
                     25: 
                     26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
                     27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
                     28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
                     29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
                     30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
                     31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
                     32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
                     33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
                     34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
                     35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
                     36: POSSIBILITY OF SUCH DAMAGE.
                     37: -----------------------------------------------------------------------------
                     38: */
                     39: 
                     40: 
                     41: /* This module contains an internal function that is used to match an extended
                     42: class. It is used by both pcre_exec() and pcre_dfa_exec(). */
                     43: 
                     44: 
                     45: #ifdef HAVE_CONFIG_H
                     46: #include "config.h"
                     47: #endif
                     48: 
                     49: #include "pcre_internal.h"
                     50: 
                     51: 
                     52: /*************************************************
                     53: *       Match character against an XCLASS        *
                     54: *************************************************/
                     55: 
                     56: /* This function is called to match a character against an extended class that
                     57: might contain values > 255 and/or Unicode properties.
                     58: 
                     59: Arguments:
                     60:   c           the character
                     61:   data        points to the flag byte of the XCLASS data
                     62: 
                     63: Returns:      TRUE if character matches, else FALSE
                     64: */
                     65: 
                     66: BOOL
1.1.1.3   misho      67: PRIV(xclass)(pcre_uint32 c, const pcre_uchar *data, BOOL utf)
1.1       misho      68: {
1.1.1.3   misho      69: pcre_uchar t;
1.1       misho      70: BOOL negated = (*data & XCL_NOT) != 0;
                     71: 
1.1.1.2   misho      72: (void)utf;
                     73: #ifdef COMPILE_PCRE8
                     74: /* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */
                     75: utf = TRUE;
                     76: #endif
                     77: 
1.1       misho      78: /* Character values < 256 are matched against a bitmap, if one is present. If
                     79: not, we still carry on, because there may be ranges that start below 256 in the
                     80: additional data. */
                     81: 
                     82: if (c < 256)
                     83:   {
1.1.1.2   misho      84:   if ((*data & XCL_MAP) != 0 &&
                     85:     (((pcre_uint8 *)(data + 1))[c/8] & (1 << (c&7))) != 0)
                     86:     return !negated; /* char found */
1.1       misho      87:   }
                     88: 
                     89: /* First skip the bit map if present. Then match against the list of Unicode
                     90: properties or large chars or ranges that end with a large char. We won't ever
                     91: encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */
                     92: 
1.1.1.2   misho      93: if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(pcre_uchar);
1.1       misho      94: 
                     95: while ((t = *data++) != XCL_END)
                     96:   {
1.1.1.3   misho      97:   pcre_uint32 x, y;
1.1       misho      98:   if (t == XCL_SINGLE)
                     99:     {
1.1.1.2   misho     100: #ifdef SUPPORT_UTF
                    101:     if (utf)
                    102:       {
                    103:       GETCHARINC(x, data); /* macro generates multiple statements */
                    104:       }
                    105:     else
                    106: #endif
                    107:       x = *data++;
1.1       misho     108:     if (c == x) return !negated;
                    109:     }
                    110:   else if (t == XCL_RANGE)
                    111:     {
1.1.1.2   misho     112: #ifdef SUPPORT_UTF
                    113:     if (utf)
                    114:       {
                    115:       GETCHARINC(x, data); /* macro generates multiple statements */
                    116:       GETCHARINC(y, data); /* macro generates multiple statements */
                    117:       }
                    118:     else
                    119: #endif
                    120:       {
                    121:       x = *data++;
                    122:       y = *data++;
                    123:       }
1.1       misho     124:     if (c >= x && c <= y) return !negated;
                    125:     }
                    126: 
                    127: #ifdef SUPPORT_UCP
                    128:   else  /* XCL_PROP & XCL_NOTPROP */
                    129:     {
                    130:     const ucd_record *prop = GET_UCD(c);
1.1.1.4 ! misho     131:     BOOL isprop = t == XCL_PROP;
1.1       misho     132: 
                    133:     switch(*data)
                    134:       {
                    135:       case PT_ANY:
1.1.1.4 ! misho     136:       if (isprop) return !negated;
1.1       misho     137:       break;
                    138: 
                    139:       case PT_LAMP:
                    140:       if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1.1.1.4 ! misho     141:            prop->chartype == ucp_Lt) == isprop) return !negated;
1.1       misho     142:       break;
                    143: 
                    144:       case PT_GC:
1.1.1.4 ! misho     145:       if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == isprop)
1.1       misho     146:         return !negated;
                    147:       break;
                    148: 
                    149:       case PT_PC:
1.1.1.4 ! misho     150:       if ((data[1] == prop->chartype) == isprop) return !negated;
1.1       misho     151:       break;
                    152: 
                    153:       case PT_SC:
1.1.1.4 ! misho     154:       if ((data[1] == prop->script) == isprop) return !negated;
1.1       misho     155:       break;
                    156: 
                    157:       case PT_ALNUM:
1.1.1.2   misho     158:       if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1.1.1.4 ! misho     159:            PRIV(ucp_gentype)[prop->chartype] == ucp_N) == isprop)
1.1       misho     160:         return !negated;
                    161:       break;
                    162: 
1.1.1.4 ! misho     163:       /* Perl space used to exclude VT, but from Perl 5.18 it is included,
        !           164:       which means that Perl space and POSIX space are now identical. PCRE
        !           165:       was changed at release 8.34. */
1.1       misho     166: 
1.1.1.4 ! misho     167:       case PT_SPACE:    /* Perl space */
1.1       misho     168:       case PT_PXSPACE:  /* POSIX space */
1.1.1.4 ! misho     169:       switch(c)
        !           170:         {
        !           171:         HSPACE_CASES:
        !           172:         VSPACE_CASES:
        !           173:         if (isprop) return !negated;
        !           174:         break;
        !           175: 
        !           176:         default:
        !           177:         if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == isprop)
        !           178:           return !negated;
        !           179:         break;
        !           180:         }
1.1       misho     181:       break;
                    182: 
                    183:       case PT_WORD:
1.1.1.2   misho     184:       if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
                    185:            PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE)
1.1.1.4 ! misho     186:              == isprop)
1.1       misho     187:         return !negated;
                    188:       break;
                    189: 
1.1.1.3   misho     190:       case PT_UCNC:
                    191:       if (c < 0xa0)
                    192:         {
                    193:         if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1.1.1.4 ! misho     194:              c == CHAR_GRAVE_ACCENT) == isprop)
1.1.1.3   misho     195:           return !negated;
                    196:         }
                    197:       else
                    198:         {
1.1.1.4 ! misho     199:         if ((c < 0xd800 || c > 0xdfff) == isprop)
1.1.1.3   misho     200:           return !negated;
                    201:         }
                    202:       break;
                    203: 
1.1.1.4 ! misho     204:       /* The following three properties can occur only in an XCLASS, as there
        !           205:       is no \p or \P coding for them. */
        !           206: 
        !           207:       /* Graphic character. Implement this as not Z (space or separator) and
        !           208:       not C (other), except for Cf (format) with a few exceptions. This seems
        !           209:       to be what Perl does. The exceptional characters are:
        !           210: 
        !           211:       U+061C           Arabic Letter Mark
        !           212:       U+180E           Mongolian Vowel Separator
        !           213:       U+2066 - U+2069  Various "isolate"s
        !           214:       */
        !           215: 
        !           216:       case PT_PXGRAPH:
        !           217:       if ((PRIV(ucp_gentype)[prop->chartype] != ucp_Z &&
        !           218:             (PRIV(ucp_gentype)[prop->chartype] != ucp_C ||
        !           219:               (prop->chartype == ucp_Cf &&
        !           220:                 c != 0x061c && c != 0x180e && (c < 0x2066 || c > 0x2069))
        !           221:          )) == isprop)
        !           222:         return !negated;
        !           223:       break;
        !           224: 
        !           225:       /* Printable character: same as graphic, with the addition of Zs, i.e.
        !           226:       not Zl and not Zp, and U+180E. */
        !           227: 
        !           228:       case PT_PXPRINT:
        !           229:       if ((prop->chartype != ucp_Zl &&
        !           230:            prop->chartype != ucp_Zp &&
        !           231:             (PRIV(ucp_gentype)[prop->chartype] != ucp_C ||
        !           232:               (prop->chartype == ucp_Cf &&
        !           233:                 c != 0x061c && (c < 0x2066 || c > 0x2069))
        !           234:          )) == isprop)
        !           235:         return !negated;
        !           236:       break;
        !           237: 
        !           238:       /* Punctuation: all Unicode punctuation, plus ASCII characters that
        !           239:       Unicode treats as symbols rather than punctuation, for Perl
        !           240:       compatibility (these are $+<=>^`|~). */
        !           241: 
        !           242:       case PT_PXPUNCT:
        !           243:       if ((PRIV(ucp_gentype)[prop->chartype] == ucp_P ||
        !           244:             (c < 256 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop)
        !           245:         return !negated;
        !           246:       break;
        !           247: 
1.1       misho     248:       /* This should never occur, but compilers may mutter if there is no
                    249:       default. */
                    250: 
                    251:       default:
                    252:       return FALSE;
                    253:       }
                    254: 
                    255:     data += 2;
                    256:     }
                    257: #endif  /* SUPPORT_UCP */
                    258:   }
                    259: 
                    260: return negated;   /* char did not match */
                    261: }
                    262: 
                    263: /* End of pcre_xclass.c */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>