File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / pcre / pcre_xclass.c
Revision 1.1.1.4 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Sun Jun 15 19:46:03 2014 UTC (9 years, 11 months ago) by misho
Branches: pcre, MAIN
CVS tags: v8_34, HEAD
pcre 8.34

    1: /*************************************************
    2: *      Perl-Compatible Regular Expressions       *
    3: *************************************************/
    4: 
    5: /* PCRE is a library of functions to support regular expressions whose syntax
    6: and semantics are as close as possible to those of the Perl 5 language.
    7: 
    8:                        Written by Philip Hazel
    9:            Copyright (c) 1997-2013 University of Cambridge
   10: 
   11: -----------------------------------------------------------------------------
   12: Redistribution and use in source and binary forms, with or without
   13: modification, are permitted provided that the following conditions are met:
   14: 
   15:     * Redistributions of source code must retain the above copyright notice,
   16:       this list of conditions and the following disclaimer.
   17: 
   18:     * Redistributions in binary form must reproduce the above copyright
   19:       notice, this list of conditions and the following disclaimer in the
   20:       documentation and/or other materials provided with the distribution.
   21: 
   22:     * Neither the name of the University of Cambridge nor the names of its
   23:       contributors may be used to endorse or promote products derived from
   24:       this software without specific prior written permission.
   25: 
   26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
   30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   36: POSSIBILITY OF SUCH DAMAGE.
   37: -----------------------------------------------------------------------------
   38: */
   39: 
   40: 
   41: /* This module contains an internal function that is used to match an extended
   42: class. It is used by both pcre_exec() and pcre_dfa_exec(). */
   43: 
   44: 
   45: #ifdef HAVE_CONFIG_H
   46: #include "config.h"
   47: #endif
   48: 
   49: #include "pcre_internal.h"
   50: 
   51: 
   52: /*************************************************
   53: *       Match character against an XCLASS        *
   54: *************************************************/
   55: 
   56: /* This function is called to match a character against an extended class that
   57: might contain values > 255 and/or Unicode properties.
   58: 
   59: Arguments:
   60:   c           the character
   61:   data        points to the flag byte of the XCLASS data
   62: 
   63: Returns:      TRUE if character matches, else FALSE
   64: */
   65: 
   66: BOOL
   67: PRIV(xclass)(pcre_uint32 c, const pcre_uchar *data, BOOL utf)
   68: {
   69: pcre_uchar t;
   70: BOOL negated = (*data & XCL_NOT) != 0;
   71: 
   72: (void)utf;
   73: #ifdef COMPILE_PCRE8
   74: /* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */
   75: utf = TRUE;
   76: #endif
   77: 
   78: /* Character values < 256 are matched against a bitmap, if one is present. If
   79: not, we still carry on, because there may be ranges that start below 256 in the
   80: additional data. */
   81: 
   82: if (c < 256)
   83:   {
   84:   if ((*data & XCL_MAP) != 0 &&
   85:     (((pcre_uint8 *)(data + 1))[c/8] & (1 << (c&7))) != 0)
   86:     return !negated; /* char found */
   87:   }
   88: 
   89: /* First skip the bit map if present. Then match against the list of Unicode
   90: properties or large chars or ranges that end with a large char. We won't ever
   91: encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */
   92: 
   93: if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(pcre_uchar);
   94: 
   95: while ((t = *data++) != XCL_END)
   96:   {
   97:   pcre_uint32 x, y;
   98:   if (t == XCL_SINGLE)
   99:     {
  100: #ifdef SUPPORT_UTF
  101:     if (utf)
  102:       {
  103:       GETCHARINC(x, data); /* macro generates multiple statements */
  104:       }
  105:     else
  106: #endif
  107:       x = *data++;
  108:     if (c == x) return !negated;
  109:     }
  110:   else if (t == XCL_RANGE)
  111:     {
  112: #ifdef SUPPORT_UTF
  113:     if (utf)
  114:       {
  115:       GETCHARINC(x, data); /* macro generates multiple statements */
  116:       GETCHARINC(y, data); /* macro generates multiple statements */
  117:       }
  118:     else
  119: #endif
  120:       {
  121:       x = *data++;
  122:       y = *data++;
  123:       }
  124:     if (c >= x && c <= y) return !negated;
  125:     }
  126: 
  127: #ifdef SUPPORT_UCP
  128:   else  /* XCL_PROP & XCL_NOTPROP */
  129:     {
  130:     const ucd_record *prop = GET_UCD(c);
  131:     BOOL isprop = t == XCL_PROP;
  132: 
  133:     switch(*data)
  134:       {
  135:       case PT_ANY:
  136:       if (isprop) return !negated;
  137:       break;
  138: 
  139:       case PT_LAMP:
  140:       if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
  141:            prop->chartype == ucp_Lt) == isprop) return !negated;
  142:       break;
  143: 
  144:       case PT_GC:
  145:       if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == isprop)
  146:         return !negated;
  147:       break;
  148: 
  149:       case PT_PC:
  150:       if ((data[1] == prop->chartype) == isprop) return !negated;
  151:       break;
  152: 
  153:       case PT_SC:
  154:       if ((data[1] == prop->script) == isprop) return !negated;
  155:       break;
  156: 
  157:       case PT_ALNUM:
  158:       if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
  159:            PRIV(ucp_gentype)[prop->chartype] == ucp_N) == isprop)
  160:         return !negated;
  161:       break;
  162: 
  163:       /* Perl space used to exclude VT, but from Perl 5.18 it is included,
  164:       which means that Perl space and POSIX space are now identical. PCRE
  165:       was changed at release 8.34. */
  166: 
  167:       case PT_SPACE:    /* Perl space */
  168:       case PT_PXSPACE:  /* POSIX space */
  169:       switch(c)
  170:         {
  171:         HSPACE_CASES:
  172:         VSPACE_CASES:
  173:         if (isprop) return !negated;
  174:         break;
  175: 
  176:         default:
  177:         if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == isprop)
  178:           return !negated;
  179:         break;
  180:         }
  181:       break;
  182: 
  183:       case PT_WORD:
  184:       if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
  185:            PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE)
  186:              == isprop)
  187:         return !negated;
  188:       break;
  189: 
  190:       case PT_UCNC:
  191:       if (c < 0xa0)
  192:         {
  193:         if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
  194:              c == CHAR_GRAVE_ACCENT) == isprop)
  195:           return !negated;
  196:         }
  197:       else
  198:         {
  199:         if ((c < 0xd800 || c > 0xdfff) == isprop)
  200:           return !negated;
  201:         }
  202:       break;
  203: 
  204:       /* The following three properties can occur only in an XCLASS, as there
  205:       is no \p or \P coding for them. */
  206: 
  207:       /* Graphic character. Implement this as not Z (space or separator) and
  208:       not C (other), except for Cf (format) with a few exceptions. This seems
  209:       to be what Perl does. The exceptional characters are:
  210: 
  211:       U+061C           Arabic Letter Mark
  212:       U+180E           Mongolian Vowel Separator
  213:       U+2066 - U+2069  Various "isolate"s
  214:       */
  215: 
  216:       case PT_PXGRAPH:
  217:       if ((PRIV(ucp_gentype)[prop->chartype] != ucp_Z &&
  218:             (PRIV(ucp_gentype)[prop->chartype] != ucp_C ||
  219:               (prop->chartype == ucp_Cf &&
  220:                 c != 0x061c && c != 0x180e && (c < 0x2066 || c > 0x2069))
  221:          )) == isprop)
  222:         return !negated;
  223:       break;
  224: 
  225:       /* Printable character: same as graphic, with the addition of Zs, i.e.
  226:       not Zl and not Zp, and U+180E. */
  227: 
  228:       case PT_PXPRINT:
  229:       if ((prop->chartype != ucp_Zl &&
  230:            prop->chartype != ucp_Zp &&
  231:             (PRIV(ucp_gentype)[prop->chartype] != ucp_C ||
  232:               (prop->chartype == ucp_Cf &&
  233:                 c != 0x061c && (c < 0x2066 || c > 0x2069))
  234:          )) == isprop)
  235:         return !negated;
  236:       break;
  237: 
  238:       /* Punctuation: all Unicode punctuation, plus ASCII characters that
  239:       Unicode treats as symbols rather than punctuation, for Perl
  240:       compatibility (these are $+<=>^`|~). */
  241: 
  242:       case PT_PXPUNCT:
  243:       if ((PRIV(ucp_gentype)[prop->chartype] == ucp_P ||
  244:             (c < 256 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop)
  245:         return !negated;
  246:       break;
  247: 
  248:       /* This should never occur, but compilers may mutter if there is no
  249:       default. */
  250: 
  251:       default:
  252:       return FALSE;
  253:       }
  254: 
  255:     data += 2;
  256:     }
  257: #endif  /* SUPPORT_UCP */
  258:   }
  259: 
  260: return negated;   /* char did not match */
  261: }
  262: 
  263: /* End of pcre_xclass.c */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>