Annotation of embedaddon/pcre/pcre_xclass.c, revision 1.1.1.4
1.1 misho 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /* PCRE is a library of functions to support regular expressions whose syntax
6: and semantics are as close as possible to those of the Perl 5 language.
7:
8: Written by Philip Hazel
1.1.1.3 misho 9: Copyright (c) 1997-2013 University of Cambridge
1.1 misho 10:
11: -----------------------------------------------------------------------------
12: Redistribution and use in source and binary forms, with or without
13: modification, are permitted provided that the following conditions are met:
14:
15: * Redistributions of source code must retain the above copyright notice,
16: this list of conditions and the following disclaimer.
17:
18: * Redistributions in binary form must reproduce the above copyright
19: notice, this list of conditions and the following disclaimer in the
20: documentation and/or other materials provided with the distribution.
21:
22: * Neither the name of the University of Cambridge nor the names of its
23: contributors may be used to endorse or promote products derived from
24: this software without specific prior written permission.
25:
26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36: POSSIBILITY OF SUCH DAMAGE.
37: -----------------------------------------------------------------------------
38: */
39:
40:
41: /* This module contains an internal function that is used to match an extended
42: class. It is used by both pcre_exec() and pcre_dfa_exec(). */
43:
44:
45: #ifdef HAVE_CONFIG_H
46: #include "config.h"
47: #endif
48:
49: #include "pcre_internal.h"
50:
51:
52: /*************************************************
53: * Match character against an XCLASS *
54: *************************************************/
55:
56: /* This function is called to match a character against an extended class that
57: might contain values > 255 and/or Unicode properties.
58:
59: Arguments:
60: c the character
61: data points to the flag byte of the XCLASS data
62:
63: Returns: TRUE if character matches, else FALSE
64: */
65:
66: BOOL
1.1.1.3 misho 67: PRIV(xclass)(pcre_uint32 c, const pcre_uchar *data, BOOL utf)
1.1 misho 68: {
1.1.1.3 misho 69: pcre_uchar t;
1.1 misho 70: BOOL negated = (*data & XCL_NOT) != 0;
71:
1.1.1.2 misho 72: (void)utf;
73: #ifdef COMPILE_PCRE8
74: /* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */
75: utf = TRUE;
76: #endif
77:
1.1 misho 78: /* Character values < 256 are matched against a bitmap, if one is present. If
79: not, we still carry on, because there may be ranges that start below 256 in the
80: additional data. */
81:
82: if (c < 256)
83: {
1.1.1.2 misho 84: if ((*data & XCL_MAP) != 0 &&
85: (((pcre_uint8 *)(data + 1))[c/8] & (1 << (c&7))) != 0)
86: return !negated; /* char found */
1.1 misho 87: }
88:
89: /* First skip the bit map if present. Then match against the list of Unicode
90: properties or large chars or ranges that end with a large char. We won't ever
91: encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */
92:
1.1.1.2 misho 93: if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(pcre_uchar);
1.1 misho 94:
95: while ((t = *data++) != XCL_END)
96: {
1.1.1.3 misho 97: pcre_uint32 x, y;
1.1 misho 98: if (t == XCL_SINGLE)
99: {
1.1.1.2 misho 100: #ifdef SUPPORT_UTF
101: if (utf)
102: {
103: GETCHARINC(x, data); /* macro generates multiple statements */
104: }
105: else
106: #endif
107: x = *data++;
1.1 misho 108: if (c == x) return !negated;
109: }
110: else if (t == XCL_RANGE)
111: {
1.1.1.2 misho 112: #ifdef SUPPORT_UTF
113: if (utf)
114: {
115: GETCHARINC(x, data); /* macro generates multiple statements */
116: GETCHARINC(y, data); /* macro generates multiple statements */
117: }
118: else
119: #endif
120: {
121: x = *data++;
122: y = *data++;
123: }
1.1 misho 124: if (c >= x && c <= y) return !negated;
125: }
126:
127: #ifdef SUPPORT_UCP
128: else /* XCL_PROP & XCL_NOTPROP */
129: {
130: const ucd_record *prop = GET_UCD(c);
1.1.1.4 ! misho 131: BOOL isprop = t == XCL_PROP;
1.1 misho 132:
133: switch(*data)
134: {
135: case PT_ANY:
1.1.1.4 ! misho 136: if (isprop) return !negated;
1.1 misho 137: break;
138:
139: case PT_LAMP:
140: if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1.1.1.4 ! misho 141: prop->chartype == ucp_Lt) == isprop) return !negated;
1.1 misho 142: break;
143:
144: case PT_GC:
1.1.1.4 ! misho 145: if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == isprop)
1.1 misho 146: return !negated;
147: break;
148:
149: case PT_PC:
1.1.1.4 ! misho 150: if ((data[1] == prop->chartype) == isprop) return !negated;
1.1 misho 151: break;
152:
153: case PT_SC:
1.1.1.4 ! misho 154: if ((data[1] == prop->script) == isprop) return !negated;
1.1 misho 155: break;
156:
157: case PT_ALNUM:
1.1.1.2 misho 158: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1.1.1.4 ! misho 159: PRIV(ucp_gentype)[prop->chartype] == ucp_N) == isprop)
1.1 misho 160: return !negated;
161: break;
162:
1.1.1.4 ! misho 163: /* Perl space used to exclude VT, but from Perl 5.18 it is included,
! 164: which means that Perl space and POSIX space are now identical. PCRE
! 165: was changed at release 8.34. */
1.1 misho 166:
1.1.1.4 ! misho 167: case PT_SPACE: /* Perl space */
1.1 misho 168: case PT_PXSPACE: /* POSIX space */
1.1.1.4 ! misho 169: switch(c)
! 170: {
! 171: HSPACE_CASES:
! 172: VSPACE_CASES:
! 173: if (isprop) return !negated;
! 174: break;
! 175:
! 176: default:
! 177: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == isprop)
! 178: return !negated;
! 179: break;
! 180: }
1.1 misho 181: break;
182:
183: case PT_WORD:
1.1.1.2 misho 184: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
185: PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE)
1.1.1.4 ! misho 186: == isprop)
1.1 misho 187: return !negated;
188: break;
189:
1.1.1.3 misho 190: case PT_UCNC:
191: if (c < 0xa0)
192: {
193: if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1.1.1.4 ! misho 194: c == CHAR_GRAVE_ACCENT) == isprop)
1.1.1.3 misho 195: return !negated;
196: }
197: else
198: {
1.1.1.4 ! misho 199: if ((c < 0xd800 || c > 0xdfff) == isprop)
1.1.1.3 misho 200: return !negated;
201: }
202: break;
203:
1.1.1.4 ! misho 204: /* The following three properties can occur only in an XCLASS, as there
! 205: is no \p or \P coding for them. */
! 206:
! 207: /* Graphic character. Implement this as not Z (space or separator) and
! 208: not C (other), except for Cf (format) with a few exceptions. This seems
! 209: to be what Perl does. The exceptional characters are:
! 210:
! 211: U+061C Arabic Letter Mark
! 212: U+180E Mongolian Vowel Separator
! 213: U+2066 - U+2069 Various "isolate"s
! 214: */
! 215:
! 216: case PT_PXGRAPH:
! 217: if ((PRIV(ucp_gentype)[prop->chartype] != ucp_Z &&
! 218: (PRIV(ucp_gentype)[prop->chartype] != ucp_C ||
! 219: (prop->chartype == ucp_Cf &&
! 220: c != 0x061c && c != 0x180e && (c < 0x2066 || c > 0x2069))
! 221: )) == isprop)
! 222: return !negated;
! 223: break;
! 224:
! 225: /* Printable character: same as graphic, with the addition of Zs, i.e.
! 226: not Zl and not Zp, and U+180E. */
! 227:
! 228: case PT_PXPRINT:
! 229: if ((prop->chartype != ucp_Zl &&
! 230: prop->chartype != ucp_Zp &&
! 231: (PRIV(ucp_gentype)[prop->chartype] != ucp_C ||
! 232: (prop->chartype == ucp_Cf &&
! 233: c != 0x061c && (c < 0x2066 || c > 0x2069))
! 234: )) == isprop)
! 235: return !negated;
! 236: break;
! 237:
! 238: /* Punctuation: all Unicode punctuation, plus ASCII characters that
! 239: Unicode treats as symbols rather than punctuation, for Perl
! 240: compatibility (these are $+<=>^`|~). */
! 241:
! 242: case PT_PXPUNCT:
! 243: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_P ||
! 244: (c < 256 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop)
! 245: return !negated;
! 246: break;
! 247:
1.1 misho 248: /* This should never occur, but compilers may mutter if there is no
249: default. */
250:
251: default:
252: return FALSE;
253: }
254:
255: data += 2;
256: }
257: #endif /* SUPPORT_UCP */
258: }
259:
260: return negated; /* char did not match */
261: }
262:
263: /* End of pcre_xclass.c */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>