Annotation of embedaddon/php/ext/pcre/pcrelib/pcre_compile.c, revision 1.1.1.2
1.1 misho 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /* PCRE is a library of functions to support regular expressions whose syntax
6: and semantics are as close as possible to those of the Perl 5 language.
7:
8: Written by Philip Hazel
1.1.1.2 ! misho 9: Copyright (c) 1997-2012 University of Cambridge
1.1 misho 10:
11: -----------------------------------------------------------------------------
12: Redistribution and use in source and binary forms, with or without
13: modification, are permitted provided that the following conditions are met:
14:
15: * Redistributions of source code must retain the above copyright notice,
16: this list of conditions and the following disclaimer.
17:
18: * Redistributions in binary form must reproduce the above copyright
19: notice, this list of conditions and the following disclaimer in the
20: documentation and/or other materials provided with the distribution.
21:
22: * Neither the name of the University of Cambridge nor the names of its
23: contributors may be used to endorse or promote products derived from
24: this software without specific prior written permission.
25:
26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36: POSSIBILITY OF SUCH DAMAGE.
37: -----------------------------------------------------------------------------
38: */
39:
40:
41: /* This module contains the external function pcre_compile(), along with
42: supporting internal functions that are not used by other modules. */
43:
44:
1.1.1.2 ! misho 45: #ifdef HAVE_CONFIG_H
1.1 misho 46: #include "config.h"
1.1.1.2 ! misho 47: #endif
1.1 misho 48:
49: #define NLBLOCK cd /* Block containing newline information */
50: #define PSSTART start_pattern /* Field containing processed string start */
51: #define PSEND end_pattern /* Field containing processed string end */
52:
53: #include "pcre_internal.h"
54:
55:
1.1.1.2 ! misho 56: /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
! 57: is also used by pcretest. PCRE_DEBUG is not defined when building a production
! 58: library. We do not need to select pcre16_printint.c specially, because the
! 59: COMPILE_PCREx macro will already be appropriately set. */
1.1 misho 60:
61: #ifdef PCRE_DEBUG
1.1.1.2 ! misho 62: /* pcre_printint.c should not include any headers */
! 63: #define PCRE_INCLUDED
! 64: #include "pcre_printint.c"
! 65: #undef PCRE_INCLUDED
1.1 misho 66: #endif
67:
68:
/* Macro for setting individual bits in class bitmaps. The bitmap a is treated
as an array of 8-bit elements: bit number b lives in element b/8 at bit position
b%8. Both the array argument and the whole expansion are parenthesized so that
an expression such as "map + offset" may be passed as a, and so that the macro
can be used inside a larger expression. Note that b is evaluated twice, so it
must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)&7)))
1.1 misho 72:
73: /* Maximum length value to check against when making sure that the integer that
74: holds the compiled pattern length does not overflow. We make it a bit less than
75: INT_MAX to allow for adding in group terminating bytes, so that we don't have
76: to check them every time. */
77:
78: #define OFLOW_MAX (INT_MAX - 20)
79:
1.1.1.2 ! misho 80: /* Forward declarations to allow mutual recursion */
! 81:
! 82: static int
! 83: add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
! 84: const pcre_uint32 *, unsigned int);
! 85:
! 86: static BOOL
! 87: compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
! 88: pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
! 89: compile_data *, int *);
! 90:
! 91:
1.1 misho 92:
93: /*************************************************
94: * Code parameters and static tables *
95: *************************************************/
96:
97: /* This value specifies the size of stack workspace that is used during the
98: first pre-compile phase that determines how much memory is required. The regex
99: is partly compiled into this space, but the compiled parts are discarded as
100: soon as they can be, so that hopefully there will never be an overrun. The code
101: does, however, check for an overrun. The largest amount I've seen used is 218,
102: so this number is very generous.
103:
104: The same workspace is used during the second, actual compile phase for
105: remembering forward references to groups so that they can be filled in at the
106: end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
1.1.1.2 ! misho 107: is 4 there is plenty of room for most patterns. However, the memory can get
! 108: filled up by repetitions of forward references, for example patterns like
! 109: /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
! 110: that the workspace is expanded using malloc() in this situation. The value
! 111: below is therefore a minimum, and we put a maximum on it for safety. The
! 112: minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
! 113: kicks in at the same number of forward references in all cases. */
1.1 misho 114:
1.1.1.2 ! misho 115: #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
! 116: #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
1.1 misho 117:
118: /* The overrun tests check for a slightly smaller size so that they detect the
119: overrun before it actually does run off the end of the data block. */
120:
1.1.1.2 ! misho 121: #define WORK_SIZE_SAFETY_MARGIN (100)
! 122:
! 123: /* Private flags added to firstchar and reqchar. */
1.1 misho 124:
1.1.1.2 ! misho 125: #define REQ_CASELESS (1 << 0) /* Indicates caselessness */
! 126: #define REQ_VARY (1 << 1) /* Reqchar followed non-literal item */
! 127: /* Negative values for the firstchar and reqchar flags */
! 128: #define REQ_UNSET (-2)
! 129: #define REQ_NONE (-1)
! 130:
! 131: /* Repeated character flags. */
! 132:
#define UTF_LENGTH 0x10000000L /* The char contains its length. (Upper-case L suffix: a lower-case l is easily misread as the digit 1.) */
1.1 misho 134:
135: /* Table for handling escaped characters in the range '0'-'z'. Positive returns
136: are simple data values; negative values are for special things like \d and so
137: on. Zero means further processing is needed (for things like \x), or the escape
138: is invalid. */
139:
140: #ifndef EBCDIC
141:
142: /* This is the "normal" table for ASCII systems or for EBCDIC systems running
143: in UTF-8 mode. */
144:
145: static const short int escapes[] = {
146: 0, 0,
147: 0, 0,
148: 0, 0,
149: 0, 0,
150: 0, 0,
151: CHAR_COLON, CHAR_SEMICOLON,
152: CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
153: CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
154: CHAR_COMMERCIAL_AT, -ESC_A,
155: -ESC_B, -ESC_C,
156: -ESC_D, -ESC_E,
157: 0, -ESC_G,
158: -ESC_H, 0,
159: 0, -ESC_K,
160: 0, 0,
161: -ESC_N, 0,
162: -ESC_P, -ESC_Q,
163: -ESC_R, -ESC_S,
164: 0, 0,
165: -ESC_V, -ESC_W,
166: -ESC_X, 0,
167: -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
168: CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
169: CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
170: CHAR_GRAVE_ACCENT, 7,
171: -ESC_b, 0,
172: -ESC_d, ESC_e,
173: ESC_f, 0,
174: -ESC_h, 0,
175: 0, -ESC_k,
176: 0, 0,
177: ESC_n, 0,
178: -ESC_p, 0,
179: ESC_r, -ESC_s,
180: ESC_tee, 0,
181: -ESC_v, -ESC_w,
182: 0, 0,
183: -ESC_z
184: };
185:
186: #else
187:
188: /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
189:
190: static const short int escapes[] = {
191: /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
192: /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
193: /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
194: /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
195: /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
196: /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
197: /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
198: /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
199: /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
200: /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
201: /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
202: /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
203: /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
204: /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
205: /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
206: /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
207: /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
208: /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
209: /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
210: /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
211: /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
212: /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
213: /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
214: };
215: #endif
216:
217:
218: /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
219: searched linearly. Put all the names into a single string, in order to reduce
220: the number of relocations when a shared library is dynamically linked. The
221: string is built from string macros so that it works in UTF-8 mode on EBCDIC
222: platforms. */
223:
224: typedef struct verbitem {
225: int len; /* Length of verb name */
226: int op; /* Op when no arg, or -1 if arg mandatory */
227: int op_arg; /* Op when arg present, or -1 if not allowed */
228: } verbitem;
229:
230: static const char verbnames[] =
231: "\0" /* Empty name is a shorthand for MARK */
232: STRING_MARK0
233: STRING_ACCEPT0
234: STRING_COMMIT0
235: STRING_F0
236: STRING_FAIL0
237: STRING_PRUNE0
238: STRING_SKIP0
239: STRING_THEN;
240:
241: static const verbitem verbs[] = {
242: { 0, -1, OP_MARK },
243: { 4, -1, OP_MARK },
244: { 6, OP_ACCEPT, -1 },
245: { 6, OP_COMMIT, -1 },
246: { 1, OP_FAIL, -1 },
247: { 4, OP_FAIL, -1 },
248: { 5, OP_PRUNE, OP_PRUNE_ARG },
249: { 4, OP_SKIP, OP_SKIP_ARG },
250: { 4, OP_THEN, OP_THEN_ARG }
251: };
252:
253: static const int verbcount = sizeof(verbs)/sizeof(verbitem);
254:
255:
256: /* Tables of names of POSIX character classes and their lengths. The names are
257: now all in a single string, to reduce the number of relocations when a shared
258: library is dynamically loaded. The list of lengths is terminated by a zero
259: length entry. The first three must be alpha, lower, upper, as this is assumed
260: for handling case independence. */
261:
262: static const char posix_names[] =
263: STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
264: STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
265: STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
266: STRING_word0 STRING_xdigit;
267:
1.1.1.2 ! misho 268: static const pcre_uint8 posix_name_lengths[] = {
1.1 misho 269: 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
270:
271: /* Table of class bit maps for each POSIX class. Each class is formed from a
272: base map, with an optional addition or removal of another map. Then, for some
273: classes, there is some additional tweaking: for [:blank:] the vertical space
274: characters are removed, and for [:alpha:] and [:alnum:] the underscore
275: character is removed. The triples in the table consist of the base map offset,
276: second map offset or -1 if no second map, and a non-negative value for map
277: addition or a negative value for map subtraction (if there are two maps). The
278: absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
279: remove vertical space characters, 2 => remove underscore. */
280:
281: static const int posix_class_maps[] = {
282: cbit_word, cbit_digit, -2, /* alpha */
283: cbit_lower, -1, 0, /* lower */
284: cbit_upper, -1, 0, /* upper */
285: cbit_word, -1, 2, /* alnum - word without underscore */
286: cbit_print, cbit_cntrl, 0, /* ascii */
287: cbit_space, -1, 1, /* blank - a GNU extension */
288: cbit_cntrl, -1, 0, /* cntrl */
289: cbit_digit, -1, 0, /* digit */
290: cbit_graph, -1, 0, /* graph */
291: cbit_print, -1, 0, /* print */
292: cbit_punct, -1, 0, /* punct */
293: cbit_space, -1, 0, /* space */
294: cbit_word, -1, 0, /* word - a Perl extension */
295: cbit_xdigit,-1, 0 /* xdigit */
296: };
297:
298: /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
299: substitutes must be in the order of the names, defined above, and there are
300: both positive and negative cases. NULL means no substitute. */
301:
302: #ifdef SUPPORT_UCP
1.1.1.2 ! misho 303: static const pcre_uchar string_PNd[] = {
! 304: CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
! 305: CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
! 306: static const pcre_uchar string_pNd[] = {
! 307: CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
! 308: CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
! 309: static const pcre_uchar string_PXsp[] = {
! 310: CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
! 311: CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
! 312: static const pcre_uchar string_pXsp[] = {
! 313: CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
! 314: CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
! 315: static const pcre_uchar string_PXwd[] = {
! 316: CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
! 317: CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
! 318: static const pcre_uchar string_pXwd[] = {
! 319: CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
! 320: CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
! 321:
! 322: static const pcre_uchar *substitutes[] = {
! 323: string_PNd, /* \D */
! 324: string_pNd, /* \d */
! 325: string_PXsp, /* \S */ /* NOTE: Xsp is Perl space */
! 326: string_pXsp, /* \s */
! 327: string_PXwd, /* \W */
! 328: string_pXwd /* \w */
1.1 misho 329: };
330:
1.1.1.2 ! misho 331: static const pcre_uchar string_pL[] = {
! 332: CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
! 333: CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
! 334: static const pcre_uchar string_pLl[] = {
! 335: CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
! 336: CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
! 337: static const pcre_uchar string_pLu[] = {
! 338: CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
! 339: CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
! 340: static const pcre_uchar string_pXan[] = {
! 341: CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
! 342: CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
! 343: static const pcre_uchar string_h[] = {
! 344: CHAR_BACKSLASH, CHAR_h, '\0' };
! 345: static const pcre_uchar string_pXps[] = {
! 346: CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
! 347: CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
! 348: static const pcre_uchar string_PL[] = {
! 349: CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
! 350: CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
! 351: static const pcre_uchar string_PLl[] = {
! 352: CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
! 353: CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
! 354: static const pcre_uchar string_PLu[] = {
! 355: CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
! 356: CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
! 357: static const pcre_uchar string_PXan[] = {
! 358: CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
! 359: CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
! 360: static const pcre_uchar string_H[] = {
! 361: CHAR_BACKSLASH, CHAR_H, '\0' };
! 362: static const pcre_uchar string_PXps[] = {
! 363: CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
! 364: CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
! 365:
! 366: static const pcre_uchar *posix_substitutes[] = {
! 367: string_pL, /* alpha */
! 368: string_pLl, /* lower */
! 369: string_pLu, /* upper */
! 370: string_pXan, /* alnum */
! 371: NULL, /* ascii */
! 372: string_h, /* blank */
! 373: NULL, /* cntrl */
! 374: string_pNd, /* digit */
! 375: NULL, /* graph */
! 376: NULL, /* print */
! 377: NULL, /* punct */
! 378: string_pXps, /* space */ /* NOTE: Xps is POSIX space */
! 379: string_pXwd, /* word */
! 380: NULL, /* xdigit */
1.1 misho 381: /* Negated cases */
1.1.1.2 ! misho 382: string_PL, /* ^alpha */
! 383: string_PLl, /* ^lower */
! 384: string_PLu, /* ^upper */
! 385: string_PXan, /* ^alnum */
! 386: NULL, /* ^ascii */
! 387: string_H, /* ^blank */
! 388: NULL, /* ^cntrl */
! 389: string_PNd, /* ^digit */
! 390: NULL, /* ^graph */
! 391: NULL, /* ^print */
! 392: NULL, /* ^punct */
! 393: string_PXps, /* ^space */ /* NOTE: Xps is POSIX space */
! 394: string_PXwd, /* ^word */
! 395: NULL /* ^xdigit */
1.1 misho 396: };
1.1.1.2 ! misho 397: #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
1.1 misho 398: #endif
399:
400: #define STRING(a) # a
401: #define XSTRING(s) STRING(s)
402:
403: /* The texts of compile-time error messages. These are "char *" because they
404: are passed to the outside world. Do not ever re-use any error number, because
405: they are documented. Always add a new error instead. Messages marked DEAD below
406: are no longer used. This used to be a table of strings, but in order to reduce
407: the number of relocations needed when a shared library is loaded dynamically,
408: it is now one long string. We cannot use a table of offsets, because the
409: lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
410: simply count through to the one we want - this isn't a performance issue
411: because these strings are used only when there is a compilation error.
412:
413: Each substring ends with \0 to insert a null character. This includes the final
414: substring, so that the whole string ends with \0\0, which can be detected when
415: counting through. */
416:
417: static const char error_texts[] =
418: "no error\0"
419: "\\ at end of pattern\0"
420: "\\c at end of pattern\0"
421: "unrecognized character follows \\\0"
422: "numbers out of order in {} quantifier\0"
423: /* 5 */
424: "number too big in {} quantifier\0"
425: "missing terminating ] for character class\0"
426: "invalid escape sequence in character class\0"
427: "range out of order in character class\0"
428: "nothing to repeat\0"
429: /* 10 */
430: "operand of unlimited repeat could match the empty string\0" /** DEAD **/
431: "internal error: unexpected repeat\0"
432: "unrecognized character after (? or (?-\0"
433: "POSIX named classes are supported only within a class\0"
434: "missing )\0"
435: /* 15 */
436: "reference to non-existent subpattern\0"
437: "erroffset passed as NULL\0"
438: "unknown option bit(s) set\0"
439: "missing ) after comment\0"
440: "parentheses nested too deeply\0" /** DEAD **/
441: /* 20 */
442: "regular expression is too large\0"
443: "failed to get memory\0"
444: "unmatched parentheses\0"
445: "internal error: code overflow\0"
446: "unrecognized character after (?<\0"
447: /* 25 */
448: "lookbehind assertion is not fixed length\0"
449: "malformed number or name after (?(\0"
450: "conditional group contains more than two branches\0"
451: "assertion expected after (?(\0"
452: "(?R or (?[+-]digits must be followed by )\0"
453: /* 30 */
454: "unknown POSIX class name\0"
455: "POSIX collating elements are not supported\0"
1.1.1.2 ! misho 456: "this version of PCRE is compiled without UTF support\0"
1.1 misho 457: "spare error\0" /** DEAD **/
458: "character value in \\x{...} sequence is too large\0"
459: /* 35 */
460: "invalid condition (?(0)\0"
461: "\\C not allowed in lookbehind assertion\0"
462: "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
463: "number after (?C is > 255\0"
464: "closing ) for (?C expected\0"
465: /* 40 */
466: "recursive call could loop indefinitely\0"
467: "unrecognized character after (?P\0"
468: "syntax error in subpattern name (missing terminator)\0"
469: "two named subpatterns have the same name\0"
470: "invalid UTF-8 string\0"
471: /* 45 */
472: "support for \\P, \\p, and \\X has not been compiled\0"
473: "malformed \\P or \\p sequence\0"
474: "unknown property name after \\P or \\p\0"
475: "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
476: "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
477: /* 50 */
478: "repeated subpattern is too long\0" /** DEAD **/
1.1.1.2 ! misho 479: "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
1.1 misho 480: "internal error: overran compiling workspace\0"
481: "internal error: previously-checked referenced subpattern not found\0"
482: "DEFINE group contains more than one branch\0"
483: /* 55 */
1.1.1.2 ! misho 484: "repeating a DEFINE group is not allowed\0" /** DEAD **/
1.1 misho 485: "inconsistent NEWLINE options\0"
486: "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
487: "a numbered reference must not be zero\0"
488: "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
489: /* 60 */
490: "(*VERB) not recognized\0"
491: "number is too big\0"
492: "subpattern name expected\0"
493: "digit expected after (?+\0"
494: "] is an invalid data character in JavaScript compatibility mode\0"
495: /* 65 */
496: "different names for subpatterns of the same number are not allowed\0"
497: "(*MARK) must have an argument\0"
1.1.1.2 ! misho 498: "this version of PCRE is not compiled with Unicode property support\0"
1.1 misho 499: "\\c must be followed by an ASCII character\0"
1.1.1.2 ! misho 500: "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
! 501: /* 70 */
! 502: "internal error: unknown opcode in find_fixedlength()\0"
! 503: "\\N is not supported in a class\0"
! 504: "too many forward references\0"
! 505: "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
! 506: "invalid UTF-16 string\0"
! 507: /* 75 */
! 508: "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
! 509: "character value in \\u.... sequence is too large\0"
! 510: "invalid UTF-32 string\0"
1.1 misho 511: ;
512:
513: /* Table to identify digits and hex digits. This is used when compiling
514: patterns. Note that the tables in chartables are dependent on the locale, and
515: may mark arbitrary characters as digits - but the PCRE compiling code expects
516: to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
517: a private table here. It costs 256 bytes, but it is a lot faster than doing
518: character value tests (at least in some simple cases I timed), and in some
519: applications one wants PCRE to compile efficiently as well as match
520: efficiently.
521:
522: For convenience, we use the same bit definitions as in chartables:
523:
524: 0x04 decimal digit
525: 0x08 hexadecimal digit
526:
527: Then we can use ctype_digit and ctype_xdigit in the code. */
528:
1.1.1.2 ! misho 529: /* Using a simple comparison for decimal numbers rather than a memory read
! 530: is much faster, and the resulting code is simpler (the compiler turns it
! 531: into a subtraction and unsigned comparison). */
! 532:
! 533: #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
! 534:
1.1 misho 535: #ifndef EBCDIC
536:
537: /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
538: UTF-8 mode. */
539:
1.1.1.2 ! misho 540: static const pcre_uint8 digitab[] =
1.1 misho 541: {
542: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
543: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
544: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
545: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
546: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
547: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
548: 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
549: 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
550: 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
551: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
552: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
553: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
554: 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
555: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
556: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
557: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
558: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
559: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
560: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
561: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
562: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
563: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
564: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
565: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
566: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
567: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
568: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
569: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
570: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
571: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
572: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
573: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
574:
575: #else
576:
577: /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
578:
1.1.1.2 ! misho 579: static const pcre_uint8 digitab[] =
1.1 misho 580: {
581: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
582: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
583: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
584: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
585: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
586: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
587: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
588: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
589: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
590: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
591: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
592: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
593: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
594: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
595: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
596: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
597: 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
598: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
599: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
600: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
601: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
602: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
603: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
604: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
605: 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
606: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
607: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
608: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
609: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
610: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
611: 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
612: 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
613:
1.1.1.2 ! misho 614: static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
1.1 misho 615: 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
616: 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
617: 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
618: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
619: 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
620: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
621: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
622: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
623: 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
624: 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
625: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
626: 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
627: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
628: 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
629: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
630: 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
631: 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
632: 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
633: 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
634: 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
635: 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
636: 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
637: 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
638: 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
639: 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
640: 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
641: 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
642: 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
643: 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
644: 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
645: 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
646: 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
647: #endif
648:
649:
650:
651: /*************************************************
652: * Find an error text *
653: *************************************************/
654:
655: /* The error texts are now all in one long string, to save on relocations. As
656: some of the text is of unknown length, we can't use a table of offsets.
657: Instead, just count through the strings. This is not a performance issue
658: because it happens only when there has been a compilation error.
659:
660: Argument: the error number
661: Returns: pointer to the error string
662: */
663:
664: static const char *
665: find_error_text(int n)
666: {
667: const char *s = error_texts;
668: for (; n > 0; n--)
669: {
1.1.1.2 ! misho 670: while (*s++ != CHAR_NULL) {};
! 671: if (*s == CHAR_NULL) return "Error text not found (please report)";
1.1 misho 672: }
673: return s;
674: }
675:
676:
677: /*************************************************
1.1.1.2 ! misho 678: * Expand the workspace *
! 679: *************************************************/
! 680:
! 681: /* This function is called during the second compiling phase, if the number of
! 682: forward references fills the existing workspace, which is originally a block on
! 683: the stack. A larger block is obtained from malloc() unless the ultimate limit
! 684: has been reached or the increase will be rather small.
! 685:
! 686: Argument: pointer to the compile data block
! 687: Returns: 0 if all went well, else an error number
! 688: */
! 689:
! 690: static int
! 691: expand_workspace(compile_data *cd)
! 692: {
! 693: pcre_uchar *newspace;
! 694: int newsize = cd->workspace_size * 2;
! 695:
! 696: if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
! 697: if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
! 698: newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
! 699: return ERR72;
! 700:
! 701: newspace = (PUBL(malloc))(IN_UCHARS(newsize));
! 702: if (newspace == NULL) return ERR21;
! 703: memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
! 704: cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
! 705: if (cd->workspace_size > COMPILE_WORK_SIZE)
! 706: (PUBL(free))((void *)cd->start_workspace);
! 707: cd->start_workspace = newspace;
! 708: cd->workspace_size = newsize;
! 709: return 0;
! 710: }
! 711:
! 712:
! 713:
! 714: /*************************************************
! 715: * Check for counted repeat *
! 716: *************************************************/
! 717:
! 718: /* This function is called when a '{' is encountered in a place where it might
! 719: start a quantifier. It looks ahead to see if it really is a quantifier or not.
! 720: It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
! 721: where the ddds are digits.
! 722:
! 723: Arguments:
! 724: p pointer to the first char after '{'
! 725:
! 726: Returns: TRUE or FALSE
! 727: */
! 728:
! 729: static BOOL
! 730: is_counted_repeat(const pcre_uchar *p)
! 731: {
! 732: if (!IS_DIGIT(*p)) return FALSE;
! 733: p++;
! 734: while (IS_DIGIT(*p)) p++;
! 735: if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
! 736:
! 737: if (*p++ != CHAR_COMMA) return FALSE;
! 738: if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
! 739:
! 740: if (!IS_DIGIT(*p)) return FALSE;
! 741: p++;
! 742: while (IS_DIGIT(*p)) p++;
! 743:
! 744: return (*p == CHAR_RIGHT_CURLY_BRACKET);
! 745: }
! 746:
! 747:
! 748:
! 749: /*************************************************
1.1 misho 750: * Handle escapes *
751: *************************************************/
752:
753: /* This function is called when a \ has been encountered. It either returns a
1.1.1.2 ! misho 754: positive value for a simple escape such as \n, or 0 for a data character
! 755: which will be placed in chptr. A backreference to group n is returned as
! 756: negative n. When UTF-8 is enabled, a positive value greater than 255 may
! 757: be returned in chptr.
! 758: On entry,ptr is pointing at the \. On exit, it is on the final character of the
! 759: escape sequence.
1.1 misho 760:
761: Arguments:
762: ptrptr points to the pattern position pointer
1.1.1.2 ! misho 763: chptr points to the data character
1.1 misho 764: errorcodeptr points to the errorcode variable
765: bracount number of previous extracting brackets
766: options the options bits
767: isclass TRUE if inside a character class
768:
1.1.1.2 ! misho 769: Returns: zero => a data character
! 770: positive => a special escape sequence
! 771: negative => a back reference
1.1 misho 772: on error, errorcodeptr is set
773: */
774:
775: static int
1.1.1.2 ! misho 776: check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
! 777: int bracount, int options, BOOL isclass)
1.1 misho 778: {
1.1.1.2 ! misho 779: /* PCRE_UTF16 has the same value as PCRE_UTF8. */
! 780: BOOL utf = (options & PCRE_UTF8) != 0;
! 781: const pcre_uchar *ptr = *ptrptr + 1;
! 782: pcre_uint32 c;
! 783: int escape = 0;
! 784: int i;
1.1 misho 785:
786: GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
787: ptr--; /* Set pointer back to the last byte */
788:
789: /* If backslash is at the end of the pattern, it's an error. */
790:
1.1.1.2 ! misho 791: if (c == CHAR_NULL) *errorcodeptr = ERR1;
1.1 misho 792:
793: /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
794: in a table. A non-zero result is something that can be returned immediately.
795: Otherwise further processing may be required. */
796:
797: #ifndef EBCDIC /* ASCII/UTF-8 coding */
1.1.1.2 ! misho 798: /* Not alphanumeric */
! 799: else if (c < CHAR_0 || c > CHAR_z) {}
! 800: else if ((i = escapes[c - CHAR_0]) != 0) { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1.1 misho 801:
802: #else /* EBCDIC coding */
1.1.1.2 ! misho 803: /* Not alphanumeric */
! 804: else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
! 805: else if ((i = escapes[c - 0x48]) != 0) { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1.1 misho 806: #endif
807:
808: /* Escapes that need further processing, or are illegal. */
809:
810: else
811: {
1.1.1.2 ! misho 812: const pcre_uchar *oldptr;
! 813: BOOL braced, negated, overflow;
! 814: int s;
1.1 misho 815:
816: switch (c)
817: {
818: /* A number of Perl escapes are not handled by PCRE. We give an explicit
819: error. */
820:
821: case CHAR_l:
822: case CHAR_L:
1.1.1.2 ! misho 823: *errorcodeptr = ERR37;
! 824: break;
! 825:
1.1 misho 826: case CHAR_u:
1.1.1.2 ! misho 827: if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
! 828: {
! 829: /* In JavaScript, \u must be followed by four hexadecimal numbers.
! 830: Otherwise it is a lowercase u letter. */
! 831: if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
! 832: && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
! 833: && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
! 834: && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
! 835: {
! 836: c = 0;
! 837: for (i = 0; i < 4; ++i)
! 838: {
! 839: register pcre_uint32 cc = *(++ptr);
! 840: #ifndef EBCDIC /* ASCII/UTF-8 coding */
! 841: if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
! 842: c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
! 843: #else /* EBCDIC coding */
! 844: if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
! 845: c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
! 846: #endif
! 847: }
! 848:
! 849: #if defined COMPILE_PCRE8
! 850: if (c > (utf ? 0x10ffff : 0xff))
! 851: #elif defined COMPILE_PCRE16
! 852: if (c > (utf ? 0x10ffff : 0xffff))
! 853: #elif defined COMPILE_PCRE32
! 854: if (utf && c > 0x10ffff)
! 855: #endif
! 856: {
! 857: *errorcodeptr = ERR76;
! 858: }
! 859: else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
! 860: }
! 861: }
! 862: else
! 863: *errorcodeptr = ERR37;
! 864: break;
! 865:
1.1 misho 866: case CHAR_U:
1.1.1.2 ! misho 867: /* In JavaScript, \U is an uppercase U letter. */
! 868: if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
1.1 misho 869: break;
870:
1.1.1.2 ! misho 871: /* In a character class, \g is just a literal "g". Outside a character
! 872: class, \g must be followed by one of a number of specific things:
1.1 misho 873:
874: (1) A number, either plain or braced. If positive, it is an absolute
875: backreference. If negative, it is a relative backreference. This is a Perl
876: 5.10 feature.
877:
878: (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
879: is part of Perl's movement towards a unified syntax for back references. As
880: this is synonymous with \k{name}, we fudge it up by pretending it really
881: was \k.
882:
883: (3) For Oniguruma compatibility we also support \g followed by a name or a
884: number either in angle brackets or in single quotes. However, these are
885: (possibly recursive) subroutine calls, _not_ backreferences. Just return
1.1.1.2 ! misho 886: the ESC_g code (cf \k). */
1.1 misho 887:
888: case CHAR_g:
1.1.1.2 ! misho 889: if (isclass) break;
1.1 misho 890: if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
891: {
1.1.1.2 ! misho 892: escape = ESC_g;
1.1 misho 893: break;
894: }
895:
896: /* Handle the Perl-compatible cases */
897:
898: if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
899: {
1.1.1.2 ! misho 900: const pcre_uchar *p;
! 901: for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
! 902: if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
! 903: if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1.1 misho 904: {
1.1.1.2 ! misho 905: escape = ESC_k;
1.1 misho 906: break;
907: }
908: braced = TRUE;
909: ptr++;
910: }
911: else braced = FALSE;
912:
913: if (ptr[1] == CHAR_MINUS)
914: {
915: negated = TRUE;
916: ptr++;
917: }
918: else negated = FALSE;
919:
1.1.1.2 ! misho 920: /* The integer range is limited by the machine's int representation. */
! 921: s = 0;
! 922: overflow = FALSE;
! 923: while (IS_DIGIT(ptr[1]))
1.1 misho 924: {
1.1.1.2 ! misho 925: if (s > INT_MAX / 10 - 1) /* Integer overflow */
! 926: {
! 927: overflow = TRUE;
! 928: break;
! 929: }
! 930: s = s * 10 + (int)(*(++ptr) - CHAR_0);
! 931: }
! 932: if (overflow) /* Integer overflow */
! 933: {
! 934: while (IS_DIGIT(ptr[1]))
! 935: ptr++;
1.1 misho 936: *errorcodeptr = ERR61;
937: break;
938: }
939:
940: if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
941: {
942: *errorcodeptr = ERR57;
943: break;
944: }
945:
1.1.1.2 ! misho 946: if (s == 0)
1.1 misho 947: {
948: *errorcodeptr = ERR58;
949: break;
950: }
951:
952: if (negated)
953: {
1.1.1.2 ! misho 954: if (s > bracount)
1.1 misho 955: {
956: *errorcodeptr = ERR15;
957: break;
958: }
1.1.1.2 ! misho 959: s = bracount - (s - 1);
1.1 misho 960: }
961:
1.1.1.2 ! misho 962: escape = -s;
1.1 misho 963: break;
964:
965: /* The handling of escape sequences consisting of a string of digits
966: starting with one that is not zero is not straightforward. By experiment,
967: the way Perl works seems to be as follows:
968:
969: Outside a character class, the digits are read as a decimal number. If the
970: number is less than 10, or if there are that many previous extracting
971: left brackets, then it is a back reference. Otherwise, up to three octal
972: digits are read to form an escaped byte. Thus \123 is likely to be octal
973: 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
974: value is greater than 377, the least significant 8 bits are taken. Inside a
975: character class, \ followed by a digit is always an octal number. */
976:
977: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
978: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
979:
980: if (!isclass)
981: {
982: oldptr = ptr;
1.1.1.2 ! misho 983: /* The integer range is limited by the machine's int representation. */
! 984: s = (int)(c -CHAR_0);
! 985: overflow = FALSE;
! 986: while (IS_DIGIT(ptr[1]))
! 987: {
! 988: if (s > INT_MAX / 10 - 1) /* Integer overflow */
! 989: {
! 990: overflow = TRUE;
! 991: break;
! 992: }
! 993: s = s * 10 + (int)(*(++ptr) - CHAR_0);
! 994: }
! 995: if (overflow) /* Integer overflow */
1.1 misho 996: {
1.1.1.2 ! misho 997: while (IS_DIGIT(ptr[1]))
! 998: ptr++;
1.1 misho 999: *errorcodeptr = ERR61;
1000: break;
1001: }
1.1.1.2 ! misho 1002: if (s < 10 || s <= bracount)
1.1 misho 1003: {
1.1.1.2 ! misho 1004: escape = -s;
1.1 misho 1005: break;
1006: }
1007: ptr = oldptr; /* Put the pointer back and fall through */
1008: }
1009:
1010: /* Handle an octal number following \. If the first digit is 8 or 9, Perl
1011: generates a binary zero byte and treats the digit as a following literal.
1012: Thus we have to pull back the pointer by one. */
1013:
1014: if ((c = *ptr) >= CHAR_8)
1015: {
1016: ptr--;
1017: c = 0;
1018: break;
1019: }
1020:
1021: /* \0 always starts an octal number, but we may drop through to here with a
1022: larger first octal digit. The original code used just to take the least
1023: significant 8 bits of octal numbers (I think this is what early Perls used
1.1.1.2 ! misho 1024: to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
! 1025: but no more than 3 octal digits. */
1.1 misho 1026:
1027: case CHAR_0:
1028: c -= CHAR_0;
1029: while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1030: c = c * 8 + *(++ptr) - CHAR_0;
1.1.1.2 ! misho 1031: #ifdef COMPILE_PCRE8
! 1032: if (!utf && c > 0xff) *errorcodeptr = ERR51;
! 1033: #endif
1.1 misho 1034: break;
1035:
1036: /* \x is complicated. \x{ddd} is a character number which can be greater
1.1.1.2 ! misho 1037: than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
! 1038: If not, { is treated as a data character. */
1.1 misho 1039:
1040: case CHAR_x:
1.1.1.2 ! misho 1041: if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
! 1042: {
! 1043: /* In JavaScript, \x must be followed by two hexadecimal numbers.
! 1044: Otherwise it is a lowercase x letter. */
! 1045: if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
! 1046: && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
! 1047: {
! 1048: c = 0;
! 1049: for (i = 0; i < 2; ++i)
! 1050: {
! 1051: register pcre_uint32 cc = *(++ptr);
! 1052: #ifndef EBCDIC /* ASCII/UTF-8 coding */
! 1053: if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
! 1054: c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
! 1055: #else /* EBCDIC coding */
! 1056: if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
! 1057: c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
! 1058: #endif
! 1059: }
! 1060: }
! 1061: break;
! 1062: }
! 1063:
1.1 misho 1064: if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1065: {
1.1.1.2 ! misho 1066: const pcre_uchar *pt = ptr + 2;
1.1 misho 1067:
1068: c = 0;
1.1.1.2 ! misho 1069: overflow = FALSE;
! 1070: while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)
1.1 misho 1071: {
1.1.1.2 ! misho 1072: register pcre_uint32 cc = *pt++;
1.1 misho 1073: if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1.1.1.2 ! misho 1074:
! 1075: #ifdef COMPILE_PCRE32
! 1076: if (c >= 0x10000000l) { overflow = TRUE; break; }
! 1077: #endif
1.1 misho 1078:
1079: #ifndef EBCDIC /* ASCII/UTF-8 coding */
1080: if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1081: c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1082: #else /* EBCDIC coding */
1083: if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1084: c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1085: #endif
1.1.1.2 ! misho 1086:
! 1087: #if defined COMPILE_PCRE8
! 1088: if (c > (utf ? 0x10ffff : 0xff)) { overflow = TRUE; break; }
! 1089: #elif defined COMPILE_PCRE16
! 1090: if (c > (utf ? 0x10ffff : 0xffff)) { overflow = TRUE; break; }
! 1091: #elif defined COMPILE_PCRE32
! 1092: if (utf && c > 0x10ffff) { overflow = TRUE; break; }
! 1093: #endif
! 1094: }
! 1095:
! 1096: if (overflow)
! 1097: {
! 1098: while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;
! 1099: *errorcodeptr = ERR34;
1.1 misho 1100: }
1101:
1102: if (*pt == CHAR_RIGHT_CURLY_BRACKET)
1103: {
1.1.1.2 ! misho 1104: if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1.1 misho 1105: ptr = pt;
1106: break;
1107: }
1108:
1109: /* If the sequence of hex digits does not end with '}', then we don't
1110: recognize this construct; fall through to the normal \x handling. */
1111: }
1112:
1113: /* Read just a single-byte hex-defined char */
1114:
1115: c = 0;
1.1.1.2 ! misho 1116: while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1.1 misho 1117: {
1.1.1.2 ! misho 1118: pcre_uint32 cc; /* Some compilers don't like */
1.1 misho 1119: cc = *(++ptr); /* ++ in initializers */
1120: #ifndef EBCDIC /* ASCII/UTF-8 coding */
1121: if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1122: c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1123: #else /* EBCDIC coding */
1124: if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
1125: c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1126: #endif
1127: }
1128: break;
1129:
1130: /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1131: An error is given if the byte following \c is not an ASCII character. This
1132: coding is ASCII-specific, but then the whole concept of \cx is
1133: ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1134:
1135: case CHAR_c:
1136: c = *(++ptr);
1.1.1.2 ! misho 1137: if (c == CHAR_NULL)
1.1 misho 1138: {
1139: *errorcodeptr = ERR2;
1140: break;
1141: }
1142: #ifndef EBCDIC /* ASCII/UTF-8 coding */
1143: if (c > 127) /* Excludes all non-ASCII in either mode */
1144: {
1145: *errorcodeptr = ERR68;
1146: break;
1147: }
1148: if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1149: c ^= 0x40;
1150: #else /* EBCDIC coding */
1151: if (c >= CHAR_a && c <= CHAR_z) c += 64;
1152: c ^= 0xC0;
1153: #endif
1154: break;
1155:
1156: /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1157: other alphanumeric following \ is an error if PCRE_EXTRA was set;
1158: otherwise, for Perl compatibility, it is a literal. This code looks a bit
1159: odd, but there used to be some cases other than the default, and there may
1160: be again in future, so I haven't "optimized" it. */
1161:
1162: default:
1163: if ((options & PCRE_EXTRA) != 0) switch(c)
1164: {
1165: default:
1166: *errorcodeptr = ERR3;
1167: break;
1168: }
1169: break;
1170: }
1171: }
1172:
1173: /* Perl supports \N{name} for character names, as well as plain \N for "not
1.1.1.2 ! misho 1174: newline". PCRE does not support \N{name}. However, it does support
! 1175: quantification such as \N{2,3}. */
1.1 misho 1176:
1.1.1.2 ! misho 1177: if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
! 1178: !is_counted_repeat(ptr+2))
1.1 misho 1179: *errorcodeptr = ERR37;
1180:
1181: /* If PCRE_UCP is set, we change the values for \d etc. */
1182:
1.1.1.2 ! misho 1183: if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
! 1184: escape += (ESC_DU - ESC_D);
1.1 misho 1185:
1186: /* Set the pointer to the final character before returning. */
1187:
1188: *ptrptr = ptr;
1.1.1.2 ! misho 1189: *chptr = c;
! 1190: return escape;
1.1 misho 1191: }
1192:
1193: #ifdef SUPPORT_UCP
1194: /*************************************************
1195: * Handle \P and \p *
1196: *************************************************/
1197:
1198: /* This function is called after \P or \p has been encountered, provided that
1199: PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1200: pointing at the P or p. On exit, it is pointing at the final character of the
1201: escape sequence.
1202:
1203: Argument:
1204: ptrptr points to the pattern position pointer
1205: negptr points to a boolean that is set TRUE for negation else FALSE
1.1.1.2 ! misho 1206: ptypeptr points to an unsigned int that is set to the type value
! 1207: pdataptr points to an unsigned int that is set to the detailed property value
1.1 misho 1208: errorcodeptr points to the error code variable
1209:
1.1.1.2 ! misho 1210: Returns: TRUE if the type value was found, or FALSE for an invalid type
1.1 misho 1211: */
1212:
1.1.1.2 ! misho 1213: static BOOL
! 1214: get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
! 1215: unsigned int *pdataptr, int *errorcodeptr)
1.1 misho 1216: {
1.1.1.2 ! misho 1217: pcre_uchar c;
! 1218: int i, bot, top;
! 1219: const pcre_uchar *ptr = *ptrptr;
! 1220: pcre_uchar name[32];
1.1 misho 1221:
1222: c = *(++ptr);
1.1.1.2 ! misho 1223: if (c == CHAR_NULL) goto ERROR_RETURN;
1.1 misho 1224:
1225: *negptr = FALSE;
1226:
1227: /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1228: negation. */
1229:
1230: if (c == CHAR_LEFT_CURLY_BRACKET)
1231: {
1232: if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1233: {
1234: *negptr = TRUE;
1235: ptr++;
1236: }
1.1.1.2 ! misho 1237: for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1.1 misho 1238: {
1239: c = *(++ptr);
1.1.1.2 ! misho 1240: if (c == CHAR_NULL) goto ERROR_RETURN;
1.1 misho 1241: if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1242: name[i] = c;
1243: }
1244: if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1245: name[i] = 0;
1246: }
1247:
1248: /* Otherwise there is just one following character */
1249:
1250: else
1251: {
1252: name[0] = c;
1253: name[1] = 0;
1254: }
1255:
1256: *ptrptr = ptr;
1257:
1258: /* Search for a recognized property name using binary chop */
1259:
1260: bot = 0;
1.1.1.2 ! misho 1261: top = PRIV(utt_size);
1.1 misho 1262:
1263: while (bot < top)
1264: {
1.1.1.2 ! misho 1265: int r;
1.1 misho 1266: i = (bot + top) >> 1;
1.1.1.2 ! misho 1267: r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
! 1268: if (r == 0)
1.1 misho 1269: {
1.1.1.2 ! misho 1270: *ptypeptr = PRIV(utt)[i].type;
! 1271: *pdataptr = PRIV(utt)[i].value;
! 1272: return TRUE;
1.1 misho 1273: }
1.1.1.2 ! misho 1274: if (r > 0) bot = i + 1; else top = i;
1.1 misho 1275: }
1276:
1277: *errorcodeptr = ERR47;
1278: *ptrptr = ptr;
1.1.1.2 ! misho 1279: return FALSE;
1.1 misho 1280:
1281: ERROR_RETURN:
1282: *errorcodeptr = ERR46;
1283: *ptrptr = ptr;
1.1.1.2 ! misho 1284: return FALSE;
1.1 misho 1285: }
1286: #endif
1287:
1288:
1289:
1290:
1291: /*************************************************
1292: * Read repeat counts *
1293: *************************************************/
1294:
1295: /* Read an item of the form {n,m} and return the values. This is called only
1296: after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1297: so the syntax is guaranteed to be correct, but we need to check the values.
1298:
1299: Arguments:
1300: p pointer to first char after '{'
1301: minp pointer to int for min
1302: maxp pointer to int for max
1303: returned as -1 if no max
1304: errorcodeptr points to error code variable
1305:
1306: Returns: pointer to '}' on success;
1307: current ptr on error, with errorcodeptr set non-zero
1308: */
1309:
1.1.1.2 ! misho 1310: static const pcre_uchar *
! 1311: read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1.1 misho 1312: {
1313: int min = 0;
1314: int max = -1;
1315:
1316: /* Read the minimum value and do a paranoid check: a negative value indicates
1317: an integer overflow. */
1318:
1.1.1.2 ! misho 1319: while (IS_DIGIT(*p)) min = min * 10 + (int)(*p++ - CHAR_0);
1.1 misho 1320: if (min < 0 || min > 65535)
1321: {
1322: *errorcodeptr = ERR5;
1323: return p;
1324: }
1325:
1326: /* Read the maximum value if there is one, and again do a paranoid on its size.
1327: Also, max must not be less than min. */
1328:
1329: if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1330: {
1331: if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1332: {
1333: max = 0;
1.1.1.2 ! misho 1334: while(IS_DIGIT(*p)) max = max * 10 + (int)(*p++ - CHAR_0);
1.1 misho 1335: if (max < 0 || max > 65535)
1336: {
1337: *errorcodeptr = ERR5;
1338: return p;
1339: }
1340: if (max < min)
1341: {
1342: *errorcodeptr = ERR4;
1343: return p;
1344: }
1345: }
1346: }
1347:
1348: /* Fill in the required variables, and pass back the pointer to the terminating
1349: '}'. */
1350:
1351: *minp = min;
1352: *maxp = max;
1353: return p;
1354: }
1355:
1356:
1357:
1358: /*************************************************
1359: * Subroutine for finding forward reference *
1360: *************************************************/
1361:
1362: /* This recursive function is called only from find_parens() below. The
1363: top-level call starts at the beginning of the pattern. All other calls must
1364: start at a parenthesis. It scans along a pattern's text looking for capturing
1365: subpatterns, and counting them. If it finds a named pattern that matches the
1366: name it is given, it returns its number. Alternatively, if the name is NULL, it
1367: returns when it reaches a given numbered subpattern. Recursion is used to keep
1368: track of subpatterns that reset the capturing group numbers - the (?| feature.
1369:
1370: This function was originally called only from the second pass, in which we know
1371: that if (?< or (?' or (?P< is encountered, the name will be correctly
1372: terminated because that is checked in the first pass. There is now one call to
1373: this function in the first pass, to check for a recursive back reference by
1374: name (so that we can make the whole group atomic). In this case, we need check
1375: only up to the current position in the pattern, and that is still OK because
1376: and previous occurrences will have been checked. To make this work, the test
1377: for "end of pattern" is a check against cd->end_pattern in the main loop,
1378: instead of looking for a binary zero. This means that the special first-pass
1379: call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1380: processing items within the loop are OK, because afterwards the main loop will
1381: terminate.)
1382:
1383: Arguments:
1384: ptrptr address of the current character pointer (updated)
1385: cd compile background data
1386: name name to seek, or NULL if seeking a numbered subpattern
1387: lorn name length, or subpattern number if name is NULL
1388: xmode TRUE if we are in /x mode
1.1.1.2 ! misho 1389: utf TRUE if we are in UTF-8 / UTF-16 / UTF-32 mode
1.1 misho 1390: count pointer to the current capturing subpattern number (updated)
1391:
1392: Returns: the number of the named subpattern, or -1 if not found
1393: */
1394:
1395: static int
1.1.1.2 ! misho 1396: find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
! 1397: BOOL xmode, BOOL utf, int *count)
1.1 misho 1398: {
1.1.1.2 ! misho 1399: pcre_uchar *ptr = *ptrptr;
1.1 misho 1400: int start_count = *count;
1401: int hwm_count = start_count;
1402: BOOL dup_parens = FALSE;
1403:
1404: /* If the first character is a parenthesis, check on the type of group we are
1405: dealing with. The very first call may not start with a parenthesis. */
1406:
1407: if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1408: {
1409: /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1410:
1411: if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1412:
1413: /* Handle a normal, unnamed capturing parenthesis. */
1414:
1415: else if (ptr[1] != CHAR_QUESTION_MARK)
1416: {
1417: *count += 1;
1418: if (name == NULL && *count == lorn) return *count;
1419: ptr++;
1420: }
1421:
1422: /* All cases now have (? at the start. Remember when we are in a group
1423: where the parenthesis numbers are duplicated. */
1424:
1425: else if (ptr[2] == CHAR_VERTICAL_LINE)
1426: {
1427: ptr += 3;
1428: dup_parens = TRUE;
1429: }
1430:
1431: /* Handle comments; all characters are allowed until a ket is reached. */
1432:
1433: else if (ptr[2] == CHAR_NUMBER_SIGN)
1434: {
1.1.1.2 ! misho 1435: for (ptr += 3; *ptr != CHAR_NULL; ptr++)
! 1436: if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1.1 misho 1437: goto FAIL_EXIT;
1438: }
1439:
1440: /* Handle a condition. If it is an assertion, just carry on so that it
1441: is processed as normal. If not, skip to the closing parenthesis of the
1442: condition (there can't be any nested parens). */
1443:
1444: else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1445: {
1446: ptr += 2;
1447: if (ptr[1] != CHAR_QUESTION_MARK)
1448: {
1.1.1.2 ! misho 1449: while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
! 1450: if (*ptr != CHAR_NULL) ptr++;
1.1 misho 1451: }
1452: }
1453:
1454: /* Start with (? but not a condition. */
1455:
1456: else
1457: {
1458: ptr += 2;
1459: if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1460:
1461: /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1462:
1463: if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1464: ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1465: {
1.1.1.2 ! misho 1466: pcre_uchar term;
! 1467: const pcre_uchar *thisname;
1.1 misho 1468: *count += 1;
1469: if (name == NULL && *count == lorn) return *count;
1470: term = *ptr++;
1471: if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1472: thisname = ptr;
1473: while (*ptr != term) ptr++;
1.1.1.2 ! misho 1474: if (name != NULL && lorn == (int)(ptr - thisname) &&
! 1475: STRNCMP_UC_UC(name, thisname, (unsigned int)lorn) == 0)
1.1 misho 1476: return *count;
1477: term++;
1478: }
1479: }
1480: }
1481:
1482: /* Past any initial parenthesis handling, scan for parentheses or vertical
1483: bars. Stop if we get to cd->end_pattern. Note that this is important for the
1484: first-pass call when this value is temporarily adjusted to stop at the current
1485: position. So DO NOT change this to a test for binary zero. */
1486:
1487: for (; ptr < cd->end_pattern; ptr++)
1488: {
1489: /* Skip over backslashed characters and also entire \Q...\E */
1490:
1491: if (*ptr == CHAR_BACKSLASH)
1492: {
1.1.1.2 ! misho 1493: if (*(++ptr) == CHAR_NULL) goto FAIL_EXIT;
1.1 misho 1494: if (*ptr == CHAR_Q) for (;;)
1495: {
1.1.1.2 ! misho 1496: while (*(++ptr) != CHAR_NULL && *ptr != CHAR_BACKSLASH) {};
! 1497: if (*ptr == CHAR_NULL) goto FAIL_EXIT;
1.1 misho 1498: if (*(++ptr) == CHAR_E) break;
1499: }
1500: continue;
1501: }
1502:
1503: /* Skip over character classes; this logic must be similar to the way they
1504: are handled for real. If the first character is '^', skip it. Also, if the
1505: first few characters (either before or after ^) are \Q\E or \E we skip them
1506: too. This makes for compatibility with Perl. Note the use of STR macros to
1507: encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1508:
1509: if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1510: {
1511: BOOL negate_class = FALSE;
1512: for (;;)
1513: {
1514: if (ptr[1] == CHAR_BACKSLASH)
1515: {
1516: if (ptr[2] == CHAR_E)
1517: ptr+= 2;
1.1.1.2 ! misho 1518: else if (STRNCMP_UC_C8(ptr + 2,
1.1 misho 1519: STR_Q STR_BACKSLASH STR_E, 3) == 0)
1520: ptr += 4;
1521: else
1522: break;
1523: }
1524: else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1525: {
1526: negate_class = TRUE;
1527: ptr++;
1528: }
1529: else break;
1530: }
1531:
1532: /* If the next character is ']', it is a data character that must be
1533: skipped, except in JavaScript compatibility mode. */
1534:
1535: if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1536: (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1537: ptr++;
1538:
1539: while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1540: {
1.1.1.2 ! misho 1541: if (*ptr == CHAR_NULL) return -1;
1.1 misho 1542: if (*ptr == CHAR_BACKSLASH)
1543: {
1.1.1.2 ! misho 1544: if (*(++ptr) == CHAR_NULL) goto FAIL_EXIT;
1.1 misho 1545: if (*ptr == CHAR_Q) for (;;)
1546: {
1.1.1.2 ! misho 1547: while (*(++ptr) != CHAR_NULL && *ptr != CHAR_BACKSLASH) {};
! 1548: if (*ptr == CHAR_NULL) goto FAIL_EXIT;
1.1 misho 1549: if (*(++ptr) == CHAR_E) break;
1550: }
1551: continue;
1552: }
1553: }
1554: continue;
1555: }
1556:
1557: /* Skip comments in /x mode */
1558:
1559: if (xmode && *ptr == CHAR_NUMBER_SIGN)
1560: {
1561: ptr++;
1.1.1.2 ! misho 1562: while (*ptr != CHAR_NULL)
1.1 misho 1563: {
1564: if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1565: ptr++;
1.1.1.2 ! misho 1566: #ifdef SUPPORT_UTF
! 1567: if (utf) FORWARDCHAR(ptr);
1.1 misho 1568: #endif
1569: }
1.1.1.2 ! misho 1570: if (*ptr == CHAR_NULL) goto FAIL_EXIT;
1.1 misho 1571: continue;
1572: }
1573:
1574: /* Check for the special metacharacters */
1575:
1576: if (*ptr == CHAR_LEFT_PARENTHESIS)
1577: {
1.1.1.2 ! misho 1578: int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);
1.1 misho 1579: if (rc > 0) return rc;
1.1.1.2 ! misho 1580: if (*ptr == CHAR_NULL) goto FAIL_EXIT;
1.1 misho 1581: }
1582:
1583: else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1584: {
1585: if (dup_parens && *count < hwm_count) *count = hwm_count;
1586: goto FAIL_EXIT;
1587: }
1588:
1589: else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1590: {
1591: if (*count > hwm_count) hwm_count = *count;
1592: *count = start_count;
1593: }
1594: }
1595:
1596: FAIL_EXIT:
1597: *ptrptr = ptr;
1598: return -1;
1599: }
1600:
1601:
1602:
1603:
1604: /*************************************************
1605: * Find forward referenced subpattern *
1606: *************************************************/
1607:
1608: /* This function scans along a pattern's text looking for capturing
1609: subpatterns, and counting them. If it finds a named pattern that matches the
1610: name it is given, it returns its number. Alternatively, if the name is NULL, it
1611: returns when it reaches a given numbered subpattern. This is used for forward
1612: references to subpatterns. We used to be able to start this scan from the
1613: current compiling point, using the current count value from cd->bracount, and
1614: do it all in a single loop, but the addition of the possibility of duplicate
1615: subpattern numbers means that we have to scan from the very start, in order to
1616: take account of such duplicates, and to use a recursive function to keep track
1617: of the different types of group.
1618:
1619: Arguments:
1620: cd compile background data
1621: name name to seek, or NULL if seeking a numbered subpattern
1622: lorn name length, or subpattern number if name is NULL
1623: xmode TRUE if we are in /x mode
1.1.1.2 ! misho 1624: utf TRUE if we are in UTF-8 / UTF-16 / UTF-32 mode
1.1 misho 1625:
1626: Returns: the number of the found subpattern, or -1 if not found
1627: */
1628:
1629: static int
1.1.1.2 ! misho 1630: find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
! 1631: BOOL utf)
1.1 misho 1632: {
1.1.1.2 ! misho 1633: pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
1.1 misho 1634: int count = 0;
1635: int rc;
1636:
1637: /* If the pattern does not start with an opening parenthesis, the first call
1638: to find_parens_sub() will scan right to the end (if necessary). However, if it
1639: does start with a parenthesis, find_parens_sub() will return when it hits the
1640: matching closing parens. That is why we have to have a loop. */
1641:
1642: for (;;)
1643: {
1.1.1.2 ! misho 1644: rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);
! 1645: if (rc > 0 || *ptr++ == CHAR_NULL) break;
1.1 misho 1646: }
1647:
1648: return rc;
1649: }
1650:
1651:
1652:
1653:
1654: /*************************************************
1655: * Find first significant op code *
1656: *************************************************/
1657:
1658: /* This is called by several functions that scan a compiled expression looking
1659: for a fixed first character, or an anchoring op code etc. It skips over things
1.1.1.2 ! misho 1660: that do not influence this. For some calls, it makes sense to skip negative
! 1661: forward and all backward assertions, and also the \b assertion; for others it
! 1662: does not.
1.1 misho 1663:
1664: Arguments:
1665: code pointer to the start of the group
1666: skipassert TRUE if certain assertions are to be skipped
1667:
1668: Returns: pointer to the first significant opcode
1669: */
1670:
1.1.1.2 ! misho 1671: static const pcre_uchar*
! 1672: first_significant_code(const pcre_uchar *code, BOOL skipassert)
1.1 misho 1673: {
1674: for (;;)
1675: {
1676: switch ((int)*code)
1677: {
1678: case OP_ASSERT_NOT:
1679: case OP_ASSERTBACK:
1680: case OP_ASSERTBACK_NOT:
1681: if (!skipassert) return code;
1682: do code += GET(code, 1); while (*code == OP_ALT);
1.1.1.2 ! misho 1683: code += PRIV(OP_lengths)[*code];
1.1 misho 1684: break;
1685:
1686: case OP_WORD_BOUNDARY:
1687: case OP_NOT_WORD_BOUNDARY:
1688: if (!skipassert) return code;
1689: /* Fall through */
1690:
1691: case OP_CALLOUT:
1692: case OP_CREF:
1693: case OP_NCREF:
1694: case OP_RREF:
1695: case OP_NRREF:
1696: case OP_DEF:
1.1.1.2 ! misho 1697: code += PRIV(OP_lengths)[*code];
1.1 misho 1698: break;
1699:
1700: default:
1701: return code;
1702: }
1703: }
1704: /* Control never reaches here */
1705: }
1706:
1707:
1708:
1709:
1710: /*************************************************
1711: * Find the fixed length of a branch *
1712: *************************************************/
1713:
1714: /* Scan a branch and compute the fixed length of subject that will match it,
1715: if the length is fixed. This is needed for dealing with backward assertions.
1716: In UTF8 mode, the result is in characters rather than bytes. The branch is
1717: temporarily terminated with OP_END when this function is called.
1718:
1719: This function is called when a backward assertion is encountered, so that if it
1720: fails, the error message can point to the correct place in the pattern.
1721: However, we cannot do this when the assertion contains subroutine calls,
1722: because they can be forward references. We solve this by remembering this case
1723: and doing the check at the end; a flag specifies which mode we are running in.
1724:
1725: Arguments:
1726: code points to the start of the pattern (the bracket)
1.1.1.2 ! misho 1727: utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1.1 misho 1728: atend TRUE if called when the pattern is complete
1729: cd the "compile data" structure
1730:
1731: Returns: the fixed length,
1732: or -1 if there is no fixed length,
1.1.1.2 ! misho 1733: or -2 if \C was encountered (in UTF-8 mode only)
1.1 misho 1734: or -3 if an OP_RECURSE item was encountered and atend is FALSE
1.1.1.2 ! misho 1735: or -4 if an unknown opcode was encountered (internal error)
1.1 misho 1736: */
1737:
1738: static int
1.1.1.2 ! misho 1739: find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1.1 misho 1740: {
1741: int length = -1;
1742:
1743: register int branchlength = 0;
1.1.1.2 ! misho 1744: register pcre_uchar *cc = code + 1 + LINK_SIZE;
1.1 misho 1745:
1746: /* Scan along the opcodes for this branch. If we get to the end of the
1747: branch, check the length against that of the other branches. */
1748:
1749: for (;;)
1750: {
1751: int d;
1.1.1.2 ! misho 1752: pcre_uchar *ce, *cs;
! 1753: register pcre_uchar op = *cc;
! 1754:
1.1 misho 1755: switch (op)
1756: {
1.1.1.2 ! misho 1757: /* We only need to continue for OP_CBRA (normal capturing bracket) and
! 1758: OP_BRA (normal non-capturing bracket) because the other variants of these
! 1759: opcodes are all concerned with unlimited repeated groups, which of course
! 1760: are not of fixed length. */
! 1761:
1.1 misho 1762: case OP_CBRA:
1763: case OP_BRA:
1764: case OP_ONCE:
1.1.1.2 ! misho 1765: case OP_ONCE_NC:
1.1 misho 1766: case OP_COND:
1.1.1.2 ! misho 1767: d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1.1 misho 1768: if (d < 0) return d;
1769: branchlength += d;
1770: do cc += GET(cc, 1); while (*cc == OP_ALT);
1771: cc += 1 + LINK_SIZE;
1772: break;
1773:
1.1.1.2 ! misho 1774: /* Reached end of a branch; if it's a ket it is the end of a nested call.
! 1775: If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
! 1776: an ALT. If it is END it's the end of the outer call. All can be handled by
! 1777: the same code. Note that we must not include the OP_KETRxxx opcodes here,
! 1778: because they all imply an unlimited repeat. */
1.1 misho 1779:
1780: case OP_ALT:
1781: case OP_KET:
1782: case OP_END:
1.1.1.2 ! misho 1783: case OP_ACCEPT:
! 1784: case OP_ASSERT_ACCEPT:
1.1 misho 1785: if (length < 0) length = branchlength;
1786: else if (length != branchlength) return -1;
1787: if (*cc != OP_ALT) return length;
1788: cc += 1 + LINK_SIZE;
1789: branchlength = 0;
1790: break;
1791:
1792: /* A true recursion implies not fixed length, but a subroutine call may
1793: be OK. If the subroutine is a forward reference, we can't deal with
1794: it until the end of the pattern, so return -3. */
1795:
1796: case OP_RECURSE:
1797: if (!atend) return -3;
1.1.1.2 ! misho 1798: cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */
! 1799: do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
! 1800: if (cc > cs && cc < ce) return -1; /* Recursion */
! 1801: d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1.1 misho 1802: if (d < 0) return d;
1803: branchlength += d;
1804: cc += 1 + LINK_SIZE;
1805: break;
1806:
1807: /* Skip over assertive subpatterns */
1808:
1809: case OP_ASSERT:
1810: case OP_ASSERT_NOT:
1811: case OP_ASSERTBACK:
1812: case OP_ASSERTBACK_NOT:
1813: do cc += GET(cc, 1); while (*cc == OP_ALT);
1.1.1.2 ! misho 1814: cc += PRIV(OP_lengths)[*cc];
! 1815: break;
1.1 misho 1816:
1817: /* Skip over things that don't match chars */
1818:
1.1.1.2 ! misho 1819: case OP_MARK:
! 1820: case OP_PRUNE_ARG:
! 1821: case OP_SKIP_ARG:
! 1822: case OP_THEN_ARG:
! 1823: cc += cc[1] + PRIV(OP_lengths)[*cc];
! 1824: break;
! 1825:
! 1826: case OP_CALLOUT:
! 1827: case OP_CIRC:
! 1828: case OP_CIRCM:
! 1829: case OP_CLOSE:
! 1830: case OP_COMMIT:
1.1 misho 1831: case OP_CREF:
1832: case OP_DEF:
1.1.1.2 ! misho 1833: case OP_DOLL:
! 1834: case OP_DOLLM:
1.1 misho 1835: case OP_EOD:
1836: case OP_EODN:
1.1.1.2 ! misho 1837: case OP_FAIL:
! 1838: case OP_NCREF:
! 1839: case OP_NRREF:
1.1 misho 1840: case OP_NOT_WORD_BOUNDARY:
1.1.1.2 ! misho 1841: case OP_PRUNE:
! 1842: case OP_REVERSE:
! 1843: case OP_RREF:
! 1844: case OP_SET_SOM:
! 1845: case OP_SKIP:
! 1846: case OP_SOD:
! 1847: case OP_SOM:
! 1848: case OP_THEN:
1.1 misho 1849: case OP_WORD_BOUNDARY:
1.1.1.2 ! misho 1850: cc += PRIV(OP_lengths)[*cc];
1.1 misho 1851: break;
1852:
1853: /* Handle literal characters */
1854:
1855: case OP_CHAR:
1.1.1.2 ! misho 1856: case OP_CHARI:
1.1 misho 1857: case OP_NOT:
1.1.1.2 ! misho 1858: case OP_NOTI:
1.1 misho 1859: branchlength++;
1860: cc += 2;
1.1.1.2 ! misho 1861: #ifdef SUPPORT_UTF
! 1862: if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1.1 misho 1863: #endif
1864: break;
1865:
1866: /* Handle exact repetitions. The count is already in characters, but we
1867: need to skip over a multibyte character in UTF8 mode. */
1868:
1869: case OP_EXACT:
1.1.1.2 ! misho 1870: case OP_EXACTI:
! 1871: case OP_NOTEXACT:
! 1872: case OP_NOTEXACTI:
! 1873: branchlength += (int)GET2(cc,1);
! 1874: cc += 2 + IMM2_SIZE;
! 1875: #ifdef SUPPORT_UTF
! 1876: if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1.1 misho 1877: #endif
1878: break;
1879:
1880: case OP_TYPEEXACT:
1881: branchlength += GET2(cc,1);
1.1.1.2 ! misho 1882: if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
! 1883: cc += 2;
! 1884: cc += 1 + IMM2_SIZE + 1;
1.1 misho 1885: break;
1886:
1887: /* Handle single-char matchers */
1888:
1889: case OP_PROP:
1890: case OP_NOTPROP:
1891: cc += 2;
1892: /* Fall through */
1893:
1.1.1.2 ! misho 1894: case OP_HSPACE:
! 1895: case OP_VSPACE:
! 1896: case OP_NOT_HSPACE:
! 1897: case OP_NOT_VSPACE:
1.1 misho 1898: case OP_NOT_DIGIT:
1899: case OP_DIGIT:
1900: case OP_NOT_WHITESPACE:
1901: case OP_WHITESPACE:
1902: case OP_NOT_WORDCHAR:
1903: case OP_WORDCHAR:
1904: case OP_ANY:
1905: case OP_ALLANY:
1906: branchlength++;
1907: cc++;
1908: break;
1909:
1.1.1.2 ! misho 1910: /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
! 1911: otherwise \C is coded as OP_ALLANY. */
1.1 misho 1912:
1913: case OP_ANYBYTE:
1914: return -2;
1915:
1916: /* Check a class for variable quantification */
1917:
1918: case OP_CLASS:
1919: case OP_NCLASS:
1.1.1.2 ! misho 1920: #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
! 1921: case OP_XCLASS:
! 1922: /* The original code caused an unsigned overflow in 64 bit systems,
! 1923: so now we use a conditional statement. */
! 1924: if (op == OP_XCLASS)
! 1925: cc += GET(cc, 1);
! 1926: else
! 1927: cc += PRIV(OP_lengths)[OP_CLASS];
! 1928: #else
! 1929: cc += PRIV(OP_lengths)[OP_CLASS];
! 1930: #endif
1.1 misho 1931:
1932: switch (*cc)
1933: {
1.1.1.2 ! misho 1934: case OP_CRPLUS:
! 1935: case OP_CRMINPLUS:
1.1 misho 1936: case OP_CRSTAR:
1937: case OP_CRMINSTAR:
1938: case OP_CRQUERY:
1939: case OP_CRMINQUERY:
1940: return -1;
1941:
1942: case OP_CRRANGE:
1943: case OP_CRMINRANGE:
1.1.1.2 ! misho 1944: if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
! 1945: branchlength += (int)GET2(cc,1);
! 1946: cc += 1 + 2 * IMM2_SIZE;
1.1 misho 1947: break;
1948:
1949: default:
1950: branchlength++;
1951: }
1952: break;
1953:
1954: /* Anything else is variable length */
1955:
1.1.1.2 ! misho 1956: case OP_ANYNL:
! 1957: case OP_BRAMINZERO:
! 1958: case OP_BRAPOS:
! 1959: case OP_BRAPOSZERO:
! 1960: case OP_BRAZERO:
! 1961: case OP_CBRAPOS:
! 1962: case OP_EXTUNI:
! 1963: case OP_KETRMAX:
! 1964: case OP_KETRMIN:
! 1965: case OP_KETRPOS:
! 1966: case OP_MINPLUS:
! 1967: case OP_MINPLUSI:
! 1968: case OP_MINQUERY:
! 1969: case OP_MINQUERYI:
! 1970: case OP_MINSTAR:
! 1971: case OP_MINSTARI:
! 1972: case OP_MINUPTO:
! 1973: case OP_MINUPTOI:
! 1974: case OP_NOTMINPLUS:
! 1975: case OP_NOTMINPLUSI:
! 1976: case OP_NOTMINQUERY:
! 1977: case OP_NOTMINQUERYI:
! 1978: case OP_NOTMINSTAR:
! 1979: case OP_NOTMINSTARI:
! 1980: case OP_NOTMINUPTO:
! 1981: case OP_NOTMINUPTOI:
! 1982: case OP_NOTPLUS:
! 1983: case OP_NOTPLUSI:
! 1984: case OP_NOTPOSPLUS:
! 1985: case OP_NOTPOSPLUSI:
! 1986: case OP_NOTPOSQUERY:
! 1987: case OP_NOTPOSQUERYI:
! 1988: case OP_NOTPOSSTAR:
! 1989: case OP_NOTPOSSTARI:
! 1990: case OP_NOTPOSUPTO:
! 1991: case OP_NOTPOSUPTOI:
! 1992: case OP_NOTQUERY:
! 1993: case OP_NOTQUERYI:
! 1994: case OP_NOTSTAR:
! 1995: case OP_NOTSTARI:
! 1996: case OP_NOTUPTO:
! 1997: case OP_NOTUPTOI:
! 1998: case OP_PLUS:
! 1999: case OP_PLUSI:
! 2000: case OP_POSPLUS:
! 2001: case OP_POSPLUSI:
! 2002: case OP_POSQUERY:
! 2003: case OP_POSQUERYI:
! 2004: case OP_POSSTAR:
! 2005: case OP_POSSTARI:
! 2006: case OP_POSUPTO:
! 2007: case OP_POSUPTOI:
! 2008: case OP_QUERY:
! 2009: case OP_QUERYI:
! 2010: case OP_REF:
! 2011: case OP_REFI:
! 2012: case OP_SBRA:
! 2013: case OP_SBRAPOS:
! 2014: case OP_SCBRA:
! 2015: case OP_SCBRAPOS:
! 2016: case OP_SCOND:
! 2017: case OP_SKIPZERO:
! 2018: case OP_STAR:
! 2019: case OP_STARI:
! 2020: case OP_TYPEMINPLUS:
! 2021: case OP_TYPEMINQUERY:
! 2022: case OP_TYPEMINSTAR:
! 2023: case OP_TYPEMINUPTO:
! 2024: case OP_TYPEPLUS:
! 2025: case OP_TYPEPOSPLUS:
! 2026: case OP_TYPEPOSQUERY:
! 2027: case OP_TYPEPOSSTAR:
! 2028: case OP_TYPEPOSUPTO:
! 2029: case OP_TYPEQUERY:
! 2030: case OP_TYPESTAR:
! 2031: case OP_TYPEUPTO:
! 2032: case OP_UPTO:
! 2033: case OP_UPTOI:
1.1 misho 2034: return -1;
1.1.1.2 ! misho 2035:
! 2036: /* Catch unrecognized opcodes so that when new ones are added they
! 2037: are not forgotten, as has happened in the past. */
! 2038:
! 2039: default:
! 2040: return -4;
1.1 misho 2041: }
2042: }
2043: /* Control never gets here */
2044: }
2045:
2046:
2047:
2048:
2049: /*************************************************
2050: * Scan compiled regex for specific bracket *
2051: *************************************************/
2052:
2053: /* This little function scans through a compiled pattern until it finds a
2054: capturing bracket with the given number, or, if the number is negative, an
2055: instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2056: so that it can be called from pcre_study() when finding the minimum matching
2057: length.
2058:
2059: Arguments:
2060: code points to start of expression
1.1.1.2 ! misho 2061: utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1.1 misho 2062: number the required bracket number or negative to find a lookbehind
2063:
2064: Returns: pointer to the opcode for the bracket, or NULL if not found
2065: */
2066:
1.1.1.2 ! misho 2067: const pcre_uchar *
! 2068: PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
1.1 misho 2069: {
2070: for (;;)
2071: {
1.1.1.2 ! misho 2072: register pcre_uchar c = *code;
! 2073:
1.1 misho 2074: if (c == OP_END) return NULL;
2075:
2076: /* XCLASS is used for classes that cannot be represented just by a bit
2077: map. This includes negated single high-valued characters. The length in
2078: the table is zero; the actual length is stored in the compiled code. */
2079:
2080: if (c == OP_XCLASS) code += GET(code, 1);
2081:
2082: /* Handle recursion */
2083:
2084: else if (c == OP_REVERSE)
2085: {
1.1.1.2 ! misho 2086: if (number < 0) return (pcre_uchar *)code;
! 2087: code += PRIV(OP_lengths)[c];
1.1 misho 2088: }
2089:
2090: /* Handle capturing bracket */
2091:
1.1.1.2 ! misho 2092: else if (c == OP_CBRA || c == OP_SCBRA ||
! 2093: c == OP_CBRAPOS || c == OP_SCBRAPOS)
1.1 misho 2094: {
1.1.1.2 ! misho 2095: int n = (int)GET2(code, 1+LINK_SIZE);
! 2096: if (n == number) return (pcre_uchar *)code;
! 2097: code += PRIV(OP_lengths)[c];
1.1 misho 2098: }
2099:
2100: /* Otherwise, we can get the item's length from the table, except that for
2101: repeated character types, we have to test for \p and \P, which have an extra
2102: two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2103: must add in its length. */
2104:
2105: else
2106: {
2107: switch(c)
2108: {
2109: case OP_TYPESTAR:
2110: case OP_TYPEMINSTAR:
2111: case OP_TYPEPLUS:
2112: case OP_TYPEMINPLUS:
2113: case OP_TYPEQUERY:
2114: case OP_TYPEMINQUERY:
2115: case OP_TYPEPOSSTAR:
2116: case OP_TYPEPOSPLUS:
2117: case OP_TYPEPOSQUERY:
2118: if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2119: break;
2120:
2121: case OP_TYPEUPTO:
2122: case OP_TYPEMINUPTO:
2123: case OP_TYPEEXACT:
2124: case OP_TYPEPOSUPTO:
1.1.1.2 ! misho 2125: if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
! 2126: code += 2;
1.1 misho 2127: break;
2128:
2129: case OP_MARK:
2130: case OP_PRUNE_ARG:
2131: case OP_SKIP_ARG:
2132: code += code[1];
2133: break;
2134:
2135: case OP_THEN_ARG:
1.1.1.2 ! misho 2136: code += code[1];
1.1 misho 2137: break;
2138: }
2139:
2140: /* Add in the fixed length from the table */
2141:
1.1.1.2 ! misho 2142: code += PRIV(OP_lengths)[c];
1.1 misho 2143:
2144: /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2145: a multi-byte character. The length in the table is a minimum, so we have to
2146: arrange to skip the extra bytes. */
2147:
1.1.1.2 ! misho 2148: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
! 2149: if (utf) switch(c)
1.1 misho 2150: {
2151: case OP_CHAR:
1.1.1.2 ! misho 2152: case OP_CHARI:
1.1 misho 2153: case OP_EXACT:
1.1.1.2 ! misho 2154: case OP_EXACTI:
1.1 misho 2155: case OP_UPTO:
1.1.1.2 ! misho 2156: case OP_UPTOI:
1.1 misho 2157: case OP_MINUPTO:
1.1.1.2 ! misho 2158: case OP_MINUPTOI:
1.1 misho 2159: case OP_POSUPTO:
1.1.1.2 ! misho 2160: case OP_POSUPTOI:
1.1 misho 2161: case OP_STAR:
1.1.1.2 ! misho 2162: case OP_STARI:
1.1 misho 2163: case OP_MINSTAR:
1.1.1.2 ! misho 2164: case OP_MINSTARI:
1.1 misho 2165: case OP_POSSTAR:
1.1.1.2 ! misho 2166: case OP_POSSTARI:
1.1 misho 2167: case OP_PLUS:
1.1.1.2 ! misho 2168: case OP_PLUSI:
1.1 misho 2169: case OP_MINPLUS:
1.1.1.2 ! misho 2170: case OP_MINPLUSI:
1.1 misho 2171: case OP_POSPLUS:
1.1.1.2 ! misho 2172: case OP_POSPLUSI:
1.1 misho 2173: case OP_QUERY:
1.1.1.2 ! misho 2174: case OP_QUERYI:
1.1 misho 2175: case OP_MINQUERY:
1.1.1.2 ! misho 2176: case OP_MINQUERYI:
1.1 misho 2177: case OP_POSQUERY:
1.1.1.2 ! misho 2178: case OP_POSQUERYI:
! 2179: if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
1.1 misho 2180: break;
2181: }
2182: #else
1.1.1.2 ! misho 2183: (void)(utf); /* Keep compiler happy by referencing function argument */
1.1 misho 2184: #endif
2185: }
2186: }
2187: }
2188:
2189:
2190:
2191: /*************************************************
2192: * Scan compiled regex for recursion reference *
2193: *************************************************/
2194:
2195: /* This little function scans through a compiled pattern until it finds an
2196: instance of OP_RECURSE.
2197:
2198: Arguments:
2199: code points to start of expression
1.1.1.2 ! misho 2200: utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1.1 misho 2201:
2202: Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
2203: */
2204:
1.1.1.2 ! misho 2205: static const pcre_uchar *
! 2206: find_recurse(const pcre_uchar *code, BOOL utf)
1.1 misho 2207: {
2208: for (;;)
2209: {
1.1.1.2 ! misho 2210: register pcre_uchar c = *code;
1.1 misho 2211: if (c == OP_END) return NULL;
2212: if (c == OP_RECURSE) return code;
2213:
2214: /* XCLASS is used for classes that cannot be represented just by a bit
2215: map. This includes negated single high-valued characters. The length in
2216: the table is zero; the actual length is stored in the compiled code. */
2217:
2218: if (c == OP_XCLASS) code += GET(code, 1);
2219:
2220: /* Otherwise, we can get the item's length from the table, except that for
2221: repeated character types, we have to test for \p and \P, which have an extra
2222: two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2223: must add in its length. */
2224:
2225: else
2226: {
2227: switch(c)
2228: {
2229: case OP_TYPESTAR:
2230: case OP_TYPEMINSTAR:
2231: case OP_TYPEPLUS:
2232: case OP_TYPEMINPLUS:
2233: case OP_TYPEQUERY:
2234: case OP_TYPEMINQUERY:
2235: case OP_TYPEPOSSTAR:
2236: case OP_TYPEPOSPLUS:
2237: case OP_TYPEPOSQUERY:
2238: if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2239: break;
2240:
2241: case OP_TYPEPOSUPTO:
2242: case OP_TYPEUPTO:
2243: case OP_TYPEMINUPTO:
2244: case OP_TYPEEXACT:
1.1.1.2 ! misho 2245: if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
! 2246: code += 2;
1.1 misho 2247: break;
2248:
2249: case OP_MARK:
2250: case OP_PRUNE_ARG:
2251: case OP_SKIP_ARG:
2252: code += code[1];
2253: break;
2254:
2255: case OP_THEN_ARG:
1.1.1.2 ! misho 2256: code += code[1];
1.1 misho 2257: break;
2258: }
2259:
2260: /* Add in the fixed length from the table */
2261:
1.1.1.2 ! misho 2262: code += PRIV(OP_lengths)[c];
1.1 misho 2263:
2264: /* In UTF-8 mode, opcodes that are followed by a character may be followed
2265: by a multi-byte character. The length in the table is a minimum, so we have
2266: to arrange to skip the extra bytes. */
2267:
1.1.1.2 ! misho 2268: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
! 2269: if (utf) switch(c)
1.1 misho 2270: {
2271: case OP_CHAR:
1.1.1.2 ! misho 2272: case OP_CHARI:
! 2273: case OP_NOT:
! 2274: case OP_NOTI:
1.1 misho 2275: case OP_EXACT:
1.1.1.2 ! misho 2276: case OP_EXACTI:
! 2277: case OP_NOTEXACT:
! 2278: case OP_NOTEXACTI:
1.1 misho 2279: case OP_UPTO:
1.1.1.2 ! misho 2280: case OP_UPTOI:
! 2281: case OP_NOTUPTO:
! 2282: case OP_NOTUPTOI:
1.1 misho 2283: case OP_MINUPTO:
1.1.1.2 ! misho 2284: case OP_MINUPTOI:
! 2285: case OP_NOTMINUPTO:
! 2286: case OP_NOTMINUPTOI:
1.1 misho 2287: case OP_POSUPTO:
1.1.1.2 ! misho 2288: case OP_POSUPTOI:
! 2289: case OP_NOTPOSUPTO:
! 2290: case OP_NOTPOSUPTOI:
1.1 misho 2291: case OP_STAR:
1.1.1.2 ! misho 2292: case OP_STARI:
! 2293: case OP_NOTSTAR:
! 2294: case OP_NOTSTARI:
1.1 misho 2295: case OP_MINSTAR:
1.1.1.2 ! misho 2296: case OP_MINSTARI:
! 2297: case OP_NOTMINSTAR:
! 2298: case OP_NOTMINSTARI:
1.1 misho 2299: case OP_POSSTAR:
1.1.1.2 ! misho 2300: case OP_POSSTARI:
! 2301: case OP_NOTPOSSTAR:
! 2302: case OP_NOTPOSSTARI:
1.1 misho 2303: case OP_PLUS:
1.1.1.2 ! misho 2304: case OP_PLUSI:
! 2305: case OP_NOTPLUS:
! 2306: case OP_NOTPLUSI:
1.1 misho 2307: case OP_MINPLUS:
1.1.1.2 ! misho 2308: case OP_MINPLUSI:
! 2309: case OP_NOTMINPLUS:
! 2310: case OP_NOTMINPLUSI:
1.1 misho 2311: case OP_POSPLUS:
1.1.1.2 ! misho 2312: case OP_POSPLUSI:
! 2313: case OP_NOTPOSPLUS:
! 2314: case OP_NOTPOSPLUSI:
1.1 misho 2315: case OP_QUERY:
1.1.1.2 ! misho 2316: case OP_QUERYI:
! 2317: case OP_NOTQUERY:
! 2318: case OP_NOTQUERYI:
1.1 misho 2319: case OP_MINQUERY:
1.1.1.2 ! misho 2320: case OP_MINQUERYI:
! 2321: case OP_NOTMINQUERY:
! 2322: case OP_NOTMINQUERYI:
1.1 misho 2323: case OP_POSQUERY:
1.1.1.2 ! misho 2324: case OP_POSQUERYI:
! 2325: case OP_NOTPOSQUERY:
! 2326: case OP_NOTPOSQUERYI:
! 2327: if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
1.1 misho 2328: break;
2329: }
2330: #else
1.1.1.2 ! misho 2331: (void)(utf); /* Keep compiler happy by referencing function argument */
1.1 misho 2332: #endif
2333: }
2334: }
2335: }
2336:
2337:
2338:
2339: /*************************************************
2340: * Scan compiled branch for non-emptiness *
2341: *************************************************/
2342:
2343: /* This function scans through a branch of a compiled pattern to see whether it
2344: can match the empty string or not. It is called from could_be_empty()
2345: below and from compile_branch() when checking for an unlimited repeat of a
2346: group that can match nothing. Note that first_significant_code() skips over
2347: backward and negative forward assertions when its final argument is TRUE. If we
2348: hit an unclosed bracket, we return "empty" - this means we've struck an inner
2349: bracket whose current branch will already have been scanned.
2350:
2351: Arguments:
2352: code points to start of search
2353: endcode points to where to stop
1.1.1.2 ! misho 2354: utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
1.1 misho 2355: cd contains pointers to tables etc.
2356:
2357: Returns: TRUE if what is matched could be empty
2358: */
2359:
2360: static BOOL
1.1.1.2 ! misho 2361: could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
! 2362: BOOL utf, compile_data *cd)
1.1 misho 2363: {
1.1.1.2 ! misho 2364: register pcre_uchar c;
! 2365: for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
1.1 misho 2366: code < endcode;
1.1.1.2 ! misho 2367: code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
1.1 misho 2368: {
1.1.1.2 ! misho 2369: const pcre_uchar *ccode;
1.1 misho 2370:
2371: c = *code;
2372:
2373: /* Skip over forward assertions; the other assertions are skipped by
2374: first_significant_code() with a TRUE final argument. */
2375:
2376: if (c == OP_ASSERT)
2377: {
2378: do code += GET(code, 1); while (*code == OP_ALT);
2379: c = *code;
2380: continue;
2381: }
2382:
2383: /* For a recursion/subroutine call, if its end has been reached, which
1.1.1.2 ! misho 2384: implies a backward reference subroutine call, we can scan it. If it's a
! 2385: forward reference subroutine call, we can't. To detect forward reference
! 2386: we have to scan up the list that is kept in the workspace. This function is
! 2387: called only when doing the real compile, not during the pre-compile that
! 2388: measures the size of the compiled pattern. */
1.1 misho 2389:
2390: if (c == OP_RECURSE)
2391: {
1.1.1.2 ! misho 2392: const pcre_uchar *scode;
! 2393: BOOL empty_branch;
! 2394:
! 2395: /* Test for forward reference */
! 2396:
! 2397: for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
! 2398: if ((int)GET(scode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
! 2399:
! 2400: /* Not a forward reference, test for completed backward reference */
! 2401:
! 2402: empty_branch = FALSE;
! 2403: scode = cd->start_code + GET(code, 1);
1.1 misho 2404: if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
1.1.1.2 ! misho 2405:
! 2406: /* Completed backwards reference */
! 2407:
1.1 misho 2408: do
2409: {
1.1.1.2 ! misho 2410: if (could_be_empty_branch(scode, endcode, utf, cd))
1.1 misho 2411: {
2412: empty_branch = TRUE;
2413: break;
2414: }
2415: scode += GET(scode, 1);
2416: }
2417: while (*scode == OP_ALT);
1.1.1.2 ! misho 2418:
1.1 misho 2419: if (!empty_branch) return FALSE; /* All branches are non-empty */
2420: continue;
2421: }
2422:
1.1.1.2 ! misho 2423: /* Groups with zero repeats can of course be empty; skip them. */
1.1 misho 2424:
1.1.1.2 ! misho 2425: if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
! 2426: c == OP_BRAPOSZERO)
1.1 misho 2427: {
1.1.1.2 ! misho 2428: code += PRIV(OP_lengths)[c];
! 2429: do code += GET(code, 1); while (*code == OP_ALT);
! 2430: c = *code;
! 2431: continue;
! 2432: }
1.1 misho 2433:
1.1.1.2 ! misho 2434: /* A nested group that is already marked as "could be empty" can just be
! 2435: skipped. */
! 2436:
! 2437: if (c == OP_SBRA || c == OP_SBRAPOS ||
! 2438: c == OP_SCBRA || c == OP_SCBRAPOS)
! 2439: {
! 2440: do code += GET(code, 1); while (*code == OP_ALT);
! 2441: c = *code;
! 2442: continue;
! 2443: }
! 2444:
! 2445: /* For other groups, scan the branches. */
! 2446:
! 2447: if (c == OP_BRA || c == OP_BRAPOS ||
! 2448: c == OP_CBRA || c == OP_CBRAPOS ||
! 2449: c == OP_ONCE || c == OP_ONCE_NC ||
! 2450: c == OP_COND)
! 2451: {
! 2452: BOOL empty_branch;
! 2453: if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
! 2454:
! 2455: /* If a conditional group has only one branch, there is a second, implied,
1.1 misho 2456: empty branch, so just skip over the conditional, because it could be empty.
2457: Otherwise, scan the individual branches of the group. */
2458:
2459: if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2460: code += GET(code, 1);
2461: else
2462: {
2463: empty_branch = FALSE;
2464: do
2465: {
1.1.1.2 ! misho 2466: if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd))
1.1 misho 2467: empty_branch = TRUE;
2468: code += GET(code, 1);
2469: }
2470: while (*code == OP_ALT);
2471: if (!empty_branch) return FALSE; /* All branches are non-empty */
2472: }
2473:
2474: c = *code;
2475: continue;
2476: }
2477:
2478: /* Handle the other opcodes */
2479:
2480: switch (c)
2481: {
2482: /* Check for quantifiers after a class. XCLASS is used for classes that
2483: cannot be represented just by a bit map. This includes negated single
1.1.1.2 ! misho 2484: high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
1.1 misho 2485: actual length is stored in the compiled code, so we must update "code"
2486: here. */
2487:
1.1.1.2 ! misho 2488: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1.1 misho 2489: case OP_XCLASS:
2490: ccode = code += GET(code, 1);
2491: goto CHECK_CLASS_REPEAT;
2492: #endif
2493:
2494: case OP_CLASS:
2495: case OP_NCLASS:
1.1.1.2 ! misho 2496: ccode = code + PRIV(OP_lengths)[OP_CLASS];
1.1 misho 2497:
1.1.1.2 ! misho 2498: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1.1 misho 2499: CHECK_CLASS_REPEAT:
2500: #endif
2501:
2502: switch (*ccode)
2503: {
2504: case OP_CRSTAR: /* These could be empty; continue */
2505: case OP_CRMINSTAR:
2506: case OP_CRQUERY:
2507: case OP_CRMINQUERY:
2508: break;
2509:
2510: default: /* Non-repeat => class must match */
2511: case OP_CRPLUS: /* These repeats aren't empty */
2512: case OP_CRMINPLUS:
2513: return FALSE;
2514:
2515: case OP_CRRANGE:
2516: case OP_CRMINRANGE:
2517: if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2518: break;
2519: }
2520: break;
2521:
2522: /* Opcodes that must match a character */
2523:
2524: case OP_PROP:
2525: case OP_NOTPROP:
2526: case OP_EXTUNI:
2527: case OP_NOT_DIGIT:
2528: case OP_DIGIT:
2529: case OP_NOT_WHITESPACE:
2530: case OP_WHITESPACE:
2531: case OP_NOT_WORDCHAR:
2532: case OP_WORDCHAR:
2533: case OP_ANY:
2534: case OP_ALLANY:
2535: case OP_ANYBYTE:
2536: case OP_CHAR:
1.1.1.2 ! misho 2537: case OP_CHARI:
1.1 misho 2538: case OP_NOT:
1.1.1.2 ! misho 2539: case OP_NOTI:
1.1 misho 2540: case OP_PLUS:
2541: case OP_MINPLUS:
2542: case OP_POSPLUS:
2543: case OP_EXACT:
2544: case OP_NOTPLUS:
2545: case OP_NOTMINPLUS:
2546: case OP_NOTPOSPLUS:
2547: case OP_NOTEXACT:
2548: case OP_TYPEPLUS:
2549: case OP_TYPEMINPLUS:
2550: case OP_TYPEPOSPLUS:
2551: case OP_TYPEEXACT:
2552: return FALSE;
2553:
2554: /* These are going to continue, as they may be empty, but we have to
2555: fudge the length for the \p and \P cases. */
2556:
2557: case OP_TYPESTAR:
2558: case OP_TYPEMINSTAR:
2559: case OP_TYPEPOSSTAR:
2560: case OP_TYPEQUERY:
2561: case OP_TYPEMINQUERY:
2562: case OP_TYPEPOSQUERY:
2563: if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2564: break;
2565:
2566: /* Same for these */
2567:
2568: case OP_TYPEUPTO:
2569: case OP_TYPEMINUPTO:
2570: case OP_TYPEPOSUPTO:
1.1.1.2 ! misho 2571: if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
! 2572: code += 2;
1.1 misho 2573: break;
2574:
2575: /* End of branch */
2576:
2577: case OP_KET:
2578: case OP_KETRMAX:
2579: case OP_KETRMIN:
1.1.1.2 ! misho 2580: case OP_KETRPOS:
1.1 misho 2581: case OP_ALT:
2582: return TRUE;
2583:
2584: /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2585: MINUPTO, and POSUPTO may be followed by a multibyte character */
2586:
1.1.1.2 ! misho 2587: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1.1 misho 2588: case OP_STAR:
1.1.1.2 ! misho 2589: case OP_STARI:
1.1 misho 2590: case OP_MINSTAR:
1.1.1.2 ! misho 2591: case OP_MINSTARI:
1.1 misho 2592: case OP_POSSTAR:
1.1.1.2 ! misho 2593: case OP_POSSTARI:
1.1 misho 2594: case OP_QUERY:
1.1.1.2 ! misho 2595: case OP_QUERYI:
1.1 misho 2596: case OP_MINQUERY:
1.1.1.2 ! misho 2597: case OP_MINQUERYI:
1.1 misho 2598: case OP_POSQUERY:
1.1.1.2 ! misho 2599: case OP_POSQUERYI:
! 2600: if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
1.1 misho 2601: break;
2602:
2603: case OP_UPTO:
1.1.1.2 ! misho 2604: case OP_UPTOI:
1.1 misho 2605: case OP_MINUPTO:
1.1.1.2 ! misho 2606: case OP_MINUPTOI:
1.1 misho 2607: case OP_POSUPTO:
1.1.1.2 ! misho 2608: case OP_POSUPTOI:
! 2609: if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
1.1 misho 2610: break;
2611: #endif
2612:
2613: /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2614: string. */
2615:
2616: case OP_MARK:
2617: case OP_PRUNE_ARG:
2618: case OP_SKIP_ARG:
2619: code += code[1];
2620: break;
2621:
2622: case OP_THEN_ARG:
1.1.1.2 ! misho 2623: code += code[1];
1.1 misho 2624: break;
2625:
2626: /* None of the remaining opcodes are required to match a character. */
2627:
2628: default:
2629: break;
2630: }
2631: }
2632:
2633: return TRUE;
2634: }
2635:
2636:
2637:
2638: /*************************************************
2639: * Scan compiled regex for non-emptiness *
2640: *************************************************/
2641:
2642: /* This function is called to check for left recursive calls. We want to check
2643: the current branch of the current pattern to see if it could match the empty
2644: string. If it could, we must look outwards for branches at other levels,
2645: stopping when we pass beyond the bracket which is the subject of the recursion.
1.1.1.2 ! misho 2646: This function is called only during the real compile, not during the
! 2647: pre-compile.
1.1 misho 2648:
2649: Arguments:
2650: code points to start of the recursion
2651: endcode points to where to stop (current RECURSE item)
2652: bcptr points to the chain of current (unclosed) branch starts
1.1.1.2 ! misho 2653: utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
1.1 misho 2654: cd pointers to tables etc
2655:
2656: Returns: TRUE if what is matched could be empty
2657: */
2658:
2659: static BOOL
1.1.1.2 ! misho 2660: could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
! 2661: branch_chain *bcptr, BOOL utf, compile_data *cd)
1.1 misho 2662: {
2663: while (bcptr != NULL && bcptr->current_branch >= code)
2664: {
1.1.1.2 ! misho 2665: if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd))
1.1 misho 2666: return FALSE;
2667: bcptr = bcptr->outer;
2668: }
2669: return TRUE;
2670: }
2671:
2672:
2673:
2674: /*************************************************
2675: * Check for POSIX class syntax *
2676: *************************************************/
2677:
2678: /* This function is called when the sequence "[:" or "[." or "[=" is
2679: encountered in a character class. It checks whether this is followed by a
2680: sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2681: reach an unescaped ']' without the special preceding character, return FALSE.
2682:
2683: Originally, this function only recognized a sequence of letters between the
2684: terminators, but it seems that Perl recognizes any sequence of characters,
2685: though of course unknown POSIX names are subsequently rejected. Perl gives an
2686: "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2687: didn't consider this to be a POSIX class. Likewise for [:1234:].
2688:
2689: The problem in trying to be exactly like Perl is in the handling of escapes. We
2690: have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2691: class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2692: below handles the special case of \], but does not try to do any other escape
2693: processing. This makes it different from Perl for cases such as [:l\ower:]
2694: where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2695: "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
2696: I think.
2697:
1.1.1.2 ! misho 2698: A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
! 2699: It seems that the appearance of a nested POSIX class supersedes an apparent
! 2700: external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
! 2701: a digit.
! 2702:
! 2703: In Perl, unescaped square brackets may also appear as part of class names. For
! 2704: example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
! 2705: [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
! 2706: seem right at all. PCRE does not allow closing square brackets in POSIX class
! 2707: names.
! 2708:
1.1 misho 2709: Arguments:
2710: ptr pointer to the initial [
2711: endptr where to return the end pointer
2712:
2713: Returns: TRUE or FALSE
2714: */
2715:
2716: static BOOL
1.1.1.2 ! misho 2717: check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
1.1 misho 2718: {
1.1.1.2 ! misho 2719: pcre_uchar terminator; /* Don't combine these lines; the Solaris cc */
1.1 misho 2720: terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1.1.1.2 ! misho 2721: for (++ptr; *ptr != CHAR_NULL; ptr++)
1.1 misho 2722: {
1.1.1.2 ! misho 2723: if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
! 2724: ptr++;
! 2725: else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
! 2726: else
1.1 misho 2727: {
2728: if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2729: {
2730: *endptr = ptr;
2731: return TRUE;
2732: }
1.1.1.2 ! misho 2733: if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
! 2734: (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
! 2735: ptr[1] == CHAR_EQUALS_SIGN) &&
! 2736: check_posix_syntax(ptr, endptr))
! 2737: return FALSE;
1.1 misho 2738: }
2739: }
2740: return FALSE;
2741: }
2742:
2743:
2744:
2745:
2746: /*************************************************
2747: * Check POSIX class name *
2748: *************************************************/
2749:
2750: /* This function is called to check the name given in a POSIX-style class entry
2751: such as [:alnum:].
2752:
2753: Arguments:
2754: ptr points to the first letter
2755: len the length of the name
2756:
2757: Returns: a value representing the name, or -1 if unknown
2758: */
2759:
2760: static int
1.1.1.2 ! misho 2761: check_posix_name(const pcre_uchar *ptr, int len)
1.1 misho 2762: {
2763: const char *pn = posix_names;
2764: register int yield = 0;
2765: while (posix_name_lengths[yield] != 0)
2766: {
2767: if (len == posix_name_lengths[yield] &&
1.1.1.2 ! misho 2768: STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
1.1 misho 2769: pn += posix_name_lengths[yield] + 1;
2770: yield++;
2771: }
2772: return -1;
2773: }
2774:
2775:
2776: /*************************************************
2777: * Adjust OP_RECURSE items in repeated group *
2778: *************************************************/
2779:
2780: /* OP_RECURSE items contain an offset from the start of the regex to the group
2781: that is referenced. This means that groups can be replicated for fixed
2782: repetition simply by copying (because the recursion is allowed to refer to
2783: earlier groups that are outside the current group). However, when a group is
2784: optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2785: inserted before it, after it has been compiled. This means that any OP_RECURSE
2786: items within it that refer to the group itself or any contained groups have to
2787: have their offsets adjusted. That one of the jobs of this function. Before it
2788: is called, the partially compiled regex must be temporarily terminated with
2789: OP_END.
2790:
2791: This function has been extended with the possibility of forward references for
2792: recursions and subroutine calls. It must also check the list of such references
2793: for the group we are dealing with. If it finds that one of the recursions in
2794: the current group is on this list, it adjusts the offset in the list, not the
2795: value in the reference (which is a group number).
2796:
2797: Arguments:
2798: group points to the start of the group
2799: adjust the amount by which the group is to be moved
1.1.1.2 ! misho 2800: utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1.1 misho 2801: cd contains pointers to tables etc.
2802: save_hwm the hwm forward reference pointer at the start of the group
2803:
2804: Returns: nothing
2805: */
2806:
2807: static void
1.1.1.2 ! misho 2808: adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
! 2809: pcre_uchar *save_hwm)
1.1 misho 2810: {
1.1.1.2 ! misho 2811: pcre_uchar *ptr = group;
1.1 misho 2812:
1.1.1.2 ! misho 2813: while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
1.1 misho 2814: {
2815: int offset;
1.1.1.2 ! misho 2816: pcre_uchar *hc;
1.1 misho 2817:
2818: /* See if this recursion is on the forward reference list. If so, adjust the
2819: reference. */
2820:
2821: for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2822: {
1.1.1.2 ! misho 2823: offset = (int)GET(hc, 0);
1.1 misho 2824: if (cd->start_code + offset == ptr + 1)
2825: {
2826: PUT(hc, 0, offset + adjust);
2827: break;
2828: }
2829: }
2830:
2831: /* Otherwise, adjust the recursion offset if it's after the start of this
2832: group. */
2833:
2834: if (hc >= cd->hwm)
2835: {
1.1.1.2 ! misho 2836: offset = (int)GET(ptr, 1);
1.1 misho 2837: if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2838: }
2839:
2840: ptr += 1 + LINK_SIZE;
2841: }
2842: }
2843:
2844:
2845:
2846: /*************************************************
2847: * Insert an automatic callout point *
2848: *************************************************/
2849:
2850: /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2851: callout points before each pattern item.
2852:
2853: Arguments:
2854: code current code pointer
2855: ptr current pattern pointer
2856: cd pointers to tables etc
2857:
2858: Returns: new code pointer
2859: */
2860:
1.1.1.2 ! misho 2861: static pcre_uchar *
! 2862: auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
1.1 misho 2863: {
2864: *code++ = OP_CALLOUT;
2865: *code++ = 255;
2866: PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
2867: PUT(code, LINK_SIZE, 0); /* Default length */
1.1.1.2 ! misho 2868: return code + 2 * LINK_SIZE;
1.1 misho 2869: }
2870:
2871:
2872:
2873: /*************************************************
2874: * Complete a callout item *
2875: *************************************************/
2876:
2877: /* A callout item contains the length of the next item in the pattern, which
2878: we can't fill in till after we have reached the relevant point. This is used
2879: for both automatic and manual callouts.
2880:
2881: Arguments:
2882: previous_callout points to previous callout item
2883: ptr current pattern pointer
2884: cd pointers to tables etc
2885:
2886: Returns: nothing
2887: */
2888:
2889: static void
1.1.1.2 ! misho 2890: complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
1.1 misho 2891: {
2892: int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2893: PUT(previous_callout, 2 + LINK_SIZE, length);
2894: }
2895:
2896:
2897:
2898: #ifdef SUPPORT_UCP
2899: /*************************************************
2900: * Get othercase range *
2901: *************************************************/
2902:
2903: /* This function is passed the start and end of a class range, in UTF-8 mode
1.1.1.2 ! misho 2904: with UCP support. It searches up the characters, looking for ranges of
1.1 misho 2905: characters in the "other" case. Each call returns the next one, updating the
1.1.1.2 ! misho 2906: start address. A character with multiple other cases is returned on its own
! 2907: with a special return value.
1.1 misho 2908:
2909: Arguments:
2910: cptr points to starting character value; updated
2911: d end value
2912: ocptr where to put start of othercase range
2913: odptr where to put end of othercase range
2914:
1.1.1.2 ! misho 2915: Yield: -1 when no more
! 2916: 0 when a range is returned
! 2917: >0 the CASESET offset for char with multiple other cases
! 2918: in this case, ocptr contains the original
1.1 misho 2919: */
2920:
1.1.1.2 ! misho 2921: static int
! 2922: get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
! 2923: pcre_uint32 *odptr)
1.1 misho 2924: {
1.1.1.2 ! misho 2925: pcre_uint32 c, othercase, next;
! 2926: unsigned int co;
! 2927:
! 2928: /* Find the first character that has an other case. If it has multiple other
! 2929: cases, return its case offset value. */
1.1 misho 2930:
2931: for (c = *cptr; c <= d; c++)
1.1.1.2 ! misho 2932: {
! 2933: if ((co = UCD_CASESET(c)) != 0)
! 2934: {
! 2935: *ocptr = c++; /* Character that has the set */
! 2936: *cptr = c; /* Rest of input range */
! 2937: return (int)co;
! 2938: }
! 2939: if ((othercase = UCD_OTHERCASE(c)) != c) break;
! 2940: }
1.1 misho 2941:
1.1.1.2 ! misho 2942: if (c > d) return -1; /* Reached end of range */
1.1 misho 2943:
2944: *ocptr = othercase;
2945: next = othercase + 1;
2946:
2947: for (++c; c <= d; c++)
2948: {
2949: if (UCD_OTHERCASE(c) != next) break;
2950: next++;
2951: }
2952:
1.1.1.2 ! misho 2953: *odptr = next - 1; /* End of othercase range */
! 2954: *cptr = c; /* Rest of input range */
! 2955: return 0;
1.1 misho 2956: }
2957:
2958:
2959:
2960: /*************************************************
2961: * Check a character and a property *
2962: *************************************************/
2963:
2964: /* This function is called by check_auto_possessive() when a property item
2965: is adjacent to a fixed character.
2966:
2967: Arguments:
2968: c the character
2969: ptype the property type
2970: pdata the data for the type
2971: negated TRUE if it's a negated property (\P or \p{^)
2972:
2973: Returns: TRUE if auto-possessifying is OK
2974: */
2975:
2976: static BOOL
1.1.1.2 ! misho 2977: check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata, BOOL negated)
1.1 misho 2978: {
1.1.1.2 ! misho 2979: #ifdef SUPPORT_UCP
! 2980: const pcre_uint32 *p;
! 2981: #endif
! 2982:
1.1 misho 2983: const ucd_record *prop = GET_UCD(c);
1.1.1.2 ! misho 2984:
1.1 misho 2985: switch(ptype)
2986: {
2987: case PT_LAMP:
2988: return (prop->chartype == ucp_Lu ||
2989: prop->chartype == ucp_Ll ||
2990: prop->chartype == ucp_Lt) == negated;
2991:
2992: case PT_GC:
1.1.1.2 ! misho 2993: return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
1.1 misho 2994:
2995: case PT_PC:
2996: return (pdata == prop->chartype) == negated;
2997:
2998: case PT_SC:
2999: return (pdata == prop->script) == negated;
3000:
3001: /* These are specials */
3002:
3003: case PT_ALNUM:
1.1.1.2 ! misho 3004: return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
! 3005: PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
1.1 misho 3006:
3007: case PT_SPACE: /* Perl space */
1.1.1.2 ! misho 3008: return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1 misho 3009: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
3010: == negated;
3011:
3012: case PT_PXSPACE: /* POSIX space */
1.1.1.2 ! misho 3013: return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1 misho 3014: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
3015: c == CHAR_FF || c == CHAR_CR)
3016: == negated;
3017:
3018: case PT_WORD:
1.1.1.2 ! misho 3019: return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
! 3020: PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1.1 misho 3021: c == CHAR_UNDERSCORE) == negated;
1.1.1.2 ! misho 3022:
! 3023: #ifdef SUPPORT_UCP
! 3024: case PT_CLIST:
! 3025: p = PRIV(ucd_caseless_sets) + prop->caseset;
! 3026: for (;;)
! 3027: {
! 3028: if (c < *p) return !negated;
! 3029: if (c == *p++) return negated;
! 3030: }
! 3031: break; /* Control never reaches here */
! 3032: #endif
1.1 misho 3033: }
1.1.1.2 ! misho 3034:
1.1 misho 3035: return FALSE;
3036: }
3037: #endif /* SUPPORT_UCP */
3038:
3039:
3040:
3041: /*************************************************
3042: * Check if auto-possessifying is possible *
3043: *************************************************/
3044:
3045: /* This function is called for unlimited repeats of certain items, to see
3046: whether the next thing could possibly match the repeated item. If not, it makes
3047: sense to automatically possessify the repeated item.
3048:
3049: Arguments:
3050: previous pointer to the repeated opcode
1.1.1.2 ! misho 3051: utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1.1 misho 3052: ptr next character in pattern
3053: options options bits
3054: cd contains pointers to tables etc.
3055:
3056: Returns: TRUE if possessifying is wanted
3057: */
3058:
/* Outline: (1) skip extended-mode whitespace/comments, (2) read the item that
follows the repeat from the pattern, either as a character (escape == 0, value
in next) or as an escape code, (3) give up if that item is itself optional,
(4) compare it with the repeated opcode. Only single characters and character
type/property escapes are handled; anything else yields FALSE. */
3059: static BOOL
1.1.1.2 ! misho 3060: check_auto_possessive(const pcre_uchar *previous, BOOL utf,
! 3061: const pcre_uchar *ptr, int options, compile_data *cd)
1.1 misho 3062: {
1.1.1.2 ! misho 3063: pcre_uint32 c = NOTACHAR;
! 3064: pcre_uint32 next;
! 3065: int escape;
! 3066: pcre_uchar op_code = *previous++;
/* From here on "previous" points at the unit after the repeated opcode. The
variable c stays NOTACHAR unless the opcode is one of OP_CHAR, OP_CHARI, OP_NOT
or OP_NOTI, in which case it is loaded with that item's character below. */
1.1 misho 3067:
3068: /* Skip whitespace and comments in extended mode */
3069:
3070: if ((options & PCRE_EXTENDED) != 0)
3071: {
3072: for (;;)
3073: {
1.1.1.2 ! misho 3074: while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1.1 misho 3075: if (*ptr == CHAR_NUMBER_SIGN)
3076: {
3077: ptr++;
1.1.1.2 ! misho 3078: while (*ptr != CHAR_NULL)
1.1 misho 3079: {
3080: if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3081: ptr++;
1.1.1.2 ! misho 3082: #ifdef SUPPORT_UTF
! 3083: if (utf) FORWARDCHAR(ptr);
1.1 misho 3084: #endif
3085: }
3086: }
3087: else break;
3088: }
3089: }
3090:
3091: /* If the next item is one that we can handle, get its value. For a data
3092: character, escape is set to zero and the character is put in next; otherwise
escape holds the code returned by check_escape(), which is compared with the
ESC_ values further down. An unescaped metacharacter cannot be handled. */
3093:
3094: if (*ptr == CHAR_BACKSLASH)
3095: {
3096: int temperrorcode = 0;
1.1.1.2 ! misho 3097: escape = check_escape(&ptr, &next, &temperrorcode, cd->bracount, options, FALSE);
1.1 misho 3098: if (temperrorcode != 0) return FALSE;
3099: ptr++; /* Point after the escape sequence */
3100: }
1.1.1.2 ! misho 3101: else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
1.1 misho 3102: {
1.1.1.2 ! misho 3103: escape = 0;
! 3104: #ifdef SUPPORT_UTF
! 3105: if (utf) { GETCHARINC(next, ptr); } else
1.1 misho 3106: #endif
3107: next = *ptr++;
3108: }
3109: else return FALSE;
3110:
3111: /* Skip whitespace and comments in extended mode */
3112:
3113: if ((options & PCRE_EXTENDED) != 0)
3114: {
3115: for (;;)
3116: {
1.1.1.2 ! misho 3117: while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1.1 misho 3118: if (*ptr == CHAR_NUMBER_SIGN)
3119: {
3120: ptr++;
1.1.1.2 ! misho 3121: while (*ptr != CHAR_NULL)
1.1 misho 3122: {
3123: if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3124: ptr++;
1.1.1.2 ! misho 3125: #ifdef SUPPORT_UTF
! 3126: if (utf) FORWARDCHAR(ptr);
1.1 misho 3127: #endif
3128: }
3129: }
3130: else break;
3131: }
3132: }
3133:
3134: /* If the next thing is itself optional, we have to give up. */
3135:
3136: if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
1.1.1.2 ! misho 3137: STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
1.1 misho 3138: return FALSE;
3139:
1.1.1.2 ! misho 3140: /* If the previous item is a character, get its value. */
1.1 misho 3141:
1.1.1.2 ! misho 3142: if (op_code == OP_CHAR || op_code == OP_CHARI ||
! 3143: op_code == OP_NOT || op_code == OP_NOTI)
1.1 misho 3144: {
1.1.1.2 ! misho 3145: #ifdef SUPPORT_UTF
1.1 misho 3146: GETCHARTEST(c, previous);
3147: #else
3148: c = *previous;
3149: #endif
1.1.1.2 ! misho 3150: }
1.1 misho 3151:
1.1.1.2 ! misho 3152: /* Now compare the next item with the previous opcode. First, handle cases when
! 3153: the next item is a character. */
1.1 misho 3154:
1.1.1.2 ! misho 3155: if (escape == 0)
! 3156: {
! 3157: /* For a caseless UTF match, the next character may have more than one other
! 3158: case, which maps to the special PT_CLIST property. Check this first. */
! 3159:
/* NOTE(review): "op_code >= OP_NOT" is used as the "negated" argument; this
presumably relies on OP_NOT and OP_NOTI being numbered above OP_CHAR and
OP_CHARI - confirm against the opcode definitions. */
! 3160: #ifdef SUPPORT_UCP
! 3161: if (utf && c != NOTACHAR && (options & PCRE_CASELESS) != 0)
! 3162: {
! 3163: unsigned int ocs = UCD_CASESET(next);
! 3164: if (ocs > 0) return check_char_prop(c, PT_CLIST, ocs, op_code >= OP_NOT);
! 3165: }
1.1 misho 3166: #endif
1.1.1.2 ! misho 3167:
! 3168: switch(op_code)
1.1 misho 3169: {
1.1.1.2 ! misho 3170: case OP_CHAR:
! 3171: return c != next;
! 3172:
! 3173: /* For CHARI (caseless character) we must check the other case. If we have
! 3174: Unicode property support, we can use it to test the other case of
! 3175: high-valued characters. We know that next can have only one other case,
! 3176: because multi-other-case characters are dealt with above. */
! 3177:
! 3178: case OP_CHARI:
! 3179: if (c == next) return FALSE;
! 3180: #ifdef SUPPORT_UTF
! 3181: if (utf)
! 3182: {
! 3183: pcre_uint32 othercase;
! 3184: if (next < 128) othercase = cd->fcc[next]; else
1.1 misho 3185: #ifdef SUPPORT_UCP
1.1.1.2 ! misho 3186: othercase = UCD_OTHERCASE(next);
1.1 misho 3187: #else
1.1.1.2 ! misho 3188: othercase = NOTACHAR;
1.1 misho 3189: #endif
1.1.1.2 ! misho 3190: return c != othercase;
! 3191: }
! 3192: else
! 3193: #endif /* SUPPORT_UTF */
! 3194: return (c != TABLE_GET(next, cd->fcc, next)); /* Not UTF */
1.1 misho 3195:
/* For the negated single-character items the sense is reversed: the repeat
can be possessified only when the next character is the excluded one. */
1.1.1.2 ! misho 3196: case OP_NOT:
! 3197: return c == next;
1.1 misho 3198:
1.1.1.2 ! misho 3199: case OP_NOTI:
! 3200: if (c == next) return TRUE;
! 3201: #ifdef SUPPORT_UTF
! 3202: if (utf)
! 3203: {
! 3204: pcre_uint32 othercase;
! 3205: if (next < 128) othercase = cd->fcc[next]; else
1.1 misho 3206: #ifdef SUPPORT_UCP
1.1.1.2 ! misho 3207: othercase = UCD_OTHERCASE(next);
1.1 misho 3208: #else
1.1.1.2 ! misho 3209: othercase = NOTACHAR;
1.1 misho 3210: #endif
1.1.1.2 ! misho 3211: return c == othercase;
! 3212: }
! 3213: else
! 3214: #endif /* SUPPORT_UTF */
! 3215: return (c == TABLE_GET(next, cd->fcc, next)); /* Not UTF */
1.1 misho 3216:
1.1.1.2 ! misho 3217: /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
! 3218: When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
1.1 misho 3219:
/* The ctypes table is indexed only after next has been checked against 255. */
1.1.1.2 ! misho 3220: case OP_DIGIT:
! 3221: return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;
1.1 misho 3222:
1.1.1.2 ! misho 3223: case OP_NOT_DIGIT:
! 3224: return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;
1.1 misho 3225:
1.1.1.2 ! misho 3226: case OP_WHITESPACE:
! 3227: return next > 255 || (cd->ctypes[next] & ctype_space) == 0;
1.1 misho 3228:
1.1.1.2 ! misho 3229: case OP_NOT_WHITESPACE:
! 3230: return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;
1.1 misho 3231:
1.1.1.2 ! misho 3232: case OP_WORDCHAR:
! 3233: return next > 255 || (cd->ctypes[next] & ctype_word) == 0;
1.1 misho 3234:
1.1.1.2 ! misho 3235: case OP_NOT_WORDCHAR:
! 3236: return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;
1.1 misho 3237:
1.1.1.2 ! misho 3238: case OP_HSPACE:
! 3239: case OP_NOT_HSPACE:
! 3240: switch(next)
! 3241: {
! 3242: HSPACE_CASES:
! 3243: return op_code == OP_NOT_HSPACE;
1.1 misho 3244:
1.1.1.2 ! misho 3245: default:
! 3246: return op_code != OP_NOT_HSPACE;
! 3247: }
! 3248:
! 3249: case OP_ANYNL:
! 3250: case OP_VSPACE:
! 3251: case OP_NOT_VSPACE:
! 3252: switch(next)
! 3253: {
! 3254: VSPACE_CASES:
! 3255: return op_code == OP_NOT_VSPACE;
! 3256:
! 3257: default:
! 3258: return op_code != OP_NOT_VSPACE;
! 3259: }
1.1 misho 3260:
/* previous was advanced past the opcode at the top of the function, so
previous[0] and previous[1] are the two units that follow OP_PROP/OP_NOTPROP;
they are passed on as the property type and its data. */
3261: #ifdef SUPPORT_UCP
1.1.1.2 ! misho 3262: case OP_PROP:
! 3263: return check_char_prop(next, previous[0], previous[1], FALSE);
1.1 misho 3264:
1.1.1.2 ! misho 3265: case OP_NOTPROP:
! 3266: return check_char_prop(next, previous[0], previous[1], TRUE);
1.1 misho 3267: #endif
3268:
1.1.1.2 ! misho 3269: default:
! 3270: return FALSE;
! 3271: }
1.1 misho 3272: }
3273:
3274: /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
3275: is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
3276: generated only when PCRE_UCP is *not* set, that is, when only ASCII
3277: characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
3278: replaced by OP_PROP codes when PCRE_UCP is set. */
3279:
3280: switch(op_code)
3281: {
3282: case OP_CHAR:
1.1.1.2 ! misho 3283: case OP_CHARI:
! 3284: switch(escape)
1.1 misho 3285: {
3286: case ESC_d:
1.1.1.2 ! misho 3287: return c > 255 || (cd->ctypes[c] & ctype_digit) == 0;
1.1 misho 3288:
3289: case ESC_D:
1.1.1.2 ! misho 3290: return c <= 255 && (cd->ctypes[c] & ctype_digit) != 0;
1.1 misho 3291:
3292: case ESC_s:
1.1.1.2 ! misho 3293: return c > 255 || (cd->ctypes[c] & ctype_space) == 0;
1.1 misho 3294:
3295: case ESC_S:
1.1.1.2 ! misho 3296: return c <= 255 && (cd->ctypes[c] & ctype_space) != 0;
1.1 misho 3297:
3298: case ESC_w:
1.1.1.2 ! misho 3299: return c > 255 || (cd->ctypes[c] & ctype_word) == 0;
1.1 misho 3300:
3301: case ESC_W:
1.1.1.2 ! misho 3302: return c <= 255 && (cd->ctypes[c] & ctype_word) != 0;
1.1 misho 3303:
3304: case ESC_h:
3305: case ESC_H:
3306: switch(c)
3307: {
1.1.1.2 ! misho 3308: HSPACE_CASES:
! 3309: return escape != ESC_h;
! 3310:
1.1 misho 3311: default:
1.1.1.2 ! misho 3312: return escape == ESC_h;
1.1 misho 3313: }
3314:
3315: case ESC_v:
3316: case ESC_V:
3317: switch(c)
3318: {
1.1.1.2 ! misho 3319: VSPACE_CASES:
! 3320: return escape != ESC_v;
! 3321:
1.1 misho 3322: default:
1.1.1.2 ! misho 3323: return escape == ESC_v;
1.1 misho 3324: }
3325:
3326: /* When PCRE_UCP is set, these values get generated for \d etc. Find
3327: their substitutions and process them. The result will always be either
1.1.1.2 ! misho 3328: ESC_p or ESC_P. Then fall through to process those values. */
1.1 misho 3329:
/* Here ptr is redirected from the pattern to the substitute string; the
"- ESC_DU" index presumably relies on ESC_DU being the first of these six
codes - confirm against the definition of the substitutes table. */
3330: #ifdef SUPPORT_UCP
3331: case ESC_du:
3332: case ESC_DU:
3333: case ESC_wu:
3334: case ESC_WU:
3335: case ESC_su:
3336: case ESC_SU:
3337: {
3338: int temperrorcode = 0;
1.1.1.2 ! misho 3339: ptr = substitutes[escape - ESC_DU];
! 3340: escape = check_escape(&ptr, &next, &temperrorcode, 0, options, FALSE);
1.1 misho 3341: if (temperrorcode != 0) return FALSE;
3342: ptr++; /* For compatibility */
3343: }
3344: /* Fall through */
3345:
3346: case ESC_p:
3347: case ESC_P:
3348: {
1.1.1.2 ! misho 3349: unsigned int ptype = 0, pdata = 0;
! 3350: int errorcodeptr;
1.1 misho 3351: BOOL negated;
3352:
/* errorcodeptr only receives get_ucp()'s error code; its value is not
examined, because any failure simply means "do not possessify". */
3353: ptr--; /* Make ptr point at the p or P */
1.1.1.2 ! misho 3354: if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcodeptr))
! 3355: return FALSE;
1.1 misho 3356: ptr++; /* Point past the final curly ket */
3357:
3358: /* If the property item is optional, we have to give up. (When generated
3359: from \d etc by PCRE_UCP, this test will have been applied much earlier,
3360: to the original \d etc. At this point, ptr will point to a zero byte.) */
3361:
3362: if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
1.1.1.2 ! misho 3363: STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
1.1 misho 3364: return FALSE;
3365:
3366: /* Do the property check. */
3367:
1.1.1.2 ! misho 3368: return check_char_prop(c, ptype, pdata, (escape == ESC_P) != negated);
1.1 misho 3369: }
3370: #endif
3371:
3372: default:
3373: return FALSE;
3374: }
3375:
3376: /* In principle, support for Unicode properties should be integrated here as
3377: well. It means re-organizing the above code so as to get hold of the property
3378: values before switching on the op-code. However, I wonder how many patterns
3379: combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
3380: these op-codes are never generated.) */
3381:
/* The remaining cases pair a character-type opcode with a character-type
escape; each returns TRUE only for the escapes listed for that opcode. */
3382: case OP_DIGIT:
1.1.1.2 ! misho 3383: return escape == ESC_D || escape == ESC_s || escape == ESC_W ||
! 3384: escape == ESC_h || escape == ESC_v || escape == ESC_R;
1.1 misho 3385:
3386: case OP_NOT_DIGIT:
1.1.1.2 ! misho 3387: return escape == ESC_d;
1.1 misho 3388:
3389: case OP_WHITESPACE:
1.1.1.2 ! misho 3390: return escape == ESC_S || escape == ESC_d || escape == ESC_w;
1.1 misho 3391:
3392: case OP_NOT_WHITESPACE:
1.1.1.2 ! misho 3393: return escape == ESC_s || escape == ESC_h || escape == ESC_v || escape == ESC_R;
1.1 misho 3394:
3395: case OP_HSPACE:
1.1.1.2 ! misho 3396: return escape == ESC_S || escape == ESC_H || escape == ESC_d ||
! 3397: escape == ESC_w || escape == ESC_v || escape == ESC_R;
1.1 misho 3398:
3399: case OP_NOT_HSPACE:
1.1.1.2 ! misho 3400: return escape == ESC_h;
1.1 misho 3401:
3402: /* Can't have \S in here because VT matches \S (Perl anomaly) */
3403: case OP_ANYNL:
3404: case OP_VSPACE:
1.1.1.2 ! misho 3405: return escape == ESC_V || escape == ESC_d || escape == ESC_w;
1.1 misho 3406:
3407: case OP_NOT_VSPACE:
1.1.1.2 ! misho 3408: return escape == ESC_v || escape == ESC_R;
1.1 misho 3409:
3410: case OP_WORDCHAR:
1.1.1.2 ! misho 3411: return escape == ESC_W || escape == ESC_s || escape == ESC_h ||
! 3412: escape == ESC_v || escape == ESC_R;
1.1 misho 3413:
3414: case OP_NOT_WORDCHAR:
1.1.1.2 ! misho 3415: return escape == ESC_w || escape == ESC_d;
1.1 misho 3416:
3417: default:
3418: return FALSE;
3419: }
3420:
3421: /* Control does not reach here */
3422: }
3423:
3424:
3425:
3426: /*************************************************
1.1.1.2 ! misho 3427: * Add a character or range to a class *
! 3428: *************************************************/
! 3429:
! 3430: /* This function packages up the logic of adding a character or range of
! 3431: characters to a class. The character values in the arguments will be within the
! 3432: valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
! 3433: mutually recursive with the function immediately below.
! 3434:
! 3435: Arguments:
! 3436: classbits the bit map for characters < 256
! 3437: uchardptr points to the pointer for extra data
! 3438: options the options word
! 3439: cd contains pointers to tables etc.
! 3440: start start of range character
! 3441: end end of range character
! 3442:
! 3443: Returns: the number of < 256 characters added
! 3444: the pointer to extra data is updated
! 3445: */
! 3446:
/* Outline: when caseless matching is requested, the other-case equivalents of
the range are added first (by extending the range, by recursive calls with the
caseless bit removed, or by setting bits directly in the non-UTF case); then
the range itself is stored, in the bit map if it ends below 0x100 and otherwise
as XCL_SINGLE/XCL_RANGE extra data. */
! 3447: static int
! 3448: add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
! 3449: compile_data *cd, pcre_uint32 start, pcre_uint32 end)
! 3450: {
! 3451: pcre_uint32 c;
! 3452: int n8 = 0;
! 3453:
! 3454: /* If caseless matching is required, scan the range and process alternate
! 3455: cases. In Unicode, there are 8-bit characters that have alternate cases that
! 3456: are greater than 255 and vice-versa. Sometimes we can just extend the original
! 3457: range. */
! 3458:
! 3459: if ((options & PCRE_CASELESS) != 0)
! 3460: {
! 3461: #ifdef SUPPORT_UCP
! 3462: if ((options & PCRE_UTF8) != 0)
! 3463: {
! 3464: int rc;
! 3465: pcre_uint32 oc, od;
! 3466:
/* "options" is a by-value parameter, so clearing the bit affects only this
call and the calls made from it. */
! 3467: options &= ~PCRE_CASELESS; /* Remove for recursive calls */
! 3468: c = start;
! 3469:
! 3470: while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
! 3471: {
! 3472: /* Handle a single character that has more than one other case. */
! 3473:
! 3474: if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
! 3475: PRIV(ucd_caseless_sets) + rc, oc);
! 3476:
! 3477: /* Do nothing if the other case range is within the original range. */
! 3478:
! 3479: else if (oc >= start && od <= end) continue;
! 3480:
! 3481: /* Extend the original range if there is overlap, noting that if oc < c, we
! 3482: can't have od > end because a subrange is always shorter than the basic
! 3483: range. Otherwise, use a recursive call to add the additional range. */
! 3484:
! 3485: else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
! 3486: else if (od > end && oc <= end + 1) end = od; /* Extend upwards */
! 3487: else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
! 3488: }
! 3489: }
! 3490: else
! 3491: #endif /* SUPPORT_UCP */
! 3492:
! 3493: /* Not UTF-mode, or no UCP */
! 3494:
/* When SUPPORT_UCP is defined, the "else" above binds to this for loop. The
loop sets the bit for the cd->fcc[] entry of each character below 256 in the
range. n8 is incremented here and again when the original range is added
below, so it can exceed the number of distinct bits set; presumably callers
only need to know whether it is non-zero - TODO confirm. */
! 3495: for (c = start; c <= end && c < 256; c++)
! 3496: {
! 3497: SETBIT(classbits, cd->fcc[c]);
! 3498: n8++;
! 3499: }
! 3500: }
! 3501:
! 3502: /* Now handle the original range. Adjust the final value according to the bit
! 3503: length - this means that the same lists of (e.g.) horizontal spaces can be used
! 3504: in all cases. */
! 3505:
/* In each library width the clamp is applied unconditionally when SUPPORT_UTF
is not defined, and only in non-UTF mode when it is. */
! 3506: #if defined COMPILE_PCRE8
! 3507: #ifdef SUPPORT_UTF
! 3508: if ((options & PCRE_UTF8) == 0)
! 3509: #endif
! 3510: if (end > 0xff) end = 0xff;
! 3511:
! 3512: #elif defined COMPILE_PCRE16
! 3513: #ifdef SUPPORT_UTF
! 3514: if ((options & PCRE_UTF16) == 0)
! 3515: #endif
! 3516: if (end > 0xffff) end = 0xffff;
! 3517:
! 3518: #endif /* COMPILE_PCRE[8|16] */
! 3519:
! 3520: /* If all characters are less than 256, use the bit map. Otherwise use extra
! 3521: data. */
! 3522:
/* The choice depends on "end" alone: a range that starts below 0x100 but ends
at or above it is written entirely as extra data, and no bits are set in the
bit map for it here. */
! 3523: if (end < 0x100)
! 3524: {
! 3525: for (c = start; c <= end; c++)
! 3526: {
! 3527: n8++;
! 3528: SETBIT(classbits, c);
! 3529: }
! 3530: }
! 3531:
! 3532: else
! 3533: {
! 3534: pcre_uchar *uchardata = *uchardptr;
! 3535:
/* Nothing is written when start > end (neither branch below applies). */
! 3536: #ifdef SUPPORT_UTF
! 3537: if ((options & PCRE_UTF8) != 0) /* All UTFs use the same flag bit */
! 3538: {
! 3539: if (start < end)
! 3540: {
! 3541: *uchardata++ = XCL_RANGE;
! 3542: uchardata += PRIV(ord2utf)(start, uchardata);
! 3543: uchardata += PRIV(ord2utf)(end, uchardata);
! 3544: }
! 3545: else if (start == end)
! 3546: {
! 3547: *uchardata++ = XCL_SINGLE;
! 3548: uchardata += PRIV(ord2utf)(start, uchardata);
! 3549: }
! 3550: }
! 3551: else
! 3552: #endif /* SUPPORT_UTF */
! 3553:
! 3554: /* Without UTF support, character values are constrained by the bit length,
! 3555: and can only be > 256 for 16-bit and 32-bit libraries. */
! 3556:
/* In the 8-bit library the empty block below exists only to give the
preceding "else" (present when SUPPORT_UTF is defined) a statement to bind to. */
! 3557: #ifdef COMPILE_PCRE8
! 3558: {}
! 3559: #else
! 3560: if (start < end)
! 3561: {
! 3562: *uchardata++ = XCL_RANGE;
! 3563: *uchardata++ = start;
! 3564: *uchardata++ = end;
! 3565: }
! 3566: else if (start == end)
! 3567: {
! 3568: *uchardata++ = XCL_SINGLE;
! 3569: *uchardata++ = start;
! 3570: }
! 3571: #endif
! 3572:
! 3573: *uchardptr = uchardata; /* Update extra data pointer */
! 3574: }
! 3575:
! 3576: return n8; /* Number of 8-bit characters */
! 3577: }
! 3578:
! 3579:
! 3580:
! 3581:
! 3582: /*************************************************
! 3583: * Add a list of characters to a class *
! 3584: *************************************************/
! 3585:
! 3586: /* This function is used for adding a list of case-equivalent characters to a
! 3587: class, and also for adding a list of horizontal or vertical whitespace. If the
! 3588: list is in order (which it should be), ranges of characters are detected and
! 3589: handled appropriately. This function is mutually recursive with the function
! 3590: above.
! 3591:
! 3592: Arguments:
! 3593: classbits the bit map for characters < 256
! 3594: uchardptr points to the pointer for extra data
! 3595: options the options word
! 3596: cd contains pointers to tables etc.
! 3597: p points to row of 32-bit values, terminated by NOTACHAR
! 3598: except character to omit; this is used when adding lists of
! 3599: case-equivalent characters to avoid including the one we
! 3600: already know about
! 3601:
! 3602: Returns: the number of < 256 characters added
! 3603: the pointer to extra data is updated
! 3604: */
! 3605:
! 3606: static int /* Add a list of values to a class, one consecutive run at a time */
! 3607: add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
! 3608: compile_data *cd, const pcre_uint32 *p, unsigned int except)
! 3609: {
! 3610: int n8 = 0; /* Accumulates the counts returned by add_to_class() */
! 3611: while (p[0] < NOTACHAR) /* Stop at the first value that is not below NOTACHAR */
! 3612: {
! 3613: int n = 0; /* Offset of the last value in the current consecutive run */
! 3614: if (p[0] != except) /* Only the first value of a run is compared with except */
! 3615: {
! 3616: while(p[n+1] == p[0] + n + 1) n++; /* Extend the run while the next value is exactly one higher */
! 3617: n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]); /* Hand the whole run over as one range */
! 3618: }
! 3619: p += n + 1; /* Step past the run, or past the single omitted value (n is still 0) */
! 3620: }
! 3621: return n8; /* Sum of the add_to_class() results for every run added */
! 3622: }
! 3623:
! 3624:
! 3625:
! 3626: /*************************************************
! 3627: * Add characters not in a list to a class *
! 3628: *************************************************/
! 3629:
! 3630: /* This function is used for adding the complement of a list of horizontal or
! 3631: vertical whitespace to a class. The list must be in order.
! 3632:
! 3633: Arguments:
! 3634: classbits the bit map for characters < 256
! 3635: uchardptr points to the pointer for extra data
! 3636: options the options word
! 3637: cd contains pointers to tables etc.
! 3638: p points to row of 32-bit values, terminated by NOTACHAR
! 3639:
! 3640: Returns: the number of < 256 characters added
! 3641: the pointer to extra data is updated
! 3642: */
! 3643:
! 3644: static int /* Add the gaps between the listed values (the complement of the list) to a class */
! 3645: add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
! 3646: int options, compile_data *cd, const pcre_uint32 *p)
! 3647: {
! 3648: BOOL utf = (options & PCRE_UTF8) != 0; /* Selects the upper limit used for the final gap */
! 3649: int n8 = 0; /* Accumulates the counts returned by add_to_class() */
! 3650: if (p[0] > 0) /* Is there anything below the first listed value? */
! 3651: n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1); /* Range 0 .. p[0]-1 */
! 3652: while (p[0] < NOTACHAR) /* One gap is added per run of listed values */
! 3653: {
! 3654: while (p[1] == p[0] + 1) p++; /* Move to the last value of a consecutive run */
! 3655: n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1, /* Gap starts just above the run */
! 3656: (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1); /* Gap ends just below the next value, or at the limit after the last value */
! 3657: p++; /* Advance to the first value of the next run, or to the terminator */
! 3658: }
! 3659: return n8; /* Sum of the add_to_class() results for every gap added */
! 3660: }
! 3661:
! 3662:
! 3663:
! 3664: /*************************************************
1.1 misho 3665: * Compile one branch *
3666: *************************************************/
3667:
3668: /* Scan the pattern, compiling it into a vector. If the options are
3669: changed during the branch, the pointer is used to change the external options
3670: bits. This function is used during the pre-compile phase when we are trying
3671: to find out the amount of memory needed, as well as during the real compile
3672: phase. The value of lengthptr distinguishes the two phases.
3673:
3674: Arguments:
3675: optionsptr pointer to the option bits
3676: codeptr points to the pointer to the current code point
3677: ptrptr points to the current pattern pointer
3678: errorcodeptr points to error code variable
1.1.1.2 ! misho 3679: firstcharptr place to put the first required character
! 3680: firstcharflagsptr place to put the first character flags, or a negative number
! 3681: reqcharptr place to put the last required character
! 3682: reqcharflagsptr place to put the last required character flags, or a negative number
1.1 misho 3683: bcptr points to current branch chain
1.1.1.2 ! misho 3684: cond_depth conditional nesting depth
1.1 misho 3685: cd contains pointers to tables etc.
3686: lengthptr NULL during the real compile phase
3687: points to length accumulator during pre-compile phase
3688:
3689: Returns: TRUE on success
3690: FALSE, with *errorcodeptr set non-zero on error
3691: */
3692:
3693: static BOOL
1.1.1.2 ! misho 3694: compile_branch(int *optionsptr, pcre_uchar **codeptr,
! 3695: const pcre_uchar **ptrptr, int *errorcodeptr,
! 3696: pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
! 3697: pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
! 3698: branch_chain *bcptr, int cond_depth,
1.1 misho 3699: compile_data *cd, int *lengthptr)
3700: {
3701: int repeat_type, op_type;
3702: int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
3703: int bravalue = 0;
3704: int greedy_default, greedy_non_default;
1.1.1.2 ! misho 3705: pcre_uint32 firstchar, reqchar;
! 3706: pcre_int32 firstcharflags, reqcharflags;
! 3707: pcre_uint32 zeroreqchar, zerofirstchar;
! 3708: pcre_int32 zeroreqcharflags, zerofirstcharflags;
! 3709: pcre_int32 req_caseopt, reqvary, tempreqvary;
! 3710: int options = *optionsptr; /* May change dynamically */
1.1 misho 3711: int after_manual_callout = 0;
3712: int length_prevgroup = 0;
1.1.1.2 ! misho 3713: register pcre_uint32 c;
! 3714: int escape;
! 3715: register pcre_uchar *code = *codeptr;
! 3716: pcre_uchar *last_code = code;
! 3717: pcre_uchar *orig_code = code;
! 3718: pcre_uchar *tempcode;
1.1 misho 3719: BOOL inescq = FALSE;
1.1.1.2 ! misho 3720: BOOL groupsetfirstchar = FALSE;
! 3721: const pcre_uchar *ptr = *ptrptr;
! 3722: const pcre_uchar *tempptr;
! 3723: const pcre_uchar *nestptr = NULL;
! 3724: pcre_uchar *previous = NULL;
! 3725: pcre_uchar *previous_callout = NULL;
! 3726: pcre_uchar *save_hwm = NULL;
! 3727: pcre_uint8 classbits[32];
! 3728:
! 3729: /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
! 3730: must not do this for other options (e.g. PCRE_EXTENDED) because they may change
! 3731: dynamically as we process the pattern. */
! 3732:
! 3733: #ifdef SUPPORT_UTF
! 3734: /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
! 3735: BOOL utf = (options & PCRE_UTF8) != 0;
! 3736: #ifndef COMPILE_PCRE32
! 3737: pcre_uchar utf_chars[6];
! 3738: #endif
1.1 misho 3739: #else
1.1.1.2 ! misho 3740: BOOL utf = FALSE;
! 3741: #endif
! 3742:
! 3743: /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
! 3744: class_uchardata always so that it can be passed to add_to_class() always,
! 3745: though it will not be used in non-UTF 8-bit cases. This avoids having to supply
! 3746: alternative calls for the different cases. */
! 3747:
! 3748: pcre_uchar *class_uchardata;
! 3749: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
! 3750: BOOL xclass;
! 3751: pcre_uchar *class_uchardata_base;
1.1 misho 3752: #endif
3753:
3754: #ifdef PCRE_DEBUG
3755: if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3756: #endif
3757:
3758: /* Set up the default and non-default settings for greediness */
3759:
3760: greedy_default = ((options & PCRE_UNGREEDY) != 0);
3761: greedy_non_default = greedy_default ^ 1;
3762:
3763: /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3764: matching encountered yet". It gets changed to REQ_NONE if we hit something that
1.1.1.2 ! misho 3765: matches a non-fixed char first char; reqchar just remains unset if we never
1.1 misho 3766: find one.
3767:
3768: When we hit a repeat whose minimum is zero, we may have to adjust these values
3769: to take the zero repeat into account. This is implemented by setting them to
1.1.1.2 ! misho 3770: zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
1.1 misho 3771: item types that can be repeated set these backoff variables appropriately. */
3772:
1.1.1.2 ! misho 3773: firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
! 3774: firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
1.1 misho 3775:
1.1.1.2 ! misho 3776: /* The variable req_caseopt contains either the REQ_CASELESS value
! 3777: or zero, according to the current setting of the caseless flag. The
! 3778: REQ_CASELESS leaves the lower 28 bit empty. It is added into the
! 3779: firstchar or reqchar variables to record the case status of the
! 3780: value. This is used only for ASCII characters. */
1.1 misho 3781:
1.1.1.2 ! misho 3782: req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
1.1 misho 3783:
3784: /* Switch on next character until the end of the branch */
3785:
3786: for (;; ptr++)
3787: {
3788: BOOL negate_class;
3789: BOOL should_flip_negation;
3790: BOOL possessive_quantifier;
3791: BOOL is_quantifier;
3792: BOOL is_recurse;
3793: BOOL reset_bracount;
1.1.1.2 ! misho 3794: int class_has_8bitchar;
! 3795: int class_one_char;
1.1 misho 3796: int newoptions;
3797: int recno;
3798: int refsign;
3799: int skipbytes;
1.1.1.2 ! misho 3800: pcre_uint32 subreqchar, subfirstchar;
! 3801: pcre_int32 subreqcharflags, subfirstcharflags;
1.1 misho 3802: int terminator;
1.1.1.2 ! misho 3803: unsigned int mclength;
! 3804: unsigned int tempbracount;
! 3805: pcre_uint32 ec;
! 3806: pcre_uchar mcbuffer[8];
1.1 misho 3807:
1.1.1.2 ! misho 3808: /* Get next character in the pattern */
1.1 misho 3809:
3810: c = *ptr;
3811:
3812: /* If we are at the end of a nested substitution, revert to the outer level
3813: string. Nesting only happens one level deep. */
3814:
1.1.1.2 ! misho 3815: if (c == CHAR_NULL && nestptr != NULL)
1.1 misho 3816: {
3817: ptr = nestptr;
3818: nestptr = NULL;
3819: c = *ptr;
3820: }
3821:
3822: /* If we are in the pre-compile phase, accumulate the length used for the
3823: previous cycle of this loop. */
3824:
3825: if (lengthptr != NULL)
3826: {
3827: #ifdef PCRE_DEBUG
3828: if (code > cd->hwm) cd->hwm = code; /* High water info */
3829: #endif
1.1.1.2 ! misho 3830: if (code > cd->start_workspace + cd->workspace_size -
! 3831: WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
1.1 misho 3832: {
3833: *errorcodeptr = ERR52;
3834: goto FAILED;
3835: }
3836:
3837: /* There is at least one situation where code goes backwards: this is the
3838: case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3839: the class is simply eliminated. However, it is created first, so we have to
3840: allow memory for it. Therefore, don't ever reduce the length at this point.
3841: */
3842:
3843: if (code < last_code) code = last_code;
3844:
3845: /* Paranoid check for integer overflow */
3846:
3847: if (OFLOW_MAX - *lengthptr < code - last_code)
3848: {
3849: *errorcodeptr = ERR20;
3850: goto FAILED;
3851: }
3852:
3853: *lengthptr += (int)(code - last_code);
1.1.1.2 ! misho 3854: DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
! 3855: (int)(code - last_code), c, c));
1.1 misho 3856:
3857: /* If "previous" is set and it is not at the start of the work space, move
3858: it back to there, in order to avoid filling up the work space. Otherwise,
3859: if "previous" is NULL, reset the current code pointer to the start. */
3860:
3861: if (previous != NULL)
3862: {
3863: if (previous > orig_code)
3864: {
1.1.1.2 ! misho 3865: memmove(orig_code, previous, IN_UCHARS(code - previous));
1.1 misho 3866: code -= previous - orig_code;
3867: previous = orig_code;
3868: }
3869: }
3870: else code = orig_code;
3871:
3872: /* Remember where this code item starts so we can pick up the length
3873: next time round. */
3874:
3875: last_code = code;
3876: }
3877:
3878: /* In the real compile phase, just check the workspace used by the forward
3879: reference list. */
3880:
1.1.1.2 ! misho 3881: else if (cd->hwm > cd->start_workspace + cd->workspace_size -
! 3882: WORK_SIZE_SAFETY_MARGIN)
1.1 misho 3883: {
3884: *errorcodeptr = ERR52;
3885: goto FAILED;
3886: }
3887:
3888: /* If in \Q...\E, check for the end; if not, we have a literal */
3889:
1.1.1.2 ! misho 3890: if (inescq && c != CHAR_NULL)
1.1 misho 3891: {
3892: if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3893: {
3894: inescq = FALSE;
3895: ptr++;
3896: continue;
3897: }
3898: else
3899: {
3900: if (previous_callout != NULL)
3901: {
3902: if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3903: complete_callout(previous_callout, ptr, cd);
3904: previous_callout = NULL;
3905: }
3906: if ((options & PCRE_AUTO_CALLOUT) != 0)
3907: {
3908: previous_callout = code;
3909: code = auto_callout(code, ptr, cd);
3910: }
3911: goto NORMAL_CHAR;
3912: }
3913: }
3914:
3915: /* Fill in length of a previous callout, except when the next thing is
3916: a quantifier. */
3917:
3918: is_quantifier =
3919: c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3920: (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3921:
3922: if (!is_quantifier && previous_callout != NULL &&
3923: after_manual_callout-- <= 0)
3924: {
3925: if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3926: complete_callout(previous_callout, ptr, cd);
3927: previous_callout = NULL;
3928: }
3929:
1.1.1.2 ! misho 3930: /* In extended mode, skip white space and comments. */
1.1 misho 3931:
3932: if ((options & PCRE_EXTENDED) != 0)
3933: {
1.1.1.2 ! misho 3934: if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
1.1 misho 3935: if (c == CHAR_NUMBER_SIGN)
3936: {
3937: ptr++;
1.1.1.2 ! misho 3938: while (*ptr != CHAR_NULL)
1.1 misho 3939: {
3940: if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3941: ptr++;
1.1.1.2 ! misho 3942: #ifdef SUPPORT_UTF
! 3943: if (utf) FORWARDCHAR(ptr);
1.1 misho 3944: #endif
3945: }
1.1.1.2 ! misho 3946: if (*ptr != CHAR_NULL) continue;
1.1 misho 3947:
3948: /* Else fall through to handle end of string */
3949: c = 0;
3950: }
3951: }
3952:
3953: /* No auto callout for quantifiers. */
3954:
3955: if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3956: {
3957: previous_callout = code;
3958: code = auto_callout(code, ptr, cd);
3959: }
3960:
3961: switch(c)
3962: {
3963: /* ===================================================================*/
3964: case 0: /* The branch terminates at string end */
3965: case CHAR_VERTICAL_LINE: /* or | or ) */
3966: case CHAR_RIGHT_PARENTHESIS:
1.1.1.2 ! misho 3967: *firstcharptr = firstchar;
! 3968: *firstcharflagsptr = firstcharflags;
! 3969: *reqcharptr = reqchar;
! 3970: *reqcharflagsptr = reqcharflags;
1.1 misho 3971: *codeptr = code;
3972: *ptrptr = ptr;
3973: if (lengthptr != NULL)
3974: {
3975: if (OFLOW_MAX - *lengthptr < code - last_code)
3976: {
3977: *errorcodeptr = ERR20;
3978: goto FAILED;
3979: }
3980: *lengthptr += (int)(code - last_code); /* To include callout length */
3981: DPRINTF((">> end branch\n"));
3982: }
3983: return TRUE;
3984:
3985:
3986: /* ===================================================================*/
3987: /* Handle single-character metacharacters. In multiline mode, ^ disables
3988: the setting of any following char as a first character. */
3989:
3990: case CHAR_CIRCUMFLEX_ACCENT:
1.1.1.2 ! misho 3991: previous = NULL;
1.1 misho 3992: if ((options & PCRE_MULTILINE) != 0)
3993: {
1.1.1.2 ! misho 3994: if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
! 3995: *code++ = OP_CIRCM;
1.1 misho 3996: }
1.1.1.2 ! misho 3997: else *code++ = OP_CIRC;
1.1 misho 3998: break;
3999:
4000: case CHAR_DOLLAR_SIGN:
4001: previous = NULL;
1.1.1.2 ! misho 4002: *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
1.1 misho 4003: break;
4004:
4005: /* There can never be a first char if '.' is first, whatever happens about
1.1.1.2 ! misho 4006: repeats. The value of reqchar doesn't change either. */
1.1 misho 4007:
4008: case CHAR_DOT:
1.1.1.2 ! misho 4009: if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
! 4010: zerofirstchar = firstchar;
! 4011: zerofirstcharflags = firstcharflags;
! 4012: zeroreqchar = reqchar;
! 4013: zeroreqcharflags = reqcharflags;
1.1 misho 4014: previous = code;
4015: *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4016: break;
4017:
4018:
4019: /* ===================================================================*/
4020: /* Character classes. If the included characters are all < 256, we build a
4021: 32-byte bitmap of the permitted characters, except in the special case
4022: where there is only one such character. For negated classes, we build the
4023: map as usual, then invert it at the end. However, we use a different opcode
4024: so that data characters > 255 can be handled correctly.
4025:
4026: If the class contains characters outside the 0-255 range, a different
4027: opcode is compiled. It may optionally have a bit map for characters < 256,
4028: but those above are explicitly listed afterwards. A flag byte tells
4029: whether the bitmap is present, and whether this is a negated class or not.
4030:
4031: In JavaScript compatibility mode, an isolated ']' causes an error. In
4032: default (Perl) mode, it is treated as a data character. */
4033:
4034: case CHAR_RIGHT_SQUARE_BRACKET:
4035: if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4036: {
4037: *errorcodeptr = ERR64;
4038: goto FAILED;
4039: }
4040: goto NORMAL_CHAR;
4041:
4042: case CHAR_LEFT_SQUARE_BRACKET:
4043: previous = code;
4044:
4045: /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4046: they are encountered at the top level, so we'll do that too. */
4047:
4048: if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4049: ptr[1] == CHAR_EQUALS_SIGN) &&
4050: check_posix_syntax(ptr, &tempptr))
4051: {
4052: *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4053: goto FAILED;
4054: }
4055:
4056: /* If the first character is '^', set the negation flag and skip it. Also,
4057: if the first few characters (either before or after ^) are \Q\E or \E we
4058: skip them too. This makes for compatibility with Perl. */
4059:
4060: negate_class = FALSE;
4061: for (;;)
4062: {
4063: c = *(++ptr);
4064: if (c == CHAR_BACKSLASH)
4065: {
4066: if (ptr[1] == CHAR_E)
4067: ptr++;
1.1.1.2 ! misho 4068: else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
1.1 misho 4069: ptr += 3;
4070: else
4071: break;
4072: }
4073: else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4074: negate_class = TRUE;
4075: else break;
4076: }
4077:
4078: /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4079: an initial ']' is taken as a data character -- the code below handles
4080: that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4081: [^] must match any character, so generate OP_ALLANY. */
4082:
4083: if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4084: (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4085: {
4086: *code++ = negate_class? OP_ALLANY : OP_FAIL;
1.1.1.2 ! misho 4087: if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
! 4088: zerofirstchar = firstchar;
! 4089: zerofirstcharflags = firstcharflags;
1.1 misho 4090: break;
4091: }
4092:
4093: /* If a class contains a negative special such as \S, we need to flip the
4094: negation flag at the end, so that support for characters > 255 works
4095: correctly (they are all included in the class). */
4096:
4097: should_flip_negation = FALSE;
4098:
1.1.1.2 ! misho 4099: /* For optimization purposes, we track some properties of the class:
! 4100: class_has_8bitchar will be non-zero if the class contains at least one <
! 4101: 256 character; class_one_char will be 1 if the class contains just one
! 4102: character. */
1.1 misho 4103:
1.1.1.2 ! misho 4104: class_has_8bitchar = 0;
! 4105: class_one_char = 0;
1.1 misho 4106:
4107: /* Initialize the 32-char bit map to all zeros. We build the map in a
1.1.1.2 ! misho 4108: temporary bit of memory, in case the class contains fewer than two
! 4109: 8-bit characters because in that case the compiled code doesn't use the bit
! 4110: map. */
! 4111:
! 4112: memset(classbits, 0, 32 * sizeof(pcre_uint8));
! 4113:
! 4114: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
! 4115: xclass = FALSE;
! 4116: class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
! 4117: class_uchardata_base = class_uchardata; /* Save the start */
1.1 misho 4118: #endif
4119:
4120: /* Process characters until ] is reached. By writing this as a "do" it
4121: means that an initial ] is taken as a data character. At the start of the
4122: loop, c contains the first byte of the character. */
4123:
1.1.1.2 ! misho 4124: if (c != CHAR_NULL) do
1.1 misho 4125: {
1.1.1.2 ! misho 4126: const pcre_uchar *oldptr;
1.1 misho 4127:
1.1.1.2 ! misho 4128: #ifdef SUPPORT_UTF
! 4129: if (utf && HAS_EXTRALEN(c))
1.1 misho 4130: { /* Braces are required because the */
4131: GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
4132: }
1.1.1.2 ! misho 4133: #endif
1.1 misho 4134:
1.1.1.2 ! misho 4135: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
! 4136: /* In the pre-compile phase, accumulate the length of any extra
1.1 misho 4137: data and reset the pointer. This is so that very large classes that
1.1.1.2 ! misho 4138: contain a zillion > 255 characters no longer overwrite the work space
! 4139: (which is on the stack). We have to remember that there was XCLASS data,
! 4140: however. */
! 4141:
! 4142: if (lengthptr != NULL && class_uchardata > class_uchardata_base)
! 4143: {
! 4144: xclass = TRUE;
! 4145: *lengthptr += class_uchardata - class_uchardata_base;
! 4146: class_uchardata = class_uchardata_base;
1.1 misho 4147: }
4148: #endif
4149:
4150: /* Inside \Q...\E everything is literal except \E */
4151:
4152: if (inescq)
4153: {
4154: if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
4155: {
4156: inescq = FALSE; /* Reset literal state */
4157: ptr++; /* Skip the 'E' */
4158: continue; /* Carry on with next */
4159: }
4160: goto CHECK_RANGE; /* Could be range if \E follows */
4161: }
4162:
4163: /* Handle POSIX class names. Perl allows a negation extension of the
4164: form [:^name:]. A square bracket that doesn't match the syntax is
4165: treated as a literal. We also recognize the POSIX constructions
4166: [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4167: 5.6 and 5.8 do. */
4168:
4169: if (c == CHAR_LEFT_SQUARE_BRACKET &&
4170: (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4171: ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
4172: {
4173: BOOL local_negate = FALSE;
4174: int posix_class, taboffset, tabopt;
1.1.1.2 ! misho 4175: register const pcre_uint8 *cbits = cd->cbits;
! 4176: pcre_uint8 pbits[32];
1.1 misho 4177:
4178: if (ptr[1] != CHAR_COLON)
4179: {
4180: *errorcodeptr = ERR31;
4181: goto FAILED;
4182: }
4183:
4184: ptr += 2;
4185: if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
4186: {
4187: local_negate = TRUE;
4188: should_flip_negation = TRUE; /* Note negative special */
4189: ptr++;
4190: }
4191:
4192: posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4193: if (posix_class < 0)
4194: {
4195: *errorcodeptr = ERR30;
4196: goto FAILED;
4197: }
4198:
4199: /* If matching is caseless, upper and lower are converted to
4200: alpha. This relies on the fact that the class table starts with
4201: alpha, lower, upper as the first 3 entries. */
4202:
4203: if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
4204: posix_class = 0;
4205:
4206: /* When PCRE_UCP is set, some of the POSIX classes are converted to
4207: different escape sequences that use Unicode properties. */
4208:
4209: #ifdef SUPPORT_UCP
4210: if ((options & PCRE_UCP) != 0)
4211: {
4212: int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
4213: if (posix_substitutes[pc] != NULL)
4214: {
4215: nestptr = tempptr + 1;
4216: ptr = posix_substitutes[pc] - 1;
4217: continue;
4218: }
4219: }
4220: #endif
4221: /* In the non-UCP case, we build the bit map for the POSIX class in a
4222: chunk of local store because we may be adding and subtracting from it,
4223: and we don't want to subtract bits that may be in the main map already.
4224: At the end we or the result into the bit map that is being built. */
4225:
4226: posix_class *= 3;
4227:
4228: /* Copy in the first table (always present) */
4229:
4230: memcpy(pbits, cbits + posix_class_maps[posix_class],
1.1.1.2 ! misho 4231: 32 * sizeof(pcre_uint8));
1.1 misho 4232:
4233: /* If there is a second table, add or remove it as required. */
4234:
4235: taboffset = posix_class_maps[posix_class + 1];
4236: tabopt = posix_class_maps[posix_class + 2];
4237:
4238: if (taboffset >= 0)
4239: {
4240: if (tabopt >= 0)
4241: for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
4242: else
4243: for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
4244: }
4245:
1.1.1.2 ! misho 4246: /* Now see if we need to remove any special characters. An option
1.1 misho 4247: value of 1 removes vertical space and 2 removes underscore. */
4248:
4249: if (tabopt < 0) tabopt = -tabopt;
4250: if (tabopt == 1) pbits[1] &= ~0x3c;
4251: else if (tabopt == 2) pbits[11] &= 0x7f;
4252:
4253: /* Add the POSIX table or its complement into the main table that is
4254: being built and we are done. */
4255:
4256: if (local_negate)
4257: for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
4258: else
4259: for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
4260:
4261: ptr = tempptr + 1;
1.1.1.2 ! misho 4262: /* Every class contains at least one < 256 character. */
! 4263: class_has_8bitchar = 1;
! 4264: /* Every class contains at least two characters. */
! 4265: class_one_char = 2;
1.1 misho 4266: continue; /* End of POSIX syntax handling */
4267: }
4268:
4269: /* Backslash may introduce a single character, or it may introduce one
4270: of the specials, which just set a flag. The sequence \b is a special
4271: case. Inside a class (and only there) it is treated as backspace. We
1.1.1.2 ! misho 4272: assume that other escapes have more than one character in them, so
! 4273: speculatively set both class_has_8bitchar and class_one_char bigger
! 4274: than one. Unrecognized escapes fall through and are either treated
! 4275: as literal characters (by default), or are faulted if
1.1 misho 4276: PCRE_EXTRA is set. */
4277:
4278: if (c == CHAR_BACKSLASH)
4279: {
1.1.1.2 ! misho 4280: escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, TRUE);
! 4281:
1.1 misho 4282: if (*errorcodeptr != 0) goto FAILED;
4283:
1.1.1.2 ! misho 4284: if (escape == 0)
! 4285: c = ec;
! 4286: else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
! 4287: else if (escape == ESC_N) /* \N is not supported in a class */
! 4288: {
! 4289: *errorcodeptr = ERR71;
! 4290: goto FAILED;
! 4291: }
! 4292: else if (escape == ESC_Q) /* Handle start of quoted string */
1.1 misho 4293: {
4294: if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4295: {
4296: ptr += 2; /* avoid empty string */
4297: }
4298: else inescq = TRUE;
4299: continue;
4300: }
1.1.1.2 ! misho 4301: else if (escape == ESC_E) continue; /* Ignore orphan \E */
1.1 misho 4302:
1.1.1.2 ! misho 4303: else
1.1 misho 4304: {
1.1.1.2 ! misho 4305: register const pcre_uint8 *cbits = cd->cbits;
! 4306: /* Every class contains at least two < 256 characters. */
! 4307: class_has_8bitchar++;
! 4308: /* Every class contains at least two characters. */
! 4309: class_one_char += 2;
1.1 misho 4310:
1.1.1.2 ! misho 4311: switch (escape)
1.1 misho 4312: {
4313: #ifdef SUPPORT_UCP
4314: case ESC_du: /* These are the values given for \d etc */
4315: case ESC_DU: /* when PCRE_UCP is set. We replace the */
4316: case ESC_wu: /* escape sequence with an appropriate \p */
4317: case ESC_WU: /* or \P to test Unicode properties instead */
4318: case ESC_su: /* of the default ASCII testing. */
4319: case ESC_SU:
4320: nestptr = ptr;
1.1.1.2 ! misho 4321: ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
! 4322: class_has_8bitchar--; /* Undo! */
1.1 misho 4323: continue;
4324: #endif
4325: case ESC_d:
4326: for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
4327: continue;
4328:
4329: case ESC_D:
4330: should_flip_negation = TRUE;
4331: for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
4332: continue;
4333:
4334: case ESC_w:
4335: for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
4336: continue;
4337:
4338: case ESC_W:
4339: should_flip_negation = TRUE;
4340: for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4341: continue;
4342:
4343: /* Perl 5.004 onwards omits VT from \s, but we must preserve it
4344: if it was previously set by something earlier in the character
1.1.1.2 ! misho 4345: class. Luckily, the value of CHAR_VT is 0x0b in both ASCII and
! 4346: EBCDIC, so we lazily just adjust the appropriate bit. */
1.1 misho 4347:
4348: case ESC_s:
4349: classbits[0] |= cbits[cbit_space];
4350: classbits[1] |= cbits[cbit_space+1] & ~0x08;
4351: for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
4352: continue;
4353:
4354: case ESC_S:
4355: should_flip_negation = TRUE;
4356: for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
4357: classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
4358: continue;
4359:
1.1.1.2 ! misho 4360: /* The rest apply in both UCP and non-UCP cases. */
! 4361:
1.1 misho 4362: case ESC_h:
1.1.1.2 ! misho 4363: (void)add_list_to_class(classbits, &class_uchardata, options, cd,
! 4364: PRIV(hspace_list), NOTACHAR);
1.1 misho 4365: continue;
4366:
4367: case ESC_H:
1.1.1.2 ! misho 4368: (void)add_not_list_to_class(classbits, &class_uchardata, options,
! 4369: cd, PRIV(hspace_list));
1.1 misho 4370: continue;
4371:
4372: case ESC_v:
1.1.1.2 ! misho 4373: (void)add_list_to_class(classbits, &class_uchardata, options, cd,
! 4374: PRIV(vspace_list), NOTACHAR);
1.1 misho 4375: continue;
4376:
4377: case ESC_V:
1.1.1.2 ! misho 4378: (void)add_not_list_to_class(classbits, &class_uchardata, options,
! 4379: cd, PRIV(vspace_list));
1.1 misho 4380: continue;
4381:
4382: #ifdef SUPPORT_UCP
4383: case ESC_p:
4384: case ESC_P:
4385: {
4386: BOOL negated;
1.1.1.2 ! misho 4387: unsigned int ptype = 0, pdata = 0;
! 4388: if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
! 4389: goto FAILED;
! 4390: *class_uchardata++ = ((escape == ESC_p) != negated)?
1.1 misho 4391: XCL_PROP : XCL_NOTPROP;
1.1.1.2 ! misho 4392: *class_uchardata++ = ptype;
! 4393: *class_uchardata++ = pdata;
! 4394: class_has_8bitchar--; /* Undo! */
1.1 misho 4395: continue;
4396: }
4397: #endif
4398: /* Unrecognized escapes are faulted if PCRE is running in its
4399: strict mode. By default, for compatibility with Perl, they are
4400: treated as literals. */
4401:
4402: default:
4403: if ((options & PCRE_EXTRA) != 0)
4404: {
4405: *errorcodeptr = ERR7;
4406: goto FAILED;
4407: }
1.1.1.2 ! misho 4408: class_has_8bitchar--; /* Undo the speculative increase. */
! 4409: class_one_char -= 2; /* Undo the speculative increase. */
! 4410: c = *ptr; /* Get the final character and fall through */
1.1 misho 4411: break;
4412: }
4413: }
4414:
1.1.1.2 ! misho 4415: /* Fall through if the escape just defined a single character (c >= 0).
! 4416: This may be greater than 256. */
! 4417:
! 4418: escape = 0;
1.1 misho 4419:
4420: } /* End of backslash handling */
4421:
1.1.1.2 ! misho 4422: /* A character may be followed by '-' to form a range. However, Perl does
! 4423: not permit ']' to be the end of the range. A '-' character at the end is
! 4424: treated as a literal. Perl ignores orphaned \E sequences entirely. The
! 4425: code for handling \Q and \E is messy. */
1.1 misho 4426:
4427: CHECK_RANGE:
4428: while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4429: {
4430: inescq = FALSE;
4431: ptr += 2;
4432: }
4433: oldptr = ptr;
4434:
1.1.1.2 ! misho 4435: /* Remember if \r or \n were explicitly used */
1.1 misho 4436:
4437: if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4438:
4439: /* Check for range */
4440:
4441: if (!inescq && ptr[1] == CHAR_MINUS)
4442: {
1.1.1.2 ! misho 4443: pcre_uint32 d;
1.1 misho 4444: ptr += 2;
4445: while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
4446:
4447: /* If we hit \Q (not followed by \E) at this point, go into escaped
4448: mode. */
4449:
4450: while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4451: {
4452: ptr += 2;
4453: if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4454: { ptr += 2; continue; }
4455: inescq = TRUE;
4456: break;
4457: }
4458:
1.1.1.2 ! misho 4459: /* Minus (hyphen) at the end of a class is treated as a literal, so put
! 4460: back the pointer and jump to handle the character that preceded it. */
! 4461:
! 4462: if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
1.1 misho 4463: {
4464: ptr = oldptr;
1.1.1.2 ! misho 4465: goto CLASS_SINGLE_CHARACTER;
1.1 misho 4466: }
4467:
1.1.1.2 ! misho 4468: /* Otherwise, we have a potential range; pick up the next character */
! 4469:
! 4470: #ifdef SUPPORT_UTF
! 4471: if (utf)
1.1 misho 4472: { /* Braces are required because the */
4473: GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
4474: }
4475: else
4476: #endif
4477: d = *ptr; /* Not UTF-8 mode */
4478:
4479: /* The second part of a range can be a single-character escape, but
4480: not any of the other escapes. Perl 5.6 treats a hyphen as a literal
4481: in such circumstances. */
4482:
4483: if (!inescq && d == CHAR_BACKSLASH)
4484: {
1.1.1.2 ! misho 4485: int descape;
! 4486: descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
1.1 misho 4487: if (*errorcodeptr != 0) goto FAILED;
4488:
1.1.1.2 ! misho 4489: /* \b is backspace; any other special means the '-' was literal. */
1.1 misho 4490:
1.1.1.2 ! misho 4491: if (descape != 0)
1.1 misho 4492: {
1.1.1.2 ! misho 4493: if (descape == ESC_b) d = CHAR_BS; else
1.1 misho 4494: {
4495: ptr = oldptr;
1.1.1.2 ! misho 4496: goto CLASS_SINGLE_CHARACTER; /* A few lines below */
1.1 misho 4497: }
4498: }
4499: }
4500:
4501: /* Check that the two values are in the correct order. Optimize
1.1.1.2 ! misho 4502: one-character ranges. */
1.1 misho 4503:
4504: if (d < c)
4505: {
4506: *errorcodeptr = ERR8;
4507: goto FAILED;
4508: }
1.1.1.2 ! misho 4509: if (d == c) goto CLASS_SINGLE_CHARACTER; /* A few lines below */
1.1 misho 4510:
1.1.1.2 ! misho 4511: /* We have found a character range, so single character optimizations
! 4512: cannot be done anymore. Any value greater than 1 indicates that there
! 4513: is more than one character. */
1.1 misho 4514:
1.1.1.2 ! misho 4515: class_one_char = 2;
1.1 misho 4516:
1.1.1.2 ! misho 4517: /* Remember an explicit \r or \n, and add the range to the class. */
1.1 misho 4518:
1.1.1.2 ! misho 4519: if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
1.1 misho 4520:
1.1.1.2 ! misho 4521: class_has_8bitchar +=
! 4522: add_to_class(classbits, &class_uchardata, options, cd, c, d);
1.1 misho 4523:
1.1.1.2 ! misho 4524: continue; /* Go get the next char in the class */
! 4525: }
1.1 misho 4526:
1.1.1.2 ! misho 4527: /* Handle a single character - we can get here for a normal non-escape
! 4528: char, or after \ that introduces a single character or for an apparent
! 4529: range that isn't. Only the value 1 matters for class_one_char, so don't
! 4530: increase it if it is already 2 or more ... just in case there's a class
! 4531: with a zillion characters in it. */
! 4532:
! 4533: CLASS_SINGLE_CHARACTER:
! 4534: if (class_one_char < 2) class_one_char++;
! 4535:
! 4536: /* If class_one_char is 1, we have the first single character in the
! 4537: class, and there have been no prior ranges, or XCLASS items generated by
! 4538: escapes. If this is the final character in the class, we can optimize by
! 4539: turning the item into a 1-character OP_CHAR[I] if it's positive, or
! 4540: OP_NOT[I] if it's negative. In the positive case, it can cause firstchar
! 4541: to be set. Otherwise, there can be no first char if this item is first,
! 4542: whatever repeat count may follow. In the case of reqchar, save the
! 4543: previous value for reinstating. */
1.1 misho 4544:
1.1.1.2 ! misho 4545: if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
! 4546: {
! 4547: ptr++;
! 4548: zeroreqchar = reqchar;
! 4549: zeroreqcharflags = reqcharflags;
1.1 misho 4550:
1.1.1.2 ! misho 4551: if (negate_class)
! 4552: {
1.1 misho 4553: #ifdef SUPPORT_UCP
1.1.1.2 ! misho 4554: int d;
! 4555: #endif
! 4556: if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
! 4557: zerofirstchar = firstchar;
! 4558: zerofirstcharflags = firstcharflags;
! 4559:
! 4560: /* For caseless UTF-8 mode when UCP support is available, check
! 4561: whether this character has more than one other case. If so, generate
! 4562: a special OP_NOTPROP item instead of OP_NOTI. */
1.1 misho 4563:
1.1.1.2 ! misho 4564: #ifdef SUPPORT_UCP
! 4565: if (utf && (options & PCRE_CASELESS) != 0 &&
! 4566: (d = UCD_CASESET(c)) != 0)
1.1 misho 4567: {
1.1.1.2 ! misho 4568: *code++ = OP_NOTPROP;
! 4569: *code++ = PT_CLIST;
! 4570: *code++ = d;
1.1 misho 4571: }
1.1.1.2 ! misho 4572: else
! 4573: #endif
! 4574: /* Char has only one other case, or UCP not available */
1.1 misho 4575:
1.1.1.2 ! misho 4576: {
! 4577: *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
! 4578: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
! 4579: if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
! 4580: code += PRIV(ord2utf)(c, code);
! 4581: else
! 4582: #endif
! 4583: *code++ = c;
! 4584: }
1.1 misho 4585:
1.1.1.2 ! misho 4586: /* We are finished with this character class */
1.1 misho 4587:
1.1.1.2 ! misho 4588: goto END_CLASS;
! 4589: }
1.1 misho 4590:
1.1.1.2 ! misho 4591: /* For a single, positive character, get the value into mcbuffer, and
! 4592: then we can handle this with the normal one-character code. */
1.1 misho 4593:
1.1.1.2 ! misho 4594: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
! 4595: if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
! 4596: mclength = PRIV(ord2utf)(c, mcbuffer);
! 4597: else
! 4598: #endif
1.1 misho 4599: {
1.1.1.2 ! misho 4600: mcbuffer[0] = c;
! 4601: mclength = 1;
1.1 misho 4602: }
1.1.1.2 ! misho 4603: goto ONE_CHAR;
! 4604: } /* End of 1-char optimization */
1.1 misho 4605:
1.1.1.2 ! misho 4606: /* There is more than one character in the class, or an XCLASS item
! 4607: has been generated. Add this character to the class. */
1.1 misho 4608:
1.1.1.2 ! misho 4609: class_has_8bitchar +=
! 4610: add_to_class(classbits, &class_uchardata, options, cd, c, c);
1.1 misho 4611: }
4612:
4613: /* Loop until ']' reached. This "while" is the end of the "do" far above.
4614: If we are at the end of an internal nested string, revert to the outer
4615: string. */
4616:
1.1.1.2 ! misho 4617: while (((c = *(++ptr)) != CHAR_NULL ||
1.1 misho 4618: (nestptr != NULL &&
1.1.1.2 ! misho 4619: (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
1.1 misho 4620: (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
4621:
4622: /* Check for missing terminating ']' */
4623:
1.1.1.2 ! misho 4624: if (c == CHAR_NULL)
! 4625: {
! 4626: *errorcodeptr = ERR6;
! 4627: goto FAILED;
! 4628: }
1.1 misho 4629:
1.1.1.2 ! misho 4630: /* We will need an XCLASS if data has been placed in class_uchardata. In
! 4631: the second phase this is a sufficient test. However, in the pre-compile
! 4632: phase, class_uchardata gets emptied to prevent workspace overflow, so
! 4633: only if the very last character in the class needs XCLASS will it contain
! 4634: anything at this point. For this reason, xclass gets set TRUE above when
! 4635: class_uchardata is emptied, and that's why this code is the way it is here
! 4636: instead of just doing a test on class_uchardata below. */
! 4637:
! 4638: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
! 4639: if (class_uchardata > class_uchardata_base) xclass = TRUE;
! 4640: #endif
! 4641:
! 4642: /* If this is the first thing in the branch, there can be no first char
! 4643: setting, whatever the repeat count. Any reqchar setting must remain
! 4644: unchanged after any kind of repeat. */
! 4645:
! 4646: if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
! 4647: zerofirstchar = firstchar;
! 4648: zerofirstcharflags = firstcharflags;
! 4649: zeroreqchar = reqchar;
! 4650: zeroreqcharflags = reqcharflags;
1.1 misho 4651:
4652: /* If there are characters with values > 255, we have to compile an
4653: extended class, with its own opcode, unless there was a negated special
4654: such as \S in the class, and PCRE_UCP is not set, because in that case all
4655: characters > 255 are in the class, so any that were explicitly given as
4656: well can be ignored. If (when there are explicit characters > 255 that must
4657: be listed) there are no characters < 256, we can omit the bitmap in the
4658: actual compiled code. */
4659:
1.1.1.2 ! misho 4660: #ifdef SUPPORT_UTF
! 4661: if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
! 4662: #elif !defined COMPILE_PCRE8
! 4663: if (xclass && !should_flip_negation)
! 4664: #endif
! 4665: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1.1 misho 4666: {
1.1.1.2 ! misho 4667: *class_uchardata++ = XCL_END; /* Marks the end of extra data */
1.1 misho 4668: *code++ = OP_XCLASS;
4669: code += LINK_SIZE;
1.1.1.2 ! misho 4670: *code = negate_class? XCL_NOT:0;
1.1 misho 4671:
4672: /* If the map is required, move up the extra data to make room for it;
4673: otherwise just move the code pointer to the end of the extra data. */
4674:
1.1.1.2 ! misho 4675: if (class_has_8bitchar > 0)
1.1 misho 4676: {
4677: *code++ |= XCL_MAP;
1.1.1.2 ! misho 4678: memmove(code + (32 / sizeof(pcre_uchar)), code,
! 4679: IN_UCHARS(class_uchardata - code));
1.1 misho 4680: memcpy(code, classbits, 32);
1.1.1.2 ! misho 4681: code = class_uchardata + (32 / sizeof(pcre_uchar));
1.1 misho 4682: }
1.1.1.2 ! misho 4683: else code = class_uchardata;
1.1 misho 4684:
4685: /* Now fill in the complete length of the item */
4686:
1.1.1.2 ! misho 4687: PUT(previous, 1, (int)(code - previous));
1.1 misho 4688: break; /* End of class handling */
4689: }
4690: #endif
4691:
4692: /* If there are no characters > 255, or they are all to be included or
4693: excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
4694: whole class was negated and whether there were negative specials such as \S
4695: (non-UCP) in the class. Then copy the 32-byte map into the code vector,
4696: negating it if necessary. */
4697:
4698: *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
1.1.1.2 ! misho 4699: if (lengthptr == NULL) /* Save time in the pre-compile phase */
1.1 misho 4700: {
1.1.1.2 ! misho 4701: if (negate_class)
! 4702: for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
1.1 misho 4703: memcpy(code, classbits, 32);
4704: }
1.1.1.2 ! misho 4705: code += 32 / sizeof(pcre_uchar);
! 4706:
! 4707: END_CLASS:
1.1 misho 4708: break;
4709:
4710:
4711: /* ===================================================================*/
4712: /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
4713: has been tested above. */
4714:
4715: case CHAR_LEFT_CURLY_BRACKET:
4716: if (!is_quantifier) goto NORMAL_CHAR;
4717: ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
4718: if (*errorcodeptr != 0) goto FAILED;
4719: goto REPEAT;
4720:
4721: case CHAR_ASTERISK:
4722: repeat_min = 0;
4723: repeat_max = -1;
4724: goto REPEAT;
4725:
4726: case CHAR_PLUS:
4727: repeat_min = 1;
4728: repeat_max = -1;
4729: goto REPEAT;
4730:
4731: case CHAR_QUESTION_MARK:
4732: repeat_min = 0;
4733: repeat_max = 1;
4734:
4735: REPEAT:
4736: if (previous == NULL)
4737: {
4738: *errorcodeptr = ERR9;
4739: goto FAILED;
4740: }
4741:
4742: if (repeat_min == 0)
4743: {
1.1.1.2 ! misho 4744: firstchar = zerofirstchar; /* Adjust for zero repeat */
! 4745: firstcharflags = zerofirstcharflags;
! 4746: reqchar = zeroreqchar; /* Ditto */
! 4747: reqcharflags = zeroreqcharflags;
1.1 misho 4748: }
4749:
4750: /* Remember whether this is a variable length repeat */
4751:
4752: reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
4753:
4754: op_type = 0; /* Default single-char op codes */
4755: possessive_quantifier = FALSE; /* Default not possessive quantifier */
4756:
1.1.1.2 ! misho 4757: /* Save start of previous item, in case we have to move it up in order to
! 4758: insert something before it. */
1.1 misho 4759:
4760: tempcode = previous;
4761:
4762: /* If the next character is '+', we have a possessive quantifier. This
4763: implies greediness, whatever the setting of the PCRE_UNGREEDY option.
4764: If the next character is '?' this is a minimizing repeat, by default,
4765: but if PCRE_UNGREEDY is set, it works the other way round. We change the
4766: repeat type to the non-default. */
4767:
4768: if (ptr[1] == CHAR_PLUS)
4769: {
4770: repeat_type = 0; /* Force greedy */
4771: possessive_quantifier = TRUE;
4772: ptr++;
4773: }
4774: else if (ptr[1] == CHAR_QUESTION_MARK)
4775: {
4776: repeat_type = greedy_non_default;
4777: ptr++;
4778: }
4779: else repeat_type = greedy_default;
4780:
1.1.1.2 ! misho 4781: /* If previous was a recursion call, wrap it in atomic brackets so that
! 4782: previous becomes the atomic group. All recursions were so wrapped in the
! 4783: past, but it no longer happens for non-repeated recursions. In fact, the
! 4784: repeated ones could be re-implemented independently so as not to need this,
! 4785: but for the moment we rely on the code for repeating groups. */
! 4786:
! 4787: if (*previous == OP_RECURSE)
! 4788: {
! 4789: memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
! 4790: *previous = OP_ONCE;
! 4791: PUT(previous, 1, 2 + 2*LINK_SIZE);
! 4792: previous[2 + 2*LINK_SIZE] = OP_KET;
! 4793: PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
! 4794: code += 2 + 2 * LINK_SIZE;
! 4795: length_prevgroup = 3 + 3*LINK_SIZE;
! 4796:
! 4797: /* When actually compiling, we need to check whether this was a forward
! 4798: reference, and if so, adjust the offset. */
! 4799:
! 4800: if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
! 4801: {
! 4802: int offset = GET(cd->hwm, -LINK_SIZE);
! 4803: if (offset == previous + 1 - cd->start_code)
! 4804: PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
! 4805: }
! 4806: }
! 4807:
! 4808: /* Now handle repetition for the different types of item. */
1.1 misho 4809:
1.1.1.2 ! misho 4810: /* If previous was a character or negated character match, abolish the item
! 4811: and generate a repeat item instead. If a char item has a minimum of more
! 4812: than one, ensure that it is set in reqchar - it might not be if a sequence
! 4813: such as x{3} is the first thing in a branch because the x will have gone
! 4814: into firstchar instead. */
! 4815:
! 4816: if (*previous == OP_CHAR || *previous == OP_CHARI
! 4817: || *previous == OP_NOT || *previous == OP_NOTI)
1.1 misho 4818: {
1.1.1.2 ! misho 4819: switch (*previous)
! 4820: {
! 4821: default: /* Make compiler happy. */
! 4822: case OP_CHAR: op_type = OP_STAR - OP_STAR; break;
! 4823: case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
! 4824: case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break;
! 4825: case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break;
! 4826: }
! 4827:
! 4828: /* Deal with UTF characters that take up more than one character. It's
1.1 misho 4829: easier to write this out separately than try to macrify it. Use c to
1.1.1.2 ! misho 4830: hold the length of the character in bytes, plus UTF_LENGTH to flag that
! 4831: it's a length rather than a small character. */
1.1 misho 4832:
1.1.1.2 ! misho 4833: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
! 4834: if (utf && NOT_FIRSTCHAR(code[-1]))
1.1 misho 4835: {
1.1.1.2 ! misho 4836: pcre_uchar *lastchar = code - 1;
! 4837: BACKCHAR(lastchar);
! 4838: c = (int)(code - lastchar); /* Length of UTF-8 character */
! 4839: memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
! 4840: c |= UTF_LENGTH; /* Flag c as a length */
1.1 misho 4841: }
4842: else
1.1.1.2 ! misho 4843: #endif /* SUPPORT_UTF */
1.1 misho 4844:
1.1.1.2 ! misho 4845: /* Handle the case of a single character - either with no UTF support, or
! 4846: with UTF disabled, or for a single character UTF character. */
1.1 misho 4847: {
4848: c = code[-1];
1.1.1.2 ! misho 4849: if (*previous <= OP_CHARI && repeat_min > 1)
! 4850: {
! 4851: reqchar = c;
! 4852: reqcharflags = req_caseopt | cd->req_varyopt;
! 4853: }
1.1 misho 4854: }
4855:
4856: /* If the repetition is unlimited, it pays to see if the next thing on
4857: the line is something that cannot possibly match this character. If so,
4858: automatically possessifying this item gains some performance in the case
4859: where the match fails. */
4860:
4861: if (!possessive_quantifier &&
4862: repeat_max < 0 &&
1.1.1.2 ! misho 4863: check_auto_possessive(previous, utf, ptr + 1, options, cd))
1.1 misho 4864: {
4865: repeat_type = 0; /* Force greedy */
4866: possessive_quantifier = TRUE;
4867: }
4868:
4869: goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
4870: }
4871:
4872: /* If previous was a character type match (\d or similar), abolish it and
4873: create a suitable repeat item. The code is shared with single-character
4874: repeats by setting op_type to add a suitable offset into repeat_type. Note
4875: that the Unicode property types will be present only when SUPPORT_UCP is
4876: defined, but we don't wrap the little bits of code here because it just
4877: makes it horribly messy. */
4878:
4879: else if (*previous < OP_EODN)
4880: {
1.1.1.2 ! misho 4881: pcre_uchar *oldcode;
1.1 misho 4882: int prop_type, prop_value;
4883: op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
4884: c = *previous;
4885:
4886: if (!possessive_quantifier &&
4887: repeat_max < 0 &&
1.1.1.2 ! misho 4888: check_auto_possessive(previous, utf, ptr + 1, options, cd))
1.1 misho 4889: {
4890: repeat_type = 0; /* Force greedy */
4891: possessive_quantifier = TRUE;
4892: }
4893:
4894: OUTPUT_SINGLE_REPEAT:
4895: if (*previous == OP_PROP || *previous == OP_NOTPROP)
4896: {
4897: prop_type = previous[1];
4898: prop_value = previous[2];
4899: }
4900: else prop_type = prop_value = -1;
4901:
4902: oldcode = code;
4903: code = previous; /* Usually overwrite previous item */
4904:
4905: /* If the maximum is zero then the minimum must also be zero; Perl allows
4906: this case, so we do too - by simply omitting the item altogether. */
4907:
4908: if (repeat_max == 0) goto END_REPEAT;
4909:
4910: /*--------------------------------------------------------------------*/
4911: /* This code is obsolete from release 8.00; the restriction was finally
4912: removed: */
4913:
4914: /* All real repeats make it impossible to handle partial matching (maybe
4915: one day we will be able to remove this restriction). */
4916:
4917: /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4918: /*--------------------------------------------------------------------*/
4919:
4920: /* Combine the op_type with the repeat_type */
4921:
4922: repeat_type += op_type;
4923:
4924: /* A minimum of zero is handled either as the special case * or ?, or as
4925: an UPTO, with the maximum given. */
4926:
4927: if (repeat_min == 0)
4928: {
4929: if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
4930: else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
4931: else
4932: {
4933: *code++ = OP_UPTO + repeat_type;
4934: PUT2INC(code, 0, repeat_max);
4935: }
4936: }
4937:
4938: /* A repeat minimum of 1 is optimized into some special cases. If the
4939: maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
4940: left in place and, if the maximum is greater than 1, we use OP_UPTO with
4941: one less than the maximum. */
4942:
4943: else if (repeat_min == 1)
4944: {
4945: if (repeat_max == -1)
4946: *code++ = OP_PLUS + repeat_type;
4947: else
4948: {
4949: code = oldcode; /* leave previous item in place */
4950: if (repeat_max == 1) goto END_REPEAT;
4951: *code++ = OP_UPTO + repeat_type;
4952: PUT2INC(code, 0, repeat_max - 1);
4953: }
4954: }
4955:
4956: /* The case {n,n} is just an EXACT, while the general case {n,m} is
4957: handled as an EXACT followed by an UPTO. */
4958:
4959: else
4960: {
4961: *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
4962: PUT2INC(code, 0, repeat_min);
4963:
4964: /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
4965: we have to insert the character for the previous code. For a repeated
4966: Unicode property match, there are two extra bytes that define the
4967: required property. In UTF-8 mode, long characters have their length in
1.1.1.2 ! misho 4968: c, with the UTF_LENGTH bit as a flag. */
1.1 misho 4969:
4970: if (repeat_max < 0)
4971: {
1.1.1.2 ! misho 4972: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
! 4973: if (utf && (c & UTF_LENGTH) != 0)
1.1 misho 4974: {
1.1.1.2 ! misho 4975: memcpy(code, utf_chars, IN_UCHARS(c & 7));
1.1 misho 4976: code += c & 7;
4977: }
4978: else
4979: #endif
4980: {
4981: *code++ = c;
4982: if (prop_type >= 0)
4983: {
4984: *code++ = prop_type;
4985: *code++ = prop_value;
4986: }
4987: }
4988: *code++ = OP_STAR + repeat_type;
4989: }
4990:
4991: /* Else insert an UPTO if the max is greater than the min, again
4992: preceded by the character, for the previously inserted code. If the
4993: UPTO is just for 1 instance, we can use QUERY instead. */
4994:
4995: else if (repeat_max != repeat_min)
4996: {
1.1.1.2 ! misho 4997: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
! 4998: if (utf && (c & UTF_LENGTH) != 0)
1.1 misho 4999: {
1.1.1.2 ! misho 5000: memcpy(code, utf_chars, IN_UCHARS(c & 7));
1.1 misho 5001: code += c & 7;
5002: }
5003: else
5004: #endif
5005: *code++ = c;
5006: if (prop_type >= 0)
5007: {
5008: *code++ = prop_type;
5009: *code++ = prop_value;
5010: }
5011: repeat_max -= repeat_min;
5012:
5013: if (repeat_max == 1)
5014: {
5015: *code++ = OP_QUERY + repeat_type;
5016: }
5017: else
5018: {
5019: *code++ = OP_UPTO + repeat_type;
5020: PUT2INC(code, 0, repeat_max);
5021: }
5022: }
5023: }
5024:
5025: /* The character or character type itself comes last in all cases. */
5026:
1.1.1.2 ! misho 5027: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
! 5028: if (utf && (c & UTF_LENGTH) != 0)
1.1 misho 5029: {
1.1.1.2 ! misho 5030: memcpy(code, utf_chars, IN_UCHARS(c & 7));
1.1 misho 5031: code += c & 7;
5032: }
5033: else
5034: #endif
5035: *code++ = c;
5036:
5037: /* For a repeated Unicode property match, there are two extra bytes that
5038: define the required property. */
5039:
5040: #ifdef SUPPORT_UCP
5041: if (prop_type >= 0)
5042: {
5043: *code++ = prop_type;
5044: *code++ = prop_value;
5045: }
5046: #endif
5047: }
5048:
5049: /* If previous was a character class or a back reference, we put the repeat
5050: stuff after it, but just skip the item if the repeat was {0,0}. */
5051:
5052: else if (*previous == OP_CLASS ||
5053: *previous == OP_NCLASS ||
1.1.1.2 ! misho 5054: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1.1 misho 5055: *previous == OP_XCLASS ||
5056: #endif
1.1.1.2 ! misho 5057: *previous == OP_REF ||
! 5058: *previous == OP_REFI)
1.1 misho 5059: {
5060: if (repeat_max == 0)
5061: {
5062: code = previous;
5063: goto END_REPEAT;
5064: }
5065:
5066: /*--------------------------------------------------------------------*/
5067: /* This code is obsolete from release 8.00; the restriction was finally
5068: removed: */
5069:
5070: /* All real repeats make it impossible to handle partial matching (maybe
5071: one day we will be able to remove this restriction). */
5072:
5073: /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
5074: /*--------------------------------------------------------------------*/
5075:
5076: if (repeat_min == 0 && repeat_max == -1)
5077: *code++ = OP_CRSTAR + repeat_type;
5078: else if (repeat_min == 1 && repeat_max == -1)
5079: *code++ = OP_CRPLUS + repeat_type;
5080: else if (repeat_min == 0 && repeat_max == 1)
5081: *code++ = OP_CRQUERY + repeat_type;
5082: else
5083: {
5084: *code++ = OP_CRRANGE + repeat_type;
5085: PUT2INC(code, 0, repeat_min);
5086: if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
5087: PUT2INC(code, 0, repeat_max);
5088: }
5089: }
5090:
5091: /* If previous was a bracket group, we may have to replicate it in certain
1.1.1.2 ! misho 5092: cases. Note that at this point we can encounter only the "basic" bracket
! 5093: opcodes such as BRA and CBRA, as this is the place where they get converted
! 5094: into the more special varieties such as BRAPOS and SBRA. A test for >=
! 5095: OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
! 5096: ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
! 5097: repetition of assertions, but now it does, for Perl compatibility. */
1.1 misho 5098:
1.1.1.2 ! misho 5099: else if (*previous >= OP_ASSERT && *previous <= OP_COND)
1.1 misho 5100: {
5101: register int i;
5102: int len = (int)(code - previous);
1.1.1.2 ! misho 5103: pcre_uchar *bralink = NULL;
! 5104: pcre_uchar *brazeroptr = NULL;
1.1 misho 5105:
1.1.1.2 ! misho 5106: /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
! 5107: we just ignore the repeat. */
1.1 misho 5108:
5109: if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
1.1.1.2 ! misho 5110: goto END_REPEAT;
1.1 misho 5111:
1.1.1.2 ! misho 5112: /* There is no sense in actually repeating assertions. The only potential
! 5113: use of repetition is in cases when the assertion is optional. Therefore,
! 5114: if the minimum is greater than zero, just ignore the repeat. If the
! 5115: maximum is not zero or one, set it to 1. */
! 5116:
! 5117: if (*previous < OP_ONCE) /* Assertion */
! 5118: {
! 5119: if (repeat_min > 0) goto END_REPEAT;
! 5120: if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
1.1 misho 5121: }
5122:
5123: /* The case of a zero minimum is special because of the need to stick
5124: OP_BRAZERO in front of it, and because the group appears once in the
5125: data, whereas in other cases it appears the minimum number of times. For
5126: this reason, it is simplest to treat this case separately, as otherwise
5127: the code gets far too messy. There are several special subcases when the
5128: minimum is zero. */
5129:
5130: if (repeat_min == 0)
5131: {
5132: /* If the maximum is also zero, we used to just omit the group from the
5133: output altogether, like this:
5134:
5135: ** if (repeat_max == 0)
5136: ** {
5137: ** code = previous;
5138: ** goto END_REPEAT;
5139: ** }
5140:
1.1.1.2 ! misho 5141: However, that fails when a group or a subgroup within it is referenced
! 5142: as a subroutine from elsewhere in the pattern, so now we stick in
! 5143: OP_SKIPZERO in front of it so that it is skipped on execution. As we
! 5144: don't have a list of which groups are referenced, we cannot do this
! 5145: selectively.
1.1 misho 5146:
5147: If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
5148: and do no more at this point. However, we do need to adjust any
5149: OP_RECURSE calls inside the group that refer to the group itself or any
5150: internal or forward referenced group, because the offset is from the
5151: start of the whole regex. Temporarily terminate the pattern while doing
5152: this. */
5153:
5154: if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
5155: {
5156: *code = OP_END;
1.1.1.2 ! misho 5157: adjust_recurse(previous, 1, utf, cd, save_hwm);
! 5158: memmove(previous + 1, previous, IN_UCHARS(len));
1.1 misho 5159: code++;
5160: if (repeat_max == 0)
5161: {
5162: *previous++ = OP_SKIPZERO;
5163: goto END_REPEAT;
5164: }
1.1.1.2 ! misho 5165: brazeroptr = previous; /* Save for possessive optimizing */
1.1 misho 5166: *previous++ = OP_BRAZERO + repeat_type;
5167: }
5168:
5169: /* If the maximum is greater than 1 and limited, we have to replicate
5170: in a nested fashion, sticking OP_BRAZERO before each set of brackets.
5171: The first one has to be handled carefully because it's the original
5172: copy, which has to be moved up. The remainder can be handled by code
5173: that is common with the non-zero minimum case below. We have to
5174: adjust the value of repeat_max, since one less copy is required. Once
5175: again, we may have to adjust any OP_RECURSE calls inside the group. */
5176:
5177: else
5178: {
5179: int offset;
5180: *code = OP_END;
1.1.1.2 ! misho 5181: adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
! 5182: memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
1.1 misho 5183: code += 2 + LINK_SIZE;
5184: *previous++ = OP_BRAZERO + repeat_type;
5185: *previous++ = OP_BRA;
5186:
5187: /* We chain together the bracket offset fields that have to be
5188: filled in later when the ends of the brackets are reached. */
5189:
5190: offset = (bralink == NULL)? 0 : (int)(previous - bralink);
5191: bralink = previous;
5192: PUTINC(previous, 0, offset);
5193: }
5194:
5195: repeat_max--;
5196: }
5197:
5198: /* If the minimum is greater than zero, replicate the group as many
5199: times as necessary, and adjust the maximum to the number of subsequent
5200: copies that we need. If we set a first char from the group, and didn't
5201: set a required char, copy the latter from the former. If there are any
5202: forward reference subroutine calls in the group, there will be entries on
5203: the workspace list; replicate these with an appropriate increment. */
5204:
5205: else
5206: {
5207: if (repeat_min > 1)
5208: {
5209: /* In the pre-compile phase, we don't actually do the replication. We
5210: just adjust the length as if we had. Do some paranoid checks for
5211: potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
5212: integer type when available, otherwise double. */
5213:
5214: if (lengthptr != NULL)
5215: {
5216: int delta = (repeat_min - 1)*length_prevgroup;
5217: if ((INT64_OR_DOUBLE)(repeat_min - 1)*
5218: (INT64_OR_DOUBLE)length_prevgroup >
5219: (INT64_OR_DOUBLE)INT_MAX ||
5220: OFLOW_MAX - *lengthptr < delta)
5221: {
5222: *errorcodeptr = ERR20;
5223: goto FAILED;
5224: }
5225: *lengthptr += delta;
5226: }
5227:
1.1.1.2 ! misho 5228: /* This is compiling for real. If there is a set first byte for
! 5229: the group, and we have not yet set a "required byte", set it. Make
! 5230: sure there is enough workspace for copying forward references before
! 5231: doing the copy. */
1.1 misho 5232:
5233: else
5234: {
1.1.1.2 ! misho 5235: if (groupsetfirstchar && reqcharflags < 0)
! 5236: {
! 5237: reqchar = firstchar;
! 5238: reqcharflags = firstcharflags;
! 5239: }
! 5240:
1.1 misho 5241: for (i = 1; i < repeat_min; i++)
5242: {
1.1.1.2 ! misho 5243: pcre_uchar *hc;
! 5244: pcre_uchar *this_hwm = cd->hwm;
! 5245: memcpy(code, previous, IN_UCHARS(len));
! 5246:
! 5247: while (cd->hwm > cd->start_workspace + cd->workspace_size -
! 5248: WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
! 5249: {
! 5250: int save_offset = save_hwm - cd->start_workspace;
! 5251: int this_offset = this_hwm - cd->start_workspace;
! 5252: *errorcodeptr = expand_workspace(cd);
! 5253: if (*errorcodeptr != 0) goto FAILED;
! 5254: save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
! 5255: this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
! 5256: }
! 5257:
1.1 misho 5258: for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5259: {
5260: PUT(cd->hwm, 0, GET(hc, 0) + len);
5261: cd->hwm += LINK_SIZE;
5262: }
5263: save_hwm = this_hwm;
5264: code += len;
5265: }
5266: }
5267: }
5268:
5269: if (repeat_max > 0) repeat_max -= repeat_min;
5270: }
5271:
5272: /* This code is common to both the zero and non-zero minimum cases. If
5273: the maximum is limited, it replicates the group in a nested fashion,
5274: remembering the bracket starts on a stack. In the case of a zero minimum,
5275: the first one was set up above. In all cases the repeat_max now specifies
5276: the number of additional copies needed. Again, we must remember to
5277: replicate entries on the forward reference list. */
5278:
5279: if (repeat_max >= 0)
5280: {
5281: /* In the pre-compile phase, we don't actually do the replication. We
5282: just adjust the length as if we had. For each repetition we must add 1
5283: to the length for BRAZERO and for all but the last repetition we must
5284: add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
5285: paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
5286: a 64-bit integer type when available, otherwise double. */
5287:
5288: if (lengthptr != NULL && repeat_max > 0)
5289: {
5290: int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
5291: 2 - 2*LINK_SIZE; /* Last one doesn't nest */
5292: if ((INT64_OR_DOUBLE)repeat_max *
5293: (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
5294: > (INT64_OR_DOUBLE)INT_MAX ||
5295: OFLOW_MAX - *lengthptr < delta)
5296: {
5297: *errorcodeptr = ERR20;
5298: goto FAILED;
5299: }
5300: *lengthptr += delta;
5301: }
5302:
5303: /* This is compiling for real */
5304:
5305: else for (i = repeat_max - 1; i >= 0; i--)
5306: {
1.1.1.2 ! misho 5307: pcre_uchar *hc;
! 5308: pcre_uchar *this_hwm = cd->hwm;
1.1 misho 5309:
5310: *code++ = OP_BRAZERO + repeat_type;
5311:
5312: /* All but the final copy start a new nesting, maintaining the
5313: chain of brackets outstanding. */
5314:
5315: if (i != 0)
5316: {
5317: int offset;
5318: *code++ = OP_BRA;
5319: offset = (bralink == NULL)? 0 : (int)(code - bralink);
5320: bralink = code;
5321: PUTINC(code, 0, offset);
5322: }
5323:
1.1.1.2 ! misho 5324: memcpy(code, previous, IN_UCHARS(len));
! 5325:
! 5326: /* Ensure there is enough workspace for forward references before
! 5327: copying them. */
! 5328:
! 5329: while (cd->hwm > cd->start_workspace + cd->workspace_size -
! 5330: WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
! 5331: {
! 5332: int save_offset = save_hwm - cd->start_workspace;
! 5333: int this_offset = this_hwm - cd->start_workspace;
! 5334: *errorcodeptr = expand_workspace(cd);
! 5335: if (*errorcodeptr != 0) goto FAILED;
! 5336: save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
! 5337: this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
! 5338: }
! 5339:
1.1 misho 5340: for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5341: {
5342: PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
5343: cd->hwm += LINK_SIZE;
5344: }
5345: save_hwm = this_hwm;
5346: code += len;
5347: }
5348:
5349: /* Now chain through the pending brackets, and fill in their length
5350: fields (which are holding the chain links pro tem). */
5351:
5352: while (bralink != NULL)
5353: {
5354: int oldlinkoffset;
5355: int offset = (int)(code - bralink + 1);
1.1.1.2 ! misho 5356: pcre_uchar *bra = code - offset;
1.1 misho 5357: oldlinkoffset = GET(bra, 1);
5358: bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5359: *code++ = OP_KET;
5360: PUTINC(code, 0, offset);
5361: PUT(bra, 1, offset);
5362: }
5363: }
5364:
1.1.1.2 ! misho 5365: /* If the maximum is unlimited, set a repeater in the final copy. For
! 5366: ONCE brackets, that's all we need to do. However, possessively repeated
! 5367: ONCE brackets can be converted into non-capturing brackets, as the
! 5368: behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
! 5369: deal with possessive ONCEs specially.
1.1 misho 5370:
1.1.1.2 ! misho 5371: Otherwise, when we are doing the actual compile phase, check to see
! 5372: whether this group is one that could match an empty string. If so,
1.1 misho 5373: convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
1.1.1.2 ! misho 5374: that runtime checking can be done. [This check is also applied to ONCE
! 5375: groups at runtime, but in a different way.]
! 5376:
! 5377: Then, if the quantifier was possessive and the bracket is not a
! 5378: conditional, we convert the BRA code to the POS form, and the KET code to
! 5379: KETRPOS. (It turns out to be convenient at runtime to detect this kind of
! 5380: subpattern at both the start and at the end.) The use of special opcodes
! 5381: makes it possible to reduce greatly the stack usage in pcre_exec(). If
! 5382: the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
! 5383:
! 5384: Then, if the minimum number of matches is 1 or 0, cancel the possessive
! 5385: flag so that the default action below, of wrapping everything inside
! 5386: atomic brackets, does not happen. When the minimum is greater than 1,
! 5387: there will be earlier copies of the group, and so we still have to wrap
! 5388: the whole thing. */
1.1 misho 5389:
5390: else
5391: {
1.1.1.2 ! misho 5392: pcre_uchar *ketcode = code - 1 - LINK_SIZE;
! 5393: pcre_uchar *bracode = ketcode - GET(ketcode, 1);
! 5394:
! 5395: /* Convert possessive ONCE brackets to non-capturing */
! 5396:
! 5397: if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
! 5398: possessive_quantifier) *bracode = OP_BRA;
! 5399:
! 5400: /* For non-possessive ONCE brackets, all we need to do is to
! 5401: set the KET. */
! 5402:
! 5403: if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
! 5404: *ketcode = OP_KETRMAX + repeat_type;
! 5405:
! 5406: /* Handle non-ONCE brackets and possessive ONCEs (which have been
! 5407: converted to non-capturing above). */
! 5408:
! 5409: else
1.1 misho 5410: {
1.1.1.2 ! misho 5411: /* In the compile phase, check for empty string matching. */
! 5412:
! 5413: if (lengthptr == NULL)
1.1 misho 5414: {
1.1.1.2 ! misho 5415: pcre_uchar *scode = bracode;
! 5416: do
1.1 misho 5417: {
1.1.1.2 ! misho 5418: if (could_be_empty_branch(scode, ketcode, utf, cd))
! 5419: {
! 5420: *bracode += OP_SBRA - OP_BRA;
! 5421: break;
! 5422: }
! 5423: scode += GET(scode, 1);
! 5424: }
! 5425: while (*scode == OP_ALT);
! 5426: }
! 5427:
! 5428: /* Handle possessive quantifiers. */
! 5429:
! 5430: if (possessive_quantifier)
! 5431: {
! 5432: /* For COND brackets, we wrap the whole thing in a possessively
! 5433: repeated non-capturing bracket, because we have not invented POS
! 5434: versions of the COND opcodes. Because we are moving code along, we
! 5435: must ensure that any pending recursive references are updated. */
! 5436:
! 5437: if (*bracode == OP_COND || *bracode == OP_SCOND)
! 5438: {
! 5439: int nlen = (int)(code - bracode);
! 5440: *code = OP_END;
! 5441: adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
! 5442: memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
! 5443: code += 1 + LINK_SIZE;
! 5444: nlen += 1 + LINK_SIZE;
! 5445: *bracode = OP_BRAPOS;
! 5446: *code++ = OP_KETRPOS;
! 5447: PUTINC(code, 0, nlen);
! 5448: PUT(bracode, 1, nlen);
! 5449: }
! 5450:
! 5451: /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
! 5452:
! 5453: else
! 5454: {
! 5455: *bracode += 1; /* Switch to xxxPOS opcodes */
! 5456: *ketcode = OP_KETRPOS;
1.1 misho 5457: }
1.1.1.2 ! misho 5458:
! 5459: /* If the minimum is zero, mark it as possessive, then unset the
! 5460: possessive flag when the minimum is 0 or 1. */
! 5461:
! 5462: if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
! 5463: if (repeat_min < 2) possessive_quantifier = FALSE;
1.1 misho 5464: }
1.1.1.2 ! misho 5465:
! 5466: /* Non-possessive quantifier */
! 5467:
! 5468: else *ketcode = OP_KETRMAX + repeat_type;
1.1 misho 5469: }
5470: }
5471: }
5472:
5473: /* If previous is OP_FAIL, it was generated by an empty class [] in
5474: JavaScript mode. The other ways in which OP_FAIL can be generated, that is
5475: by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
5476: error above. We can just ignore the repeat in JS case. */
5477:
5478: else if (*previous == OP_FAIL) goto END_REPEAT;
5479:
5480: /* Else there's some kind of shambles */
5481:
5482: else
5483: {
5484: *errorcodeptr = ERR11;
5485: goto FAILED;
5486: }
5487:
5488: /* If the character following a repeat is '+', or if certain optimization
1.1.1.2 ! misho 5489: tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
! 5490: there are special alternative opcodes for this case. For anything else, we
! 5491: wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
! 5492: notation is just syntactic sugar, taken from Sun's Java package, but the
! 5493: special opcodes can optimize it.
! 5494:
! 5495: Some (but not all) possessively repeated subpatterns have already been
! 5496: completely handled in the code just above. For them, possessive_quantifier
! 5497: is always FALSE at this stage.
! 5498:
! 5499: Note that the repeated item starts at tempcode, not at previous, which
! 5500: might be the first part of a string whose (former) last char we repeated.
1.1 misho 5501:
5502: Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
5503: an 'upto' may follow. We skip over an 'exact' item, and then test the
5504: length of what remains before proceeding. */
5505:
5506: if (possessive_quantifier)
5507: {
5508: int len;
5509:
5510: if (*tempcode == OP_TYPEEXACT)
1.1.1.2 ! misho 5511: tempcode += PRIV(OP_lengths)[*tempcode] +
! 5512: ((tempcode[1 + IMM2_SIZE] == OP_PROP
! 5513: || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
1.1 misho 5514:
5515: else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
5516: {
1.1.1.2 ! misho 5517: tempcode += PRIV(OP_lengths)[*tempcode];
! 5518: #ifdef SUPPORT_UTF
! 5519: if (utf && HAS_EXTRALEN(tempcode[-1]))
! 5520: tempcode += GET_EXTRALEN(tempcode[-1]);
1.1 misho 5521: #endif
5522: }
5523:
5524: len = (int)(code - tempcode);
5525: if (len > 0) switch (*tempcode)
5526: {
5527: case OP_STAR: *tempcode = OP_POSSTAR; break;
5528: case OP_PLUS: *tempcode = OP_POSPLUS; break;
5529: case OP_QUERY: *tempcode = OP_POSQUERY; break;
5530: case OP_UPTO: *tempcode = OP_POSUPTO; break;
5531:
1.1.1.2 ! misho 5532: case OP_STARI: *tempcode = OP_POSSTARI; break;
! 5533: case OP_PLUSI: *tempcode = OP_POSPLUSI; break;
! 5534: case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
! 5535: case OP_UPTOI: *tempcode = OP_POSUPTOI; break;
1.1 misho 5536:
5537: case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
5538: case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
5539: case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
5540: case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
5541:
1.1.1.2 ! misho 5542: case OP_NOTSTARI: *tempcode = OP_NOTPOSSTARI; break;
! 5543: case OP_NOTPLUSI: *tempcode = OP_NOTPOSPLUSI; break;
! 5544: case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
! 5545: case OP_NOTUPTOI: *tempcode = OP_NOTPOSUPTOI; break;
! 5546:
! 5547: case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
! 5548: case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
! 5549: case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
! 5550: case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
! 5551:
1.1 misho 5552: /* Because we are moving code along, we must ensure that any
5553: pending recursive references are updated. */
5554:
5555: default:
5556: *code = OP_END;
1.1.1.2 ! misho 5557: adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
! 5558: memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
1.1 misho 5559: code += 1 + LINK_SIZE;
5560: len += 1 + LINK_SIZE;
5561: tempcode[0] = OP_ONCE;
5562: *code++ = OP_KET;
5563: PUTINC(code, 0, len);
5564: PUT(tempcode, 1, len);
5565: break;
5566: }
5567: }
5568:
5569: /* In all cases we no longer have a previous item. We also set the
1.1.1.2 ! misho 5570: "follows varying string" flag for subsequently encountered reqchars if
1.1 misho 5571: it isn't already set and we have just passed a varying length item. */
5572:
5573: END_REPEAT:
5574: previous = NULL;
5575: cd->req_varyopt |= reqvary;
5576: break;
5577:
5578:
5579: /* ===================================================================*/
5580: /* Start of nested parenthesized sub-expression, or comment or lookahead or
5581: lookbehind or option setting or condition or all the other extended
5582: parenthesis forms. */
5583:
5584: case CHAR_LEFT_PARENTHESIS:
5585: newoptions = options;
5586: skipbytes = 0;
5587: bravalue = OP_CBRA;
5588: save_hwm = cd->hwm;
5589: reset_bracount = FALSE;
5590:
5591: /* First deal with various "verbs" that can be introduced by '*'. */
5592:
1.1.1.2 ! misho 5593: ptr++;
! 5594: if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
! 5595: || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
1.1 misho 5596: {
5597: int i, namelen;
5598: int arglen = 0;
5599: const char *vn = verbnames;
1.1.1.2 ! misho 5600: const pcre_uchar *name = ptr + 1;
! 5601: const pcre_uchar *arg = NULL;
1.1 misho 5602: previous = NULL;
1.1.1.2 ! misho 5603: ptr++;
! 5604: while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1.1 misho 5605: namelen = (int)(ptr - name);
5606:
1.1.1.2 ! misho 5607: /* It appears that Perl allows any characters whatsoever, other than
! 5608: a closing parenthesis, to appear in arguments, so we no longer insist on
! 5609: letters, digits, and underscores. */
! 5610:
1.1 misho 5611: if (*ptr == CHAR_COLON)
5612: {
5613: arg = ++ptr;
1.1.1.2 ! misho 5614: while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1.1 misho 5615: arglen = (int)(ptr - arg);
1.1.1.2 ! misho 5616: if ((unsigned int)arglen > MAX_MARK)
! 5617: {
! 5618: *errorcodeptr = ERR75;
! 5619: goto FAILED;
! 5620: }
1.1 misho 5621: }
5622:
5623: if (*ptr != CHAR_RIGHT_PARENTHESIS)
5624: {
5625: *errorcodeptr = ERR60;
5626: goto FAILED;
5627: }
5628:
5629: /* Scan the table of verb names */
5630:
5631: for (i = 0; i < verbcount; i++)
5632: {
5633: if (namelen == verbs[i].len &&
1.1.1.2 ! misho 5634: STRNCMP_UC_C8(name, vn, namelen) == 0)
1.1 misho 5635: {
1.1.1.2 ! misho 5636: int setverb;
! 5637:
! 5638: /* Check for open captures before ACCEPT and convert it to
! 5639: ASSERT_ACCEPT if in an assertion. */
1.1 misho 5640:
5641: if (verbs[i].op == OP_ACCEPT)
5642: {
5643: open_capitem *oc;
1.1.1.2 ! misho 5644: if (arglen != 0)
! 5645: {
! 5646: *errorcodeptr = ERR59;
! 5647: goto FAILED;
! 5648: }
1.1 misho 5649: cd->had_accept = TRUE;
5650: for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5651: {
5652: *code++ = OP_CLOSE;
5653: PUT2INC(code, 0, oc->number);
5654: }
1.1.1.2 ! misho 5655: setverb = *code++ =
! 5656: (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
! 5657:
! 5658: /* Do not set firstchar after *ACCEPT */
! 5659: if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
1.1 misho 5660: }
5661:
1.1.1.2 ! misho 5662: /* Handle other cases with/without an argument */
1.1 misho 5663:
1.1.1.2 ! misho 5664: else if (arglen == 0)
1.1 misho 5665: {
5666: if (verbs[i].op < 0) /* Argument is mandatory */
5667: {
5668: *errorcodeptr = ERR66;
5669: goto FAILED;
5670: }
1.1.1.2 ! misho 5671: setverb = *code++ = verbs[i].op;
1.1 misho 5672: }
5673:
5674: else
5675: {
5676: if (verbs[i].op_arg < 0) /* Argument is forbidden */
5677: {
5678: *errorcodeptr = ERR59;
5679: goto FAILED;
5680: }
1.1.1.2 ! misho 5681: setverb = *code++ = verbs[i].op_arg;
1.1 misho 5682: *code++ = arglen;
1.1.1.2 ! misho 5683: memcpy(code, arg, IN_UCHARS(arglen));
1.1 misho 5684: code += arglen;
5685: *code++ = 0;
5686: }
5687:
1.1.1.2 ! misho 5688: switch (setverb)
! 5689: {
! 5690: case OP_THEN:
! 5691: case OP_THEN_ARG:
! 5692: cd->external_flags |= PCRE_HASTHEN;
! 5693: break;
! 5694:
! 5695: case OP_PRUNE:
! 5696: case OP_PRUNE_ARG:
! 5697: case OP_SKIP:
! 5698: case OP_SKIP_ARG:
! 5699: cd->had_pruneorskip = TRUE;
! 5700: break;
! 5701: }
! 5702:
1.1 misho 5703: break; /* Found verb, exit loop */
5704: }
5705:
5706: vn += verbs[i].len + 1;
5707: }
5708:
5709: if (i < verbcount) continue; /* Successfully handled a verb */
5710: *errorcodeptr = ERR60; /* Verb not recognized */
5711: goto FAILED;
5712: }
5713:
5714: /* Deal with the extended parentheses; all are introduced by '?', and the
5715: appearance of any of them means that this is not a capturing group. */
5716:
5717: else if (*ptr == CHAR_QUESTION_MARK)
5718: {
5719: int i, set, unset, namelen;
5720: int *optset;
1.1.1.2 ! misho 5721: const pcre_uchar *name;
! 5722: pcre_uchar *slot;
1.1 misho 5723:
5724: switch (*(++ptr))
5725: {
5726: case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
5727: ptr++;
1.1.1.2 ! misho 5728: while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
! 5729: if (*ptr == CHAR_NULL)
1.1 misho 5730: {
5731: *errorcodeptr = ERR18;
5732: goto FAILED;
5733: }
5734: continue;
5735:
5736:
5737: /* ------------------------------------------------------------ */
5738: case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
5739: reset_bracount = TRUE;
5740: /* Fall through */
5741:
5742: /* ------------------------------------------------------------ */
5743: case CHAR_COLON: /* Non-capturing bracket */
5744: bravalue = OP_BRA;
5745: ptr++;
5746: break;
5747:
5748:
5749: /* ------------------------------------------------------------ */
5750: case CHAR_LEFT_PARENTHESIS:
5751: bravalue = OP_COND; /* Conditional group */
5752:
5753: /* A condition can be an assertion, a number (referring to a numbered
5754: group), a name (referring to a named group), or 'R', referring to
5755: recursion. R<digits> and R&name are also permitted for recursion tests.
5756:
5757: There are several syntaxes for testing a named group: (?(name)) is used
5758: by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
5759:
5760: There are two unfortunate ambiguities, caused by history. (a) 'R' can
5761: be the recursive thing or the name 'R' (and similarly for 'R' followed
5762: by digits), and (b) a number could be a name that consists of digits.
5763: In both cases, we look for a name first; if not found, we try the other
5764: cases. */
5765:
5766: /* For conditions that are assertions, check the syntax, and then exit
5767: the switch. This will take control down to where bracketed groups,
5768: including assertions, are processed. */
5769:
5770: if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
5771: ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
5772: break;
5773:
5774: /* Most other conditions use OP_CREF (a couple change to OP_RREF
1.1.1.2 ! misho 5775: below), and all need to skip 1+IMM2_SIZE bytes at the start of the group. */
1.1 misho 5776:
5777: code[1+LINK_SIZE] = OP_CREF;
1.1.1.2 ! misho 5778: skipbytes = 1+IMM2_SIZE;
1.1 misho 5779: refsign = -1;
5780:
5781: /* Check for a test for recursion in a named group. */
5782:
5783: if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
5784: {
5785: terminator = -1;
5786: ptr += 2;
5787: code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
5788: }
5789:
5790: /* Check for a test for a named group's having been set, using the Perl
5791: syntax (?(<name>) or (?('name') */
5792:
5793: else if (ptr[1] == CHAR_LESS_THAN_SIGN)
5794: {
5795: terminator = CHAR_GREATER_THAN_SIGN;
5796: ptr++;
5797: }
5798: else if (ptr[1] == CHAR_APOSTROPHE)
5799: {
5800: terminator = CHAR_APOSTROPHE;
5801: ptr++;
5802: }
5803: else
5804: {
1.1.1.2 ! misho 5805: terminator = CHAR_NULL;
1.1 misho 5806: if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
5807: }
5808:
5809: /* We now expect to read a name; anything else is an error */
5810:
1.1.1.2 ! misho 5811: if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
1.1 misho 5812: {
5813: ptr += 1; /* To get the right offset */
5814: *errorcodeptr = ERR28;
5815: goto FAILED;
5816: }
5817:
5818: /* Read the name, but also get it as a number if it's all digits */
5819:
5820: recno = 0;
5821: name = ++ptr;
1.1.1.2 ! misho 5822: while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
1.1 misho 5823: {
5824: if (recno >= 0)
1.1.1.2 ! misho 5825: recno = (IS_DIGIT(*ptr))? recno * 10 + (int)(*ptr - CHAR_0) : -1;
1.1 misho 5826: ptr++;
5827: }
5828: namelen = (int)(ptr - name);
5829:
1.1.1.2 ! misho 5830: if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
1.1 misho 5831: *ptr++ != CHAR_RIGHT_PARENTHESIS)
5832: {
5833: ptr--; /* Error offset */
5834: *errorcodeptr = ERR26;
5835: goto FAILED;
5836: }
5837:
5838: /* Do no further checking in the pre-compile phase. */
5839:
5840: if (lengthptr != NULL) break;
5841:
5842: /* In the real compile we do the work of looking for the actual
5843: reference. If the string started with "+" or "-" we require the rest to
5844: be digits, in which case recno will be set. */
5845:
5846: if (refsign > 0)
5847: {
5848: if (recno <= 0)
5849: {
5850: *errorcodeptr = ERR58;
5851: goto FAILED;
5852: }
5853: recno = (refsign == CHAR_MINUS)?
5854: cd->bracount - recno + 1 : recno +cd->bracount;
5855: if (recno <= 0 || recno > cd->final_bracount)
5856: {
5857: *errorcodeptr = ERR15;
5858: goto FAILED;
5859: }
5860: PUT2(code, 2+LINK_SIZE, recno);
5861: break;
5862: }
5863:
5864: /* Otherwise (did not start with "+" or "-"), start by looking for the
5865: name. If we find a name, add one to the opcode to change OP_CREF or
5866: OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
5867: except they record that the reference was originally to a name. The
5868: information is used to check duplicate names. */
5869:
5870: slot = cd->name_table;
5871: for (i = 0; i < cd->names_found; i++)
5872: {
1.1.1.2 ! misho 5873: if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
1.1 misho 5874: slot += cd->name_entry_size;
5875: }
5876:
5877: /* Found a previous named subpattern */
5878:
5879: if (i < cd->names_found)
5880: {
5881: recno = GET2(slot, 0);
5882: PUT2(code, 2+LINK_SIZE, recno);
5883: code[1+LINK_SIZE]++;
5884: }
5885:
5886: /* Search the pattern for a forward reference */
5887:
5888: else if ((i = find_parens(cd, name, namelen,
1.1.1.2 ! misho 5889: (options & PCRE_EXTENDED) != 0, utf)) > 0)
1.1 misho 5890: {
5891: PUT2(code, 2+LINK_SIZE, i);
5892: code[1+LINK_SIZE]++;
5893: }
5894:
1.1.1.2 ! misho 5895: /* If terminator == CHAR_NULL it means that the name followed directly
! 5896: after the opening parenthesis [e.g. (?(abc)...] and in this case there
! 5897: are some further alternatives to try. For the cases where terminator !=
! 5898: 0 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
1.1 misho 5899: now checked all the possibilities, so give an error. */
5900:
1.1.1.2 ! misho 5901: else if (terminator != CHAR_NULL)
1.1 misho 5902: {
5903: *errorcodeptr = ERR15;
5904: goto FAILED;
5905: }
5906:
5907: /* Check for (?(R) for recursion. Allow digits after R to specify a
5908: specific group number. */
5909:
5910: else if (*name == CHAR_R)
5911: {
5912: recno = 0;
5913: for (i = 1; i < namelen; i++)
5914: {
1.1.1.2 ! misho 5915: if (!IS_DIGIT(name[i]))
1.1 misho 5916: {
5917: *errorcodeptr = ERR15;
5918: goto FAILED;
5919: }
5920: recno = recno * 10 + name[i] - CHAR_0;
5921: }
5922: if (recno == 0) recno = RREF_ANY;
5923: code[1+LINK_SIZE] = OP_RREF; /* Change test type */
5924: PUT2(code, 2+LINK_SIZE, recno);
5925: }
5926:
5927: /* Similarly, check for the (?(DEFINE) "condition", which is always
5928: false. */
5929:
1.1.1.2 ! misho 5930: else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
1.1 misho 5931: {
5932: code[1+LINK_SIZE] = OP_DEF;
5933: skipbytes = 1;
5934: }
5935:
5936: /* Check for the "name" actually being a subpattern number. We are
5937: in the second pass here, so final_bracount is set. */
5938:
5939: else if (recno > 0 && recno <= cd->final_bracount)
5940: {
5941: PUT2(code, 2+LINK_SIZE, recno);
5942: }
5943:
5944: /* Either an unidentified subpattern, or a reference to (?(0) */
5945:
5946: else
5947: {
5948: *errorcodeptr = (recno == 0)? ERR35: ERR15;
5949: goto FAILED;
5950: }
5951: break;
5952:
5953:
5954: /* ------------------------------------------------------------ */
5955: case CHAR_EQUALS_SIGN: /* Positive lookahead */
5956: bravalue = OP_ASSERT;
1.1.1.2 ! misho 5957: cd->assert_depth += 1;
1.1 misho 5958: ptr++;
5959: break;
5960:
5961:
5962: /* ------------------------------------------------------------ */
5963: case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
5964: ptr++;
5965: if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */
5966: {
5967: *code++ = OP_FAIL;
5968: previous = NULL;
5969: continue;
5970: }
5971: bravalue = OP_ASSERT_NOT;
1.1.1.2 ! misho 5972: cd->assert_depth += 1;
1.1 misho 5973: break;
5974:
5975:
5976: /* ------------------------------------------------------------ */
5977: case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
5978: switch (ptr[1])
5979: {
5980: case CHAR_EQUALS_SIGN: /* Positive lookbehind */
5981: bravalue = OP_ASSERTBACK;
1.1.1.2 ! misho 5982: cd->assert_depth += 1;
1.1 misho 5983: ptr += 2;
5984: break;
5985:
5986: case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
5987: bravalue = OP_ASSERTBACK_NOT;
1.1.1.2 ! misho 5988: cd->assert_depth += 1;
1.1 misho 5989: ptr += 2;
5990: break;
5991:
5992: default: /* Could be name define, else bad */
1.1.1.2 ! misho 5993: if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
! 5994: goto DEFINE_NAME;
1.1 misho 5995: ptr++; /* Correct offset for error */
5996: *errorcodeptr = ERR24;
5997: goto FAILED;
5998: }
5999: break;
6000:
6001:
6002: /* ------------------------------------------------------------ */
6003: case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
6004: bravalue = OP_ONCE;
6005: ptr++;
6006: break;
6007:
6008:
6009: /* ------------------------------------------------------------ */
6010: case CHAR_C: /* Callout - may be followed by digits; */
1.1.1.2 ! misho 6011: previous_callout = code; /* Save for later completion */
! 6012: after_manual_callout = 1; /* Skip one item before completing */
1.1 misho 6013: *code++ = OP_CALLOUT;
6014: {
6015: int n = 0;
1.1.1.2 ! misho 6016: ptr++;
! 6017: while(IS_DIGIT(*ptr))
! 6018: n = n * 10 + *ptr++ - CHAR_0;
1.1 misho 6019: if (*ptr != CHAR_RIGHT_PARENTHESIS)
6020: {
6021: *errorcodeptr = ERR39;
6022: goto FAILED;
6023: }
6024: if (n > 255)
6025: {
6026: *errorcodeptr = ERR38;
6027: goto FAILED;
6028: }
6029: *code++ = n;
6030: PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
6031: PUT(code, LINK_SIZE, 0); /* Default length */
6032: code += 2 * LINK_SIZE;
6033: }
6034: previous = NULL;
6035: continue;
6036:
6037:
6038: /* ------------------------------------------------------------ */
6039: case CHAR_P: /* Python-style named subpattern handling */
6040: if (*(++ptr) == CHAR_EQUALS_SIGN ||
6041: *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
6042: {
6043: is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
6044: terminator = CHAR_RIGHT_PARENTHESIS;
6045: goto NAMED_REF_OR_RECURSE;
6046: }
6047: else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
6048: {
6049: *errorcodeptr = ERR41;
6050: goto FAILED;
6051: }
6052: /* Fall through to handle (?P< as (?< is handled */
6053:
6054:
6055: /* ------------------------------------------------------------ */
6056: DEFINE_NAME: /* Come here from (?< handling */
6057: case CHAR_APOSTROPHE:
6058: {
6059: terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
6060: CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6061: name = ++ptr;
6062:
1.1.1.2 ! misho 6063: while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
1.1 misho 6064: namelen = (int)(ptr - name);
6065:
6066: /* In the pre-compile phase, just do a syntax check. */
6067:
6068: if (lengthptr != NULL)
6069: {
1.1.1.2 ! misho 6070: if (*ptr != (pcre_uchar)terminator)
1.1 misho 6071: {
6072: *errorcodeptr = ERR42;
6073: goto FAILED;
6074: }
6075: if (cd->names_found >= MAX_NAME_COUNT)
6076: {
6077: *errorcodeptr = ERR49;
6078: goto FAILED;
6079: }
1.1.1.2 ! misho 6080: if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
1.1 misho 6081: {
1.1.1.2 ! misho 6082: cd->name_entry_size = namelen + IMM2_SIZE + 1;
1.1 misho 6083: if (namelen > MAX_NAME_SIZE)
6084: {
6085: *errorcodeptr = ERR48;
6086: goto FAILED;
6087: }
6088: }
6089: }
6090:
6091: /* In the real compile, create the entry in the table, maintaining
6092: alphabetical order. Duplicate names for different numbers are
6093: permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
6094: number are always OK. (An existing number can be re-used if (?|
6095: appears in the pattern.) In either event, a duplicate name results in
6096: a duplicate entry in the table, even if the number is the same. This
6097: is because the number of names, and hence the table size, is computed
6098: in the pre-compile, and it affects various numbers and pointers which
6099: would all have to be modified, and the compiled code moved down, if
6100: duplicates with the same number were omitted from the table. This
6101: doesn't seem worth the hassle. However, *different* names for the
6102: same number are not permitted. */
6103:
6104: else
6105: {
6106: BOOL dupname = FALSE;
6107: slot = cd->name_table;
6108:
6109: for (i = 0; i < cd->names_found; i++)
6110: {
1.1.1.2 ! misho 6111: int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(namelen));
1.1 misho 6112: if (crc == 0)
6113: {
1.1.1.2 ! misho 6114: if (slot[IMM2_SIZE+namelen] == 0)
1.1 misho 6115: {
6116: if (GET2(slot, 0) != cd->bracount + 1 &&
6117: (options & PCRE_DUPNAMES) == 0)
6118: {
6119: *errorcodeptr = ERR43;
6120: goto FAILED;
6121: }
6122: else dupname = TRUE;
6123: }
6124: else crc = -1; /* Current name is a substring */
6125: }
6126:
6127: /* Make space in the table and break the loop for an earlier
6128: name. For a duplicate or later name, carry on. We do this for
6129: duplicates so that in the simple case (when (?| is not used) they
6130: are in order of their numbers. */
6131:
6132: if (crc < 0)
6133: {
6134: memmove(slot + cd->name_entry_size, slot,
1.1.1.2 ! misho 6135: IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
1.1 misho 6136: break;
6137: }
6138:
6139: /* Continue the loop for a later or duplicate name */
6140:
6141: slot += cd->name_entry_size;
6142: }
6143:
6144: /* For non-duplicate names, check for a duplicate number before
6145: adding the new name. */
6146:
6147: if (!dupname)
6148: {
1.1.1.2 ! misho 6149: pcre_uchar *cslot = cd->name_table;
1.1 misho 6150: for (i = 0; i < cd->names_found; i++)
6151: {
6152: if (cslot != slot)
6153: {
6154: if (GET2(cslot, 0) == cd->bracount + 1)
6155: {
6156: *errorcodeptr = ERR65;
6157: goto FAILED;
6158: }
6159: }
6160: else i--;
6161: cslot += cd->name_entry_size;
6162: }
6163: }
6164:
6165: PUT2(slot, 0, cd->bracount + 1);
1.1.1.2 ! misho 6166: memcpy(slot + IMM2_SIZE, name, IN_UCHARS(namelen));
! 6167: slot[IMM2_SIZE + namelen] = 0;
1.1 misho 6168: }
6169: }
6170:
6171: /* In both pre-compile and compile, count the number of names we've
6172: encountered. */
6173:
6174: cd->names_found++;
6175: ptr++; /* Move past > or ' */
6176: goto NUMBERED_GROUP;
6177:
6178:
6179: /* ------------------------------------------------------------ */
6180: case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
6181: terminator = CHAR_RIGHT_PARENTHESIS;
6182: is_recurse = TRUE;
6183: /* Fall through */
6184:
6185: /* We come here from the Python syntax above that handles both
6186: references (?P=name) and recursion (?P>name), as well as falling
6187: through from the Perl recursion syntax (?&name). We also come here from
6188: the Perl \k<name> or \k'name' back reference syntax and the \k{name}
6189: .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
6190:
6191: NAMED_REF_OR_RECURSE:
6192: name = ++ptr;
1.1.1.2 ! misho 6193: while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
1.1 misho 6194: namelen = (int)(ptr - name);
6195:
6196: /* In the pre-compile phase, do a syntax check. We used to just set
6197: a dummy reference number, because it was not used in the first pass.
6198: However, with the change of recursive back references to be atomic,
6199: we have to look for the number so that this state can be identified, as
6200: otherwise the incorrect length is computed. If it's not a backwards
6201: reference, the dummy number will do. */
6202:
6203: if (lengthptr != NULL)
6204: {
1.1.1.2 ! misho 6205: const pcre_uchar *temp;
1.1 misho 6206:
6207: if (namelen == 0)
6208: {
6209: *errorcodeptr = ERR62;
6210: goto FAILED;
6211: }
1.1.1.2 ! misho 6212: if (*ptr != (pcre_uchar)terminator)
1.1 misho 6213: {
6214: *errorcodeptr = ERR42;
6215: goto FAILED;
6216: }
6217: if (namelen > MAX_NAME_SIZE)
6218: {
6219: *errorcodeptr = ERR48;
6220: goto FAILED;
6221: }
6222:
6223: /* The name table does not exist in the first pass, so we cannot
6224: do a simple search as in the code below. Instead, we have to scan the
6225: pattern to find the number. It is important that we scan it only as
6226: far as we have got because the syntax of named subpatterns has not
6227: been checked for the rest of the pattern, and find_parens() assumes
6228: correct syntax. In any case, it's a waste of resources to scan
6229: further. We stop the scan at the current point by temporarily
6230: adjusting the value of cd->end_pattern. */
6231:
6232: temp = cd->end_pattern;
6233: cd->end_pattern = ptr;
6234: recno = find_parens(cd, name, namelen,
1.1.1.2 ! misho 6235: (options & PCRE_EXTENDED) != 0, utf);
1.1 misho 6236: cd->end_pattern = temp;
6237: if (recno < 0) recno = 0; /* Forward ref; set dummy number */
6238: }
6239:
6240: /* In the real compile, seek the name in the table. We check the name
6241: first, and then check that we have reached the end of the name in the
6242: table. That way, if the name is longer than any in the table,
6243: the comparison will fail without reading beyond the table entry. */
6244:
6245: else
6246: {
6247: slot = cd->name_table;
6248: for (i = 0; i < cd->names_found; i++)
6249: {
1.1.1.2 ! misho 6250: if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
! 6251: slot[IMM2_SIZE+namelen] == 0)
1.1 misho 6252: break;
6253: slot += cd->name_entry_size;
6254: }
6255:
6256: if (i < cd->names_found) /* Back reference */
6257: {
6258: recno = GET2(slot, 0);
6259: }
6260: else if ((recno = /* Forward back reference */
6261: find_parens(cd, name, namelen,
1.1.1.2 ! misho 6262: (options & PCRE_EXTENDED) != 0, utf)) <= 0)
1.1 misho 6263: {
6264: *errorcodeptr = ERR15;
6265: goto FAILED;
6266: }
6267: }
6268:
6269: /* In both phases, we can now go to the code that handles numerical
6270: recursion or backreferences. */
6271:
6272: if (is_recurse) goto HANDLE_RECURSION;
6273: else goto HANDLE_REFERENCE;
6274:
6275:
6276: /* ------------------------------------------------------------ */
6277: case CHAR_R: /* Recursion */
6278: ptr++; /* Same as (?0) */
6279: /* Fall through */
6280:
6281:
6282: /* ------------------------------------------------------------ */
6283: case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
6284: case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
6285: case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
6286: {
1.1.1.2 ! misho 6287: const pcre_uchar *called;
1.1 misho 6288: terminator = CHAR_RIGHT_PARENTHESIS;
6289:
6290: /* Come here from the \g<...> and \g'...' code (Oniguruma
6291: compatibility). However, the syntax has been checked to ensure that
6292: the ... are a (signed) number, so that neither ERR63 nor ERR29 will
6293: be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
6294: ever be taken. */
6295:
6296: HANDLE_NUMERICAL_RECURSION:
6297:
6298: if ((refsign = *ptr) == CHAR_PLUS)
6299: {
6300: ptr++;
1.1.1.2 ! misho 6301: if (!IS_DIGIT(*ptr))
1.1 misho 6302: {
6303: *errorcodeptr = ERR63;
6304: goto FAILED;
6305: }
6306: }
6307: else if (refsign == CHAR_MINUS)
6308: {
1.1.1.2 ! misho 6309: if (!IS_DIGIT(ptr[1]))
1.1 misho 6310: goto OTHER_CHAR_AFTER_QUERY;
6311: ptr++;
6312: }
6313:
6314: recno = 0;
1.1.1.2 ! misho 6315: while(IS_DIGIT(*ptr))
1.1 misho 6316: recno = recno * 10 + *ptr++ - CHAR_0;
6317:
1.1.1.2 ! misho 6318: if (*ptr != (pcre_uchar)terminator)
1.1 misho 6319: {
6320: *errorcodeptr = ERR29;
6321: goto FAILED;
6322: }
6323:
6324: if (refsign == CHAR_MINUS)
6325: {
6326: if (recno == 0)
6327: {
6328: *errorcodeptr = ERR58;
6329: goto FAILED;
6330: }
6331: recno = cd->bracount - recno + 1;
6332: if (recno <= 0)
6333: {
6334: *errorcodeptr = ERR15;
6335: goto FAILED;
6336: }
6337: }
6338: else if (refsign == CHAR_PLUS)
6339: {
6340: if (recno == 0)
6341: {
6342: *errorcodeptr = ERR58;
6343: goto FAILED;
6344: }
6345: recno += cd->bracount;
6346: }
6347:
6348: /* Come here from code above that handles a named recursion */
6349:
6350: HANDLE_RECURSION:
6351:
6352: previous = code;
6353: called = cd->start_code;
6354:
6355: /* When we are actually compiling, find the bracket that is being
6356: referenced. Temporarily end the regex in case it doesn't exist before
6357: this point. If we end up with a forward reference, first check that
6358: the bracket does occur later so we can give the error (and position)
6359: now. Then remember this forward reference in the workspace so it can
6360: be filled in at the end. */
6361:
6362: if (lengthptr == NULL)
6363: {
6364: *code = OP_END;
6365: if (recno != 0)
1.1.1.2 ! misho 6366: called = PRIV(find_bracket)(cd->start_code, utf, recno);
1.1 misho 6367:
6368: /* Forward reference */
6369:
6370: if (called == NULL)
6371: {
6372: if (find_parens(cd, NULL, recno,
1.1.1.2 ! misho 6373: (options & PCRE_EXTENDED) != 0, utf) < 0)
1.1 misho 6374: {
6375: *errorcodeptr = ERR15;
6376: goto FAILED;
6377: }
6378:
6379: /* Fudge the value of "called" so that when it is inserted as an
6380: offset below, what is actually inserted is the reference number
1.1.1.2 ! misho 6381: of the group. Then remember the forward reference. */
1.1 misho 6382:
6383: called = cd->start_code + recno;
1.1.1.2 ! misho 6384: if (cd->hwm >= cd->start_workspace + cd->workspace_size -
! 6385: WORK_SIZE_SAFETY_MARGIN)
! 6386: {
! 6387: *errorcodeptr = expand_workspace(cd);
! 6388: if (*errorcodeptr != 0) goto FAILED;
! 6389: }
! 6390: PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
1.1 misho 6391: }
6392:
6393: /* If not a forward reference, and the subpattern is still open,
6394: this is a recursive call. We check to see if this is a left
1.1.1.2 ! misho 6395: recursion that could loop for ever, and diagnose that case. We
! 6396: must not, however, do this check if we are in a conditional
! 6397: subpattern because the condition might be testing for recursion in
! 6398: a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
! 6399: Forever loops are also detected at runtime, so those that occur in
! 6400: conditional subpatterns will be picked up then. */
1.1 misho 6401:
1.1.1.2 ! misho 6402: else if (GET(called, 1) == 0 && cond_depth <= 0 &&
! 6403: could_be_empty(called, code, bcptr, utf, cd))
1.1 misho 6404: {
6405: *errorcodeptr = ERR40;
6406: goto FAILED;
6407: }
6408: }
6409:
1.1.1.2 ! misho 6410: /* Insert the recursion/subroutine item. It does not have a set first
! 6411: character (relevant if it is repeated, because it will then be
! 6412: wrapped with ONCE brackets). */
1.1 misho 6413:
6414: *code = OP_RECURSE;
6415: PUT(code, 1, (int)(called - cd->start_code));
6416: code += 1 + LINK_SIZE;
1.1.1.2 ! misho 6417: groupsetfirstchar = FALSE;
1.1 misho 6418: }
6419:
6420: /* Can't determine a first byte now */
6421:
1.1.1.2 ! misho 6422: if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
1.1 misho 6423: continue;
6424:
6425:
6426: /* ------------------------------------------------------------ */
6427: default: /* Other characters: check option setting */
6428: OTHER_CHAR_AFTER_QUERY:
6429: set = unset = 0;
6430: optset = &set;
6431:
6432: while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
6433: {
6434: switch (*ptr++)
6435: {
6436: case CHAR_MINUS: optset = &unset; break;
6437:
6438: case CHAR_J: /* Record that it changed in the external options */
6439: *optset |= PCRE_DUPNAMES;
6440: cd->external_flags |= PCRE_JCHANGED;
6441: break;
6442:
6443: case CHAR_i: *optset |= PCRE_CASELESS; break;
6444: case CHAR_m: *optset |= PCRE_MULTILINE; break;
6445: case CHAR_s: *optset |= PCRE_DOTALL; break;
6446: case CHAR_x: *optset |= PCRE_EXTENDED; break;
6447: case CHAR_U: *optset |= PCRE_UNGREEDY; break;
6448: case CHAR_X: *optset |= PCRE_EXTRA; break;
6449:
6450: default: *errorcodeptr = ERR12;
6451: ptr--; /* Correct the offset */
6452: goto FAILED;
6453: }
6454: }
6455:
6456: /* Set up the changed option bits, but don't change anything yet. */
6457:
6458: newoptions = (options | set) & (~unset);
6459:
6460: /* If the options ended with ')' this is not the start of a nested
6461: group with option changes, so the options change at this level. If this
6462: item is right at the start of the pattern, the options can be
6463: abstracted and made external in the pre-compile phase, and ignored in
6464: the compile phase. This can be helpful when matching -- for instance in
6465: caseless checking of required bytes.
6466:
6467: If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
6468: definitely *not* at the start of the pattern because something has been
6469: compiled. In the pre-compile phase, however, the code pointer can have
6470: that value after the start, because it gets reset as code is discarded
6471: during the pre-compile. However, this can happen only at top level - if
6472: we are within parentheses, the starting BRA will still be present. At
6473: any parenthesis level, the length value can be used to test if anything
6474: has been compiled at that level. Thus, a test for both these conditions
6475: is necessary to ensure we correctly detect the start of the pattern in
6476: both phases.
6477:
1.1.1.2 ! misho 6478: If we are not at the pattern start, reset the greedy defaults and the
! 6479: case value for firstchar and reqchar. */
1.1 misho 6480:
6481: if (*ptr == CHAR_RIGHT_PARENTHESIS)
6482: {
6483: if (code == cd->start_code + 1 + LINK_SIZE &&
6484: (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
6485: {
6486: cd->external_options = newoptions;
6487: }
6488: else
6489: {
6490: greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
6491: greedy_non_default = greedy_default ^ 1;
1.1.1.2 ! misho 6492: req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
1.1 misho 6493: }
6494:
6495: /* Change options at this level, and pass them back for use
1.1.1.2 ! misho 6496: in subsequent branches. */
1.1 misho 6497:
6498: *optionsptr = options = newoptions;
6499: previous = NULL; /* This item can't be repeated */
6500: continue; /* It is complete */
6501: }
6502:
6503: /* If the options ended with ':' we are heading into a nested group
6504: with possible change of options. Such groups are non-capturing and are
6505: not assertions of any kind. All we need to do is skip over the ':';
6506: the newoptions value is handled below. */
6507:
6508: bravalue = OP_BRA;
6509: ptr++;
6510: } /* End of switch for character following (? */
6511: } /* End of (? handling */
6512:
6513: /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
6514: is set, all unadorned brackets become non-capturing and behave like (?:...)
6515: brackets. */
6516:
6517: else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
6518: {
6519: bravalue = OP_BRA;
6520: }
6521:
6522: /* Else we have a capturing group. */
6523:
6524: else
6525: {
6526: NUMBERED_GROUP:
6527: cd->bracount += 1;
6528: PUT2(code, 1+LINK_SIZE, cd->bracount);
1.1.1.2 ! misho 6529: skipbytes = IMM2_SIZE;
1.1 misho 6530: }
6531:
1.1.1.2 ! misho 6532: /* Process nested bracketed regex. Assertions used not to be repeatable,
! 6533: but this was changed for Perl compatibility, so all kinds can now be
! 6534: repeated. We copy code into a non-register variable (tempcode) in order to
! 6535: be able to pass its address because some compilers complain otherwise. */
1.1 misho 6536:
1.1.1.2 ! misho 6537: previous = code; /* For handling repetition */
1.1 misho 6538: *code = bravalue;
6539: tempcode = code;
1.1.1.2 ! misho 6540: tempreqvary = cd->req_varyopt; /* Save value before bracket */
! 6541: tempbracount = cd->bracount; /* Save value before bracket */
! 6542: length_prevgroup = 0; /* Initialize for pre-compile phase */
1.1 misho 6543:
6544: if (!compile_regex(
1.1.1.2 ! misho 6545: newoptions, /* The complete new option state */
! 6546: &tempcode, /* Where to put code (updated) */
! 6547: &ptr, /* Input pointer (updated) */
! 6548: errorcodeptr, /* Where to put an error message */
1.1 misho 6549: (bravalue == OP_ASSERTBACK ||
6550: bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1.1.1.2 ! misho 6551: reset_bracount, /* True if (?| group */
! 6552: skipbytes, /* Skip over bracket number */
! 6553: cond_depth +
! 6554: ((bravalue == OP_COND)?1:0), /* Depth of condition subpatterns */
! 6555: &subfirstchar, /* For possible first char */
! 6556: &subfirstcharflags,
! 6557: &subreqchar, /* For possible last char */
! 6558: &subreqcharflags,
! 6559: bcptr, /* Current branch chain */
! 6560: cd, /* Tables block */
! 6561: (lengthptr == NULL)? NULL : /* Actual compile phase */
! 6562: &length_prevgroup /* Pre-compile phase */
1.1 misho 6563: ))
6564: goto FAILED;
6565:
1.1.1.2 ! misho 6566: /* If this was an atomic group and there are no capturing groups within it,
! 6567: generate OP_ONCE_NC instead of OP_ONCE. */
! 6568:
! 6569: if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
! 6570: *code = OP_ONCE_NC;
! 6571:
! 6572: if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
! 6573: cd->assert_depth -= 1;
! 6574:
1.1 misho 6575: /* At the end of compiling, code is still pointing to the start of the
1.1.1.2 ! misho 6576: group, while tempcode has been updated to point past the end of the group.
! 6577: The pattern pointer (ptr) is on the bracket.
1.1 misho 6578:
1.1.1.2 ! misho 6579: If this is a conditional bracket, check that there are no more than
1.1 misho 6580: two branches in the group, or just one if it's a DEFINE group. We do this
6581: in the real compile phase, not in the pre-pass, where the whole group may
6582: not be available. */
6583:
6584: if (bravalue == OP_COND && lengthptr == NULL)
6585: {
1.1.1.2 ! misho 6586: pcre_uchar *tc = code;
1.1 misho 6587: int condcount = 0;
6588:
6589: do {
6590: condcount++;
6591: tc += GET(tc,1);
6592: }
6593: while (*tc != OP_KET);
6594:
6595: /* A DEFINE group is never obeyed inline (the "condition" is always
6596: false). It must have only one branch. */
6597:
6598: if (code[LINK_SIZE+1] == OP_DEF)
6599: {
6600: if (condcount > 1)
6601: {
6602: *errorcodeptr = ERR54;
6603: goto FAILED;
6604: }
6605: bravalue = OP_DEF; /* Just a flag to suppress char handling below */
6606: }
6607:
6608: /* A "normal" conditional group. If there is just one branch, we must not
1.1.1.2 ! misho 6609: make use of its firstchar or reqchar, because this is equivalent to an
1.1 misho 6610: empty second branch. */
6611:
6612: else
6613: {
6614: if (condcount > 2)
6615: {
6616: *errorcodeptr = ERR27;
6617: goto FAILED;
6618: }
1.1.1.2 ! misho 6619: if (condcount == 1) subfirstcharflags = subreqcharflags = REQ_NONE;
1.1 misho 6620: }
6621: }
6622:
6623: /* Error if hit end of pattern */
6624:
6625: if (*ptr != CHAR_RIGHT_PARENTHESIS)
6626: {
6627: *errorcodeptr = ERR14;
6628: goto FAILED;
6629: }
6630:
6631: /* In the pre-compile phase, update the length by the length of the group,
6632: less the brackets at either end. Then reduce the compiled code to just a
6633: set of non-capturing brackets so that it doesn't use much memory if it is
6634: duplicated by a quantifier.*/
6635:
6636: if (lengthptr != NULL)
6637: {
6638: if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6639: {
6640: *errorcodeptr = ERR20;
6641: goto FAILED;
6642: }
6643: *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
1.1.1.2 ! misho 6644: code++; /* This already contains bravalue */
1.1 misho 6645: PUTINC(code, 0, 1 + LINK_SIZE);
6646: *code++ = OP_KET;
6647: PUTINC(code, 0, 1 + LINK_SIZE);
6648: break; /* No need to waste time with special character handling */
6649: }
6650:
6651: /* Otherwise update the main code pointer to the end of the group. */
6652:
6653: code = tempcode;
6654:
6655: /* For a DEFINE group, required and first character settings are not
6656: relevant. */
6657:
6658: if (bravalue == OP_DEF) break;
6659:
6660: /* Handle updating of the required and first characters for other types of
6661: group. Update for normal brackets of all kinds, and conditions with two
6662: branches (see code above). If the bracket is followed by a quantifier with
1.1.1.2 ! misho 6663: zero repeat, we have to back off. Hence the definition of zeroreqchar and
! 6664: zerofirstchar outside the main loop so that they can be accessed for the
1.1 misho 6665: back off. */
6666:
1.1.1.2 ! misho 6667: zeroreqchar = reqchar;
! 6668: zeroreqcharflags = reqcharflags;
! 6669: zerofirstchar = firstchar;
! 6670: zerofirstcharflags = firstcharflags;
! 6671: groupsetfirstchar = FALSE;
1.1 misho 6672:
6673: if (bravalue >= OP_ONCE)
6674: {
1.1.1.2 ! misho 6675: /* If we have not yet set a firstchar in this branch, take it from the
1.1 misho 6676: subpattern, remembering that it was set here so that a repeat of more
1.1.1.2 ! misho 6677: than one can replicate it as reqchar if necessary. If the subpattern has
! 6678: no firstchar, set "none" for the whole branch. In both cases, a zero
! 6679: repeat forces firstchar to "none". */
1.1 misho 6680:
1.1.1.2 ! misho 6681: if (firstcharflags == REQ_UNSET)
1.1 misho 6682: {
1.1.1.2 ! misho 6683: if (subfirstcharflags >= 0)
1.1 misho 6684: {
1.1.1.2 ! misho 6685: firstchar = subfirstchar;
! 6686: firstcharflags = subfirstcharflags;
! 6687: groupsetfirstchar = TRUE;
1.1 misho 6688: }
1.1.1.2 ! misho 6689: else firstcharflags = REQ_NONE;
! 6690: zerofirstcharflags = REQ_NONE;
1.1 misho 6691: }
6692:
1.1.1.2 ! misho 6693: /* If firstchar was previously set, convert the subpattern's firstchar
! 6694: into reqchar if there wasn't one, using the vary flag that was in
1.1 misho 6695: existence beforehand. */
6696:
1.1.1.2 ! misho 6697: else if (subfirstcharflags >= 0 && subreqcharflags < 0)
! 6698: {
! 6699: subreqchar = subfirstchar;
! 6700: subreqcharflags = subfirstcharflags | tempreqvary;
! 6701: }
1.1 misho 6702:
6703: /* If the subpattern set a required byte (or set a first byte that isn't
6704: really the first byte - see above), set it. */
6705:
1.1.1.2 ! misho 6706: if (subreqcharflags >= 0)
! 6707: {
! 6708: reqchar = subreqchar;
! 6709: reqcharflags = subreqcharflags;
! 6710: }
1.1 misho 6711: }
6712:
1.1.1.2 ! misho 6713: /* For a forward assertion, we take the reqchar, if set. This can be
1.1 misho 6714: helpful if the pattern that follows the assertion doesn't set a different
1.1.1.2 ! misho 6715: char. For example, it's useful for /(?=abcde).+/. We can't set firstchar
1.1 misho 6716: for an assertion, however, because it leads to an incorrect effect for patterns
1.1.1.2 ! misho 6717: such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead
! 6718: of a firstchar. This is overcome by a scan at the end if there's no
! 6719: firstchar, looking for an asserted first char. */
1.1 misho 6720:
1.1.1.2 ! misho 6721: else if (bravalue == OP_ASSERT && subreqcharflags >= 0)
! 6722: {
! 6723: reqchar = subreqchar;
! 6724: reqcharflags = subreqcharflags;
! 6725: }
1.1 misho 6726: break; /* End of processing '(' */
6727:
6728:
6729: /* ===================================================================*/
6730: /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
6731: are arranged to be the negation of the corresponding OP_values in the
6732: default case when PCRE_UCP is not set. For the back references, the values
1.1.1.2 ! misho 6733: are negative the reference number. Only back references and those types
1.1 misho 6734: that consume a character may be repeated. We can test for values between
6735: ESC_b and ESC_Z for the latter; this may have to change if any new ones are
6736: ever created. */
6737:
6738: case CHAR_BACKSLASH:
6739: tempptr = ptr;
1.1.1.2 ! misho 6740: escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, FALSE);
! 6741:
1.1 misho 6742: if (*errorcodeptr != 0) goto FAILED;
6743:
1.1.1.2 ! misho 6744: if (escape == 0)
! 6745: c = ec;
! 6746: else
1.1 misho 6747: {
1.1.1.2 ! misho 6748: if (escape == ESC_Q) /* Handle start of quoted string */
1.1 misho 6749: {
6750: if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
6751: ptr += 2; /* avoid empty string */
6752: else inescq = TRUE;
6753: continue;
6754: }
6755:
1.1.1.2 ! misho 6756: if (escape == ESC_E) continue; /* Perl ignores an orphan \E */
1.1 misho 6757:
6758: /* For metasequences that actually match a character, we disable the
6759: setting of a first character if it hasn't already been set. */
6760:
1.1.1.2 ! misho 6761: if (firstcharflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
! 6762: firstcharflags = REQ_NONE;
1.1 misho 6763:
6764: /* Set values to reset to if this is followed by a zero repeat. */
6765:
1.1.1.2 ! misho 6766: zerofirstchar = firstchar;
! 6767: zerofirstcharflags = firstcharflags;
! 6768: zeroreqchar = reqchar;
! 6769: zeroreqcharflags = reqcharflags;
1.1 misho 6770:
6771: /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
6772: is a subroutine call by number (Oniguruma syntax). In fact, the value
1.1.1.2 ! misho 6773: ESC_g is returned only for these cases. So we don't need to check for <
! 6774: or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
! 6775: -n, and for the Perl syntax \g{name} the result is ESC_k (as
1.1 misho 6776: that is a synonym for a named back reference). */
6777:
1.1.1.2 ! misho 6778: if (escape == ESC_g)
1.1 misho 6779: {
1.1.1.2 ! misho 6780: const pcre_uchar *p;
1.1 misho 6781: save_hwm = cd->hwm; /* Normally this is set when '(' is read */
6782: terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6783: CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6784:
6785: /* These two statements stop the compiler from warning about possibly
6786: unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
6787: fact, because we actually check for a number below, the paths that
6788: would actually be in error are never taken. */
6789:
6790: skipbytes = 0;
6791: reset_bracount = FALSE;
6792:
6793: /* Test for a name */
6794:
6795: if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
6796: {
1.1.1.2 ! misho 6797: BOOL is_a_number = TRUE;
! 6798: for (p = ptr + 1; *p != CHAR_NULL && *p != (pcre_uchar)terminator; p++)
1.1 misho 6799: {
1.1.1.2 ! misho 6800: if (!MAX_255(*p)) { is_a_number = FALSE; break; }
! 6801: if ((cd->ctypes[*p] & ctype_digit) == 0) is_a_number = FALSE;
1.1 misho 6802: if ((cd->ctypes[*p] & ctype_word) == 0) break;
6803: }
1.1.1.2 ! misho 6804: if (*p != (pcre_uchar)terminator)
1.1 misho 6805: {
6806: *errorcodeptr = ERR57;
6807: break;
6808: }
1.1.1.2 ! misho 6809: if (is_a_number)
1.1 misho 6810: {
6811: ptr++;
6812: goto HANDLE_NUMERICAL_RECURSION;
6813: }
6814: is_recurse = TRUE;
6815: goto NAMED_REF_OR_RECURSE;
6816: }
6817:
6818: /* Test a signed number in angle brackets or quotes. */
6819:
6820: p = ptr + 2;
1.1.1.2 ! misho 6821: while (IS_DIGIT(*p)) p++;
! 6822: if (*p != (pcre_uchar)terminator)
1.1 misho 6823: {
6824: *errorcodeptr = ERR57;
6825: break;
6826: }
6827: ptr++;
6828: goto HANDLE_NUMERICAL_RECURSION;
6829: }
6830:
6831: /* \k<name> or \k'name' is a back reference by name (Perl syntax).
1.1.1.2 ! misho 6832: We also support \k{name} (.NET syntax). */
1.1 misho 6833:
1.1.1.2 ! misho 6834: if (escape == ESC_k)
1.1 misho 6835: {
1.1.1.2 ! misho 6836: if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
! 6837: ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
! 6838: {
! 6839: *errorcodeptr = ERR69;
! 6840: break;
! 6841: }
1.1 misho 6842: is_recurse = FALSE;
6843: terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6844: CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
6845: CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
6846: goto NAMED_REF_OR_RECURSE;
6847: }
6848:
1.1.1.2 ! misho 6849: /* Back references are handled specially; must disable firstchar if
1.1 misho 6850: not set to cope with cases like (?=(\w+))\1: which would otherwise set
6851: ':' later. */
6852:
1.1.1.2 ! misho 6853: if (escape < 0)
1.1 misho 6854: {
6855: open_capitem *oc;
1.1.1.2 ! misho 6856: recno = -escape;
1.1 misho 6857:
6858: HANDLE_REFERENCE: /* Come here from named backref handling */
1.1.1.2 ! misho 6859: if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
1.1 misho 6860: previous = code;
1.1.1.2 ! misho 6861: *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
1.1 misho 6862: PUT2INC(code, 0, recno);
6863: cd->backref_map |= (recno < 32)? (1 << recno) : 1;
6864: if (recno > cd->top_backref) cd->top_backref = recno;
6865:
6866: /* Check to see if this back reference is recursive, that is, it
6867: is inside the group that it references. A flag is set so that the
6868: group can be made atomic. */
6869:
6870: for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6871: {
6872: if (oc->number == recno)
6873: {
6874: oc->flag = TRUE;
6875: break;
6876: }
6877: }
6878: }
6879:
6880: /* So are Unicode property matches, if supported. */
6881:
6882: #ifdef SUPPORT_UCP
1.1.1.2 ! misho 6883: else if (escape == ESC_P || escape == ESC_p)
1.1 misho 6884: {
6885: BOOL negated;
1.1.1.2 ! misho 6886: unsigned int ptype = 0, pdata = 0;
! 6887: if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
! 6888: goto FAILED;
1.1 misho 6889: previous = code;
1.1.1.2 ! misho 6890: *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
1.1 misho 6891: *code++ = ptype;
6892: *code++ = pdata;
6893: }
6894: #else
6895:
6896: /* If Unicode properties are not supported, \X, \P, and \p are not
6897: allowed. */
6898:
1.1.1.2 ! misho 6899: else if (escape == ESC_X || escape == ESC_P || escape == ESC_p)
1.1 misho 6900: {
6901: *errorcodeptr = ERR45;
6902: goto FAILED;
6903: }
6904: #endif
6905:
6906: /* For the rest (including \X when Unicode properties are supported), we
6907: can obtain the OP value by negating the escape value in the default
6908: situation when PCRE_UCP is not set. When it *is* set, we substitute
1.1.1.2 ! misho 6909: Unicode property tests. Note that \b and \B do a one-character
! 6910: lookbehind. */
1.1 misho 6911:
6912: else
6913: {
1.1.1.2 ! misho 6914: if ((escape == ESC_b || escape == ESC_B) && cd->max_lookbehind == 0)
! 6915: cd->max_lookbehind = 1;
1.1 misho 6916: #ifdef SUPPORT_UCP
1.1.1.2 ! misho 6917: if (escape >= ESC_DU && escape <= ESC_wu)
1.1 misho 6918: {
6919: nestptr = ptr + 1; /* Where to resume */
1.1.1.2 ! misho 6920: ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
1.1 misho 6921: }
6922: else
6923: #endif
1.1.1.2 ! misho 6924: /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
! 6925: so that it works in DFA mode and in lookbehinds. */
! 6926:
1.1 misho 6927: {
1.1.1.2 ! misho 6928: previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
! 6929: *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
1.1 misho 6930: }
6931: }
6932: continue;
6933: }
6934:
6935: /* We have a data character whose value is in c. In UTF-8 mode it may have
6936: a value > 127. We set its representation in the length/buffer, and then
6937: handle it as a data character. */
6938:
1.1.1.2 ! misho 6939: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
! 6940: if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
! 6941: mclength = PRIV(ord2utf)(c, mcbuffer);
1.1 misho 6942: else
6943: #endif
6944:
6945: {
6946: mcbuffer[0] = c;
6947: mclength = 1;
6948: }
6949: goto ONE_CHAR;
6950:
6951:
6952: /* ===================================================================*/
6953: /* Handle a literal character. It is guaranteed not to be whitespace or #
6954: when the extended flag is set. If we are in UTF-8 mode, it may be a
6955: multi-byte literal character. */
6956:
6957: default:
6958: NORMAL_CHAR:
6959: mclength = 1;
6960: mcbuffer[0] = c;
6961:
1.1.1.2 ! misho 6962: #ifdef SUPPORT_UTF
! 6963: if (utf && HAS_EXTRALEN(c))
! 6964: ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
1.1 misho 6965: #endif
6966:
6967: /* At this point we have the character's bytes in mcbuffer, and the length
6968: in mclength. When not in UTF-8 mode, the length is always 1. */
6969:
6970: ONE_CHAR:
6971: previous = code;
1.1.1.2 ! misho 6972:
! 6973: /* For caseless UTF-8 mode when UCP support is available, check whether
! 6974: this character has more than one other case. If so, generate a special
! 6975: OP_PROP item instead of OP_CHARI. */
! 6976:
! 6977: #ifdef SUPPORT_UCP
! 6978: if (utf && (options & PCRE_CASELESS) != 0)
! 6979: {
! 6980: GETCHAR(c, mcbuffer);
! 6981: if ((c = UCD_CASESET(c)) != 0)
! 6982: {
! 6983: *code++ = OP_PROP;
! 6984: *code++ = PT_CLIST;
! 6985: *code++ = c;
! 6986: if (firstcharflags == REQ_UNSET) firstcharflags = zerofirstcharflags = REQ_NONE;
! 6987: break;
! 6988: }
! 6989: }
! 6990: #endif
! 6991:
! 6992: /* Caseful matches, or not one of the multicase characters. */
! 6993:
! 6994: *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
1.1 misho 6995: for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
6996:
6997: /* Remember if \r or \n were seen */
6998:
6999: if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
7000: cd->external_flags |= PCRE_HASCRORLF;
7001:
7002: /* Set the first and required bytes appropriately. If no previous first
7003: byte, set it from this character, but revert to none on a zero repeat.
1.1.1.2 ! misho 7004: Otherwise, leave the firstchar value alone, and don't change it on a zero
1.1 misho 7005: repeat. */
7006:
1.1.1.2 ! misho 7007: if (firstcharflags == REQ_UNSET)
1.1 misho 7008: {
1.1.1.2 ! misho 7009: zerofirstcharflags = REQ_NONE;
! 7010: zeroreqchar = reqchar;
! 7011: zeroreqcharflags = reqcharflags;
1.1 misho 7012:
1.1.1.2 ! misho 7013: /* If the character is more than one byte long, we can set firstchar
1.1 misho 7014: only if it is not to be matched caselessly. */
7015:
7016: if (mclength == 1 || req_caseopt == 0)
7017: {
1.1.1.2 ! misho 7018: firstchar = mcbuffer[0] | req_caseopt;
! 7019: firstchar = mcbuffer[0];
! 7020: firstcharflags = req_caseopt;
! 7021:
! 7022: if (mclength != 1)
! 7023: {
! 7024: reqchar = code[-1];
! 7025: reqcharflags = cd->req_varyopt;
! 7026: }
1.1 misho 7027: }
1.1.1.2 ! misho 7028: else firstcharflags = reqcharflags = REQ_NONE;
1.1 misho 7029: }
7030:
1.1.1.2 ! misho 7031: /* firstchar was previously set; we can set reqchar only if the length is
1.1 misho 7032: 1 or the matching is caseful. */
7033:
7034: else
7035: {
1.1.1.2 ! misho 7036: zerofirstchar = firstchar;
! 7037: zerofirstcharflags = firstcharflags;
! 7038: zeroreqchar = reqchar;
! 7039: zeroreqcharflags = reqcharflags;
1.1 misho 7040: if (mclength == 1 || req_caseopt == 0)
1.1.1.2 ! misho 7041: {
! 7042: reqchar = code[-1];
! 7043: reqcharflags = req_caseopt | cd->req_varyopt;
! 7044: }
1.1 misho 7045: }
7046:
7047: break; /* End of literal character handling */
7048: }
7049: } /* end of big loop */
7050:
7051:
7052: /* Control never reaches here by falling through, only by a goto for all the
7053: error states. Pass back the position in the pattern so that it can be displayed
7054: to the user for diagnosing the error. */
7055:
7056: FAILED:
7057: *ptrptr = ptr;
7058: return FALSE;
7059: }
7060:
7061:
7062:
7063: /*************************************************
7064: * Compile sequence of alternatives *
7065: *************************************************/
7066:
7067: /* On entry, ptr is pointing past the bracket character, but on return it
7068: points to the closing bracket, or vertical bar, or end of string. The code
7069: variable is pointing at the byte into which the BRA operator has been stored.
7070: This function is used during the pre-compile phase when we are trying to find
7071: out the amount of memory needed, as well as during the real compile phase. The
7072: value of lengthptr distinguishes the two phases.
7073:
7074: Arguments:
7075: options option bits, including any changes for this subpattern
7076: codeptr -> the address of the current code pointer
7077: ptrptr -> the address of the current pattern pointer
7078: errorcodeptr -> pointer to error code variable
7079: lookbehind TRUE if this is a lookbehind assertion
7080: reset_bracount TRUE to reset the count for each branch
7081: skipbytes skip this many bytes at start (for brackets and OP_COND)
1.1.1.2 ! misho 7082: cond_depth depth of nesting for conditional subpatterns
! 7083: firstcharptr place to put the first required character
! 7084: firstcharflagsptr place to put the first character flags, or a negative number
! 7085: reqcharptr place to put the last required character
! 7086: reqcharflagsptr place to put the last required character flags, or a negative number
1.1 misho 7087: bcptr pointer to the chain of currently open branches
7088: cd points to the data block with tables pointers etc.
7089: lengthptr NULL during the real compile phase
7090: points to length accumulator during pre-compile phase
7091:
7092: Returns: TRUE on success
7093: */
/* Additional notes on the contract, as implemented below:

  . The four first/required character outputs and *codeptr are written only on
    the successful return path.
  . In the pre-compile phase (lengthptr != NULL) the per-branch merging of
    first/required characters is skipped, so the values passed back are the
    initial ones: 0 with REQ_UNSET flags.
  . On every FALSE return *ptrptr has already been updated to the current
    pattern position. This function itself sets *errorcodeptr to ERR36, ERR70
    or ERR25 for an unusable lookbehind branch and to ERR20 when adding the
    group length to *lengthptr would exceed OFLOW_MAX; a failure of
    compile_branch() is passed straight back. */
7094:
7095: static BOOL
1.1.1.2 ! misho 7096: compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
1.1 misho 7097: int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
1.1.1.2 ! misho 7098: int cond_depth,
! 7099: pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
! 7100: pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
! 7101: branch_chain *bcptr, compile_data *cd, int *lengthptr)
1.1 misho 7102: {
1.1.1.2 ! misho 7103: const pcre_uchar *ptr = *ptrptr;
! 7104: pcre_uchar *code = *codeptr;
! 7105: pcre_uchar *last_branch = code;
! 7106: pcre_uchar *start_bracket = code;
! 7107: pcre_uchar *reverse_count = NULL;
1.1 misho 7108: open_capitem capitem;
7109: int capnumber = 0;
1.1.1.2 ! misho 7110: pcre_uint32 firstchar, reqchar;
! 7111: pcre_int32 firstcharflags, reqcharflags;
! 7112: pcre_uint32 branchfirstchar, branchreqchar;
! 7113: pcre_int32 branchfirstcharflags, branchreqcharflags;
1.1 misho 7114: int length;
1.1.1.2 ! misho 7115: unsigned int orig_bracount;
! 7116: unsigned int max_bracount;
1.1 misho 7117: branch_chain bc;
7118:
/* Link this group into the chain of open branches that is handed down to
compile_branch(). */
7119: bc.outer = bcptr;
7120: bc.current_branch = code;
7121:
1.1.1.2 ! misho 7122: firstchar = reqchar = 0;
! 7123: firstcharflags = reqcharflags = REQ_UNSET;
1.1 misho 7124:
7125: /* Accumulate the length for use in the pre-compile phase. Start with the
7126: length of the BRA and KET and any extra bytes that are required at the
7127: beginning. We accumulate in a local variable to save frequent testing of
7128: lengthptr for NULL. We cannot do this by looking at the value of code at the
7129: start and end of each alternative, because compiled items are discarded during
7130: the pre-compile phase so that the work space is not exceeded. */
7131:
7132: length = 2 + 2*LINK_SIZE + skipbytes;
7133:
7134: /* WARNING: If the above line is changed for any reason, you must also change
7135: the code that abstracts option settings at the start of the pattern and makes
7136: them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
7137: pre-compile phase to find out whether anything has yet been compiled or not. */
7138:
7139: /* If this is a capturing subpattern, add to the chain of open capturing items
7140: so that we can detect them if (*ACCEPT) is encountered. This is also used to
1.1.1.2 ! misho 7141: detect groups that contain recursive back references to themselves. Note that
! 7142: only OP_CBRA need be tested here; changing this opcode to one of its variants,
! 7143: e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
1.1 misho 7144:
7145: if (*code == OP_CBRA)
7146: {
7147: capnumber = GET2(code, 1 + LINK_SIZE);
7148: capitem.number = capnumber;
7149: capitem.next = cd->open_caps;
7150: capitem.flag = FALSE;
7151: cd->open_caps = &capitem;
7152: }
7153:
7154: /* Offset is set zero to mark that this bracket is still open */
7155:
7156: PUT(code, 1, 0);
7157: code += 1 + LINK_SIZE + skipbytes;
7158:
7159: /* Loop for each alternative branch */
7160:
7161: orig_bracount = max_bracount = cd->bracount;
7162: for (;;)
7163: {
7164: /* For a (?| group, reset the capturing bracket count so that each branch
7165: uses the same numbers. */
7166:
7167: if (reset_bracount) cd->bracount = orig_bracount;
7168:
7169: /* Set up dummy OP_REVERSE if lookbehind assertion */
7170:
7171: if (lookbehind)
7172: {
7173: *code++ = OP_REVERSE;
7174: reverse_count = code;
7175: PUTINC(code, 0, 0);
7176: length += 1 + LINK_SIZE;
7177: }
7178:
7179: /* Now compile the branch; in the pre-compile phase its length gets added
7180: into the length. */
7181:
1.1.1.2 ! misho 7182: if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
! 7183: &branchfirstcharflags, &branchreqchar, &branchreqcharflags, &bc,
! 7184: cond_depth, cd, (lengthptr == NULL)? NULL : &length))
1.1 misho 7185: {
7186: *ptrptr = ptr;
7187: return FALSE;
7188: }
7189:
7190: /* Keep the highest bracket count in case (?| was used and some branch
7191: has fewer than the rest. */
7192:
7193: if (cd->bracount > max_bracount) max_bracount = cd->bracount;
7194:
7195: /* In the real compile phase, there is some post-processing to be done. */
7196:
7197: if (lengthptr == NULL)
7198: {
1.1.1.2 ! misho 7199: /* If this is the first branch, the firstchar and reqchar values for the
1.1 misho 7200: branch become the values for the regex. */
7201:
7202: if (*last_branch != OP_ALT)
7203: {
1.1.1.2 ! misho 7204: firstchar = branchfirstchar;
! 7205: firstcharflags = branchfirstcharflags;
! 7206: reqchar = branchreqchar;
! 7207: reqcharflags = branchreqcharflags;
1.1 misho 7208: }
7209:
1.1.1.2 ! misho 7210: /* If this is not the first branch, the first char and reqchar have to
1.1 misho 7211: match the values from all the previous branches, except that if the
1.1.1.2 ! misho 7212: previous value for reqchar didn't have REQ_VARY set, it can still match,
1.1 misho 7213: and we set REQ_VARY for the regex. */
7214:
7215: else
7216: {
1.1.1.2 ! misho 7217: /* If we previously had a firstchar, but it doesn't match the new branch,
! 7218: we have to abandon the firstchar for the regex, but if there was
! 7219: previously no reqchar, it takes on the value of the old firstchar. */
1.1 misho 7220:
1.1.1.2 ! misho 7221: if (firstcharflags >= 0 &&
! 7222: (firstcharflags != branchfirstcharflags || firstchar != branchfirstchar))
1.1 misho 7223: {
1.1.1.2 ! misho 7224: if (reqcharflags < 0)
! 7225: {
! 7226: reqchar = firstchar;
! 7227: reqcharflags = firstcharflags;
! 7228: }
! 7229: firstcharflags = REQ_NONE;
1.1 misho 7230: }
7231:
1.1.1.2 ! misho 7232: /* If we (now or from before) have no firstchar, a firstchar from the
! 7233: branch becomes a reqchar if there isn't a branch reqchar. */
1.1 misho 7234:
1.1.1.2 ! misho 7235: if (firstcharflags < 0 && branchfirstcharflags >= 0 && branchreqcharflags < 0)
! 7236: {
! 7237: branchreqchar = branchfirstchar;
! 7238: branchreqcharflags = branchfirstcharflags;
! 7239: }
1.1 misho 7240:
1.1.1.2 ! misho 7241: /* Now ensure that the reqchars match */
1.1 misho 7242:
1.1.1.2 ! misho 7243: if (((reqcharflags & ~REQ_VARY) != (branchreqcharflags & ~REQ_VARY)) ||
! 7244: reqchar != branchreqchar)
! 7245: reqcharflags = REQ_NONE;
! 7246: else
! 7247: {
! 7248: reqchar = branchreqchar;
! 7249: reqcharflags |= branchreqcharflags; /* To "or" REQ_VARY */
! 7250: }
1.1 misho 7251: }
7252:
7253: /* If lookbehind, check that this branch matches a fixed-length string, and
7254: put the length into the OP_REVERSE item. Temporarily mark the end of the
7255: branch with OP_END. If the branch contains OP_RECURSE, the result is -3
7256: because there may be forward references that we can't check here. Set a
7257: flag to cause another lookbehind check at the end. Why not do it all at the
7258: end? Because common, erroneous checks are picked up here and the offset of
7259: the problem can be shown. */
7260:
7261: if (lookbehind)
7262: {
7263: int fixed_length;
7264: *code = OP_END;
1.1.1.2 ! misho 7265: fixed_length = find_fixedlength(last_branch, (options & PCRE_UTF8) != 0,
! 7266: FALSE, cd);
1.1 misho 7267: DPRINTF(("fixed length = %d\n", fixed_length));
7268: if (fixed_length == -3)
7269: {
7270: cd->check_lookbehind = TRUE;
7271: }
7272: else if (fixed_length < 0)
7273: {
/* -2 maps to ERR36, -4 to ERR70, and any other negative value to ERR25. */
1.1.1.2 ! misho 7274: *errorcodeptr = (fixed_length == -2)? ERR36 :
! 7275: (fixed_length == -4)? ERR70: ERR25;
1.1 misho 7276: *ptrptr = ptr;
7277: return FALSE;
7278: }
1.1.1.2 ! misho 7279: else
! 7280: {
! 7281: if (fixed_length > cd->max_lookbehind)
! 7282: cd->max_lookbehind = fixed_length;
! 7283: PUT(reverse_count, 0, fixed_length);
! 7284: }
1.1 misho 7285: }
7286: }
7287:
7288: /* Reached end of expression, either ')' or end of pattern. In the real
7289: compile phase, go back through the alternative branches and reverse the chain
7290: of offsets, with the field in the BRA item now becoming an offset to the
7291: first alternative. If there are no alternatives, it points to the end of the
7292: group. The length in the terminating ket is always the length of the whole
1.1.1.2 ! misho 7293: bracketed item. Return leaving the pointer at the terminating char. */
1.1 misho 7294:
7295: if (*ptr != CHAR_VERTICAL_LINE)
7296: {
7297: if (lengthptr == NULL)
7298: {
/* Each link field currently holds the backward distance to the previous
branch (zero in the opening bracket). Walking back towards the start of the
group, overwrite each one with the forward length of its own branch; the zero
read from the opening bracket terminates the loop. */
7299: int branch_length = (int)(code - last_branch);
7300: do
7301: {
7302: int prev_length = GET(last_branch, 1);
7303: PUT(last_branch, 1, branch_length);
7304: branch_length = prev_length;
7305: last_branch -= branch_length;
7306: }
7307: while (branch_length > 0);
7308: }
7309:
7310: /* Fill in the ket */
7311:
7312: *code = OP_KET;
7313: PUT(code, 1, (int)(code - start_bracket));
7314: code += 1 + LINK_SIZE;
7315:
7316: /* If it was a capturing subpattern, check to see if it contained any
7317: recursive back references. If so, we must wrap it in atomic brackets.
7318: In any event, remove the block from the chain. */
7319:
7320: if (capnumber > 0)
7321: {
7322: if (cd->open_caps->flag)
7323: {
/* Shift the whole compiled group up by 1 + LINK_SIZE units to open a gap for
the OP_ONCE header, then append a second KET that closes the new wrapper. The
extra header and KET are added to the length as well. */
7324: memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
1.1.1.2 ! misho 7325: IN_UCHARS(code - start_bracket));
1.1 misho 7326: *start_bracket = OP_ONCE;
7327: code += 1 + LINK_SIZE;
7328: PUT(start_bracket, 1, (int)(code - start_bracket));
7329: *code = OP_KET;
7330: PUT(code, 1, (int)(code - start_bracket));
7331: code += 1 + LINK_SIZE;
7332: length += 2 + 2*LINK_SIZE;
7333: }
7334: cd->open_caps = cd->open_caps->next;
7335: }
7336:
7337: /* Retain the highest bracket number, in case resetting was used. */
7338:
7339: cd->bracount = max_bracount;
7340:
7341: /* Set values to pass back */
7342:
7343: *codeptr = code;
7344: *ptrptr = ptr;
1.1.1.2 ! misho 7345: *firstcharptr = firstchar;
! 7346: *firstcharflagsptr = firstcharflags;
! 7347: *reqcharptr = reqchar;
! 7348: *reqcharflagsptr = reqcharflags;
1.1 misho 7349: if (lengthptr != NULL)
7350: {
/* Guard the addition below: fail with ERR20 rather than let the accumulated
length go past OFLOW_MAX. */
7351: if (OFLOW_MAX - *lengthptr < length)
7352: {
7353: *errorcodeptr = ERR20;
7354: return FALSE;
7355: }
7356: *lengthptr += length;
7357: }
7358: return TRUE;
7359: }
7360:
7361: /* Another branch follows. In the pre-compile phase, we can move the code
7362: pointer back to where it was for the start of the first branch. (That is,
7363: pretend that each branch is the only one.)
7364:
7365: In the real compile phase, insert an ALT node. Its length field points back
7366: to the previous branch while the bracket remains open. At the end the chain
7367: is reversed. It's done like this so that the start of the bracket has a
7368: zero offset until it is closed, making it possible to detect recursion. */
7369:
7370: if (lengthptr != NULL)
7371: {
7372: code = *codeptr + 1 + LINK_SIZE + skipbytes;
7373: length += 1 + LINK_SIZE;
7374: }
7375: else
7376: {
7377: *code = OP_ALT;
7378: PUT(code, 1, (int)(code - last_branch));
7379: bc.current_branch = last_branch = code;
7380: code += 1 + LINK_SIZE;
7381: }
7382:
/* Step over the vertical bar before compiling the next alternative. */
7383: ptr++;
7384: }
7385: /* Control never reaches here */
7386: }
7387:
7388:
7389:
7390:
7391: /*************************************************
7392: * Check for anchored expression *
7393: *************************************************/
7394:
7395: /* Try to find out if this is an anchored regular expression. Consider each
7396: alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
7397: all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1.1.1.2 ! misho 7398: it's anchored. However, if this is a multiline pattern, then only OP_SOD will
! 7399: be found, because ^ generates OP_CIRCM in that mode.
1.1 misho 7400:
7401: We can also consider a regex to be anchored if OP_SOM starts all its branches.
7402: This is the code for \G, which means "match at start of match position, taking
7403: into account the match offset".
7404:
7405: A branch is also implicitly anchored if it starts with .* and DOTALL is set,
7406: because that will try the rest of the pattern at all possible matching points,
7407: so there is no point trying again.... er ....
7408:
7409: .... except when the .* appears inside capturing parentheses, and there is a
7410: subsequent back reference to those parentheses. We haven't enough information
7411: to catch that case precisely.
7412:
7413: At first, the best we could do was to detect when .* was in capturing brackets
7414: and the highest back reference was greater than or equal to that level.
7415: However, by keeping a bitmap of the first 31 back references, we can catch some
7416: of the more common cases more precisely.
7417:
1.1.1.2 ! misho 7418: ... A second exception is when the .* appears inside an atomic group, because
! 7419: this prevents the number of characters it matches from being adjusted.
! 7420:
1.1 misho 7421: Arguments:
7422: code points to start of expression (the bracket)
7423: bracket_map a bitmap of which brackets we are inside while testing; this
7424: handles up to substring 31; after that we just have to take
7425: the less precise approach
1.1.1.2 ! misho 7426: cd points to the compile data block
! 7427: atomcount atomic group level
1.1 misho 7428:
7429: Returns: TRUE or FALSE
7430: */
7431:
7432: static BOOL
1.1.1.2 ! misho 7433: is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
! 7434: compile_data *cd, int atomcount)
1.1 misho 7435: {
7436: do {
1.1.1.2 ! misho 7437: const pcre_uchar *scode = first_significant_code(
! 7438: code + PRIV(OP_lengths)[*code], FALSE);
1.1 misho 7439: register int op = *scode;
7440:
7441: /* Non-capturing brackets */
7442:
1.1.1.2 ! misho 7443: if (op == OP_BRA || op == OP_BRAPOS ||
! 7444: op == OP_SBRA || op == OP_SBRAPOS)
1.1 misho 7445: {
1.1.1.2 ! misho 7446: if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
1.1 misho 7447: }
7448:
7449: /* Capturing brackets */
7450:
1.1.1.2 ! misho 7451: else if (op == OP_CBRA || op == OP_CBRAPOS ||
! 7452: op == OP_SCBRA || op == OP_SCBRAPOS)
1.1 misho 7453: {
7454: int n = GET2(scode, 1+LINK_SIZE);
7455: int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
1.1.1.2 ! misho 7456: if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE;
! 7457: }
! 7458:
! 7459: /* Positive forward assertions and conditions */
! 7460:
! 7461: else if (op == OP_ASSERT || op == OP_COND)
! 7462: {
! 7463: if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
1.1 misho 7464: }
7465:
1.1.1.2 ! misho 7466: /* Atomic groups */
1.1 misho 7467:
1.1.1.2 ! misho 7468: else if (op == OP_ONCE || op == OP_ONCE_NC)
1.1 misho 7469: {
1.1.1.2 ! misho 7470: if (!is_anchored(scode, bracket_map, cd, atomcount + 1))
! 7471: return FALSE;
1.1 misho 7472: }
7473:
7474: /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
1.1.1.2 ! misho 7475: it isn't in brackets that are or may be referenced or inside an atomic
! 7476: group. */
1.1 misho 7477:
7478: else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
7479: op == OP_TYPEPOSSTAR))
7480: {
1.1.1.2 ! misho 7481: if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 ||
! 7482: atomcount > 0 || cd->had_pruneorskip)
1.1 misho 7483: return FALSE;
7484: }
7485:
7486: /* Check for explicit anchoring */
7487:
1.1.1.2 ! misho 7488: else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
! 7489:
1.1 misho 7490: code += GET(code, 1);
7491: }
7492: while (*code == OP_ALT); /* Loop for each alternative */
7493: return TRUE;
7494: }
7495:
7496:
7497:
7498: /*************************************************
7499: * Check for starting with ^ or .* *
7500: *************************************************/
7501:
7502: /* This is called to find out if every branch starts with ^ or .* so that
7503: "first char" processing can be done to speed things up in multiline
7504: matching and for non-DOTALL patterns that start with .* (which must start at
7505: the beginning or after \n). As in the case of is_anchored() (see above), we
7506: have to take account of back references to capturing brackets that contain .*
1.1.1.2 ! misho 7507: because in that case we can't make the assumption. Also, the appearance of .*
! 7508: inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
! 7509: count, because once again the assumption no longer holds.
1.1 misho 7510:
7511: Arguments:
7512: code points to start of expression (the bracket)
7513: bracket_map a bitmap of which brackets we are inside while testing; this
7514: handles up to substring 31; after that we just have to take
7515: the less precise approach
1.1.1.2 ! misho 7516: cd points to the compile data
! 7517: atomcount atomic group level
1.1 misho 7518:
7519: Returns: TRUE or FALSE
7520: */
7521:
7522: static BOOL
1.1.1.2 ! misho 7523: is_startline(const pcre_uchar *code, unsigned int bracket_map,
! 7524: compile_data *cd, int atomcount)
1.1 misho 7525: {
7526: do {
1.1.1.2 ! misho 7527: const pcre_uchar *scode = first_significant_code(
! 7528: code + PRIV(OP_lengths)[*code], FALSE);
1.1 misho 7529: register int op = *scode;
7530:
7531: /* If we are at the start of a conditional assertion group, *both* the
7532: conditional assertion *and* what follows the condition must satisfy the test
7533: for start of line. Other kinds of condition fail. Note that there may be an
7534: auto-callout at the start of a condition. */
7535:
7536: if (op == OP_COND)
7537: {
7538: scode += 1 + LINK_SIZE;
1.1.1.2 ! misho 7539: if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
1.1 misho 7540: switch (*scode)
7541: {
7542: case OP_CREF:
7543: case OP_NCREF:
7544: case OP_RREF:
7545: case OP_NRREF:
7546: case OP_DEF:
7547: return FALSE;
7548:
7549: default: /* Assertion */
1.1.1.2 ! misho 7550: if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
1.1 misho 7551: do scode += GET(scode, 1); while (*scode == OP_ALT);
7552: scode += 1 + LINK_SIZE;
7553: break;
7554: }
1.1.1.2 ! misho 7555: scode = first_significant_code(scode, FALSE);
1.1 misho 7556: op = *scode;
7557: }
7558:
7559: /* Non-capturing brackets */
7560:
1.1.1.2 ! misho 7561: if (op == OP_BRA || op == OP_BRAPOS ||
! 7562: op == OP_SBRA || op == OP_SBRAPOS)
1.1 misho 7563: {
1.1.1.2 ! misho 7564: if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
1.1 misho 7565: }
7566:
7567: /* Capturing brackets */
7568:
1.1.1.2 ! misho 7569: else if (op == OP_CBRA || op == OP_CBRAPOS ||
! 7570: op == OP_SCBRA || op == OP_SCBRAPOS)
1.1 misho 7571: {
7572: int n = GET2(scode, 1+LINK_SIZE);
7573: int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
1.1.1.2 ! misho 7574: if (!is_startline(scode, new_map, cd, atomcount)) return FALSE;
! 7575: }
! 7576:
! 7577: /* Positive forward assertions */
! 7578:
! 7579: else if (op == OP_ASSERT)
! 7580: {
! 7581: if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
1.1 misho 7582: }
7583:
1.1.1.2 ! misho 7584: /* Atomic brackets */
1.1 misho 7585:
1.1.1.2 ! misho 7586: else if (op == OP_ONCE || op == OP_ONCE_NC)
1.1 misho 7587: {
1.1.1.2 ! misho 7588: if (!is_startline(scode, bracket_map, cd, atomcount + 1)) return FALSE;
1.1 misho 7589: }
7590:
1.1.1.2 ! misho 7591: /* .* means "start at start or after \n" if it isn't in atomic brackets or
! 7592: brackets that may be referenced, as long as the pattern does not contain
! 7593: *PRUNE or *SKIP, because these break the feature. Consider, for example,
! 7594: /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
! 7595: start of a line. */
1.1 misho 7596:
7597: else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
7598: {
1.1.1.2 ! misho 7599: if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
! 7600: atomcount > 0 || cd->had_pruneorskip)
! 7601: return FALSE;
1.1 misho 7602: }
7603:
1.1.1.2 ! misho 7604: /* Check for explicit circumflex; anything else gives a FALSE result. Note
! 7605: in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC
! 7606: because the number of characters matched by .* cannot be adjusted inside
! 7607: them. */
1.1 misho 7608:
1.1.1.2 ! misho 7609: else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
1.1 misho 7610:
7611: /* Move on to the next alternative */
7612:
7613: code += GET(code, 1);
7614: }
7615: while (*code == OP_ALT); /* Loop for each alternative */
7616: return TRUE;
7617: }
7618:
7619:
7620:
7621: /*************************************************
7622: * Check for asserted fixed first char *
7623: *************************************************/
7624:
7625: /* During compilation, the "first char" settings from forward assertions are
7626: discarded, because they can cause conflicts with actual literals that follow.
7627: However, if we end up without a first char setting for an unanchored pattern,
7628: it is worth scanning the regex to see if there is an initial asserted first
7629: char. If all branches start with the same asserted char, or with a bracket all
7630: of whose alternatives start with the same asserted char (recurse ad lib), then
7631: we return that char, otherwise -1.
7632:
7633: Arguments:
7634: code points to start of expression (the bracket)
1.1.1.2 ! misho 7635: flags points to the first char flags, or to REQ_NONE
1.1 misho 7636: inassert TRUE if in an assertion
7637:
1.1.1.2 ! misho 7638: Returns: the fixed first char, or 0 with REQ_NONE in flags
1.1 misho 7639: */
7640:
1.1.1.2 ! misho 7641: static pcre_uint32
! 7642: find_firstassertedchar(const pcre_uchar *code, pcre_int32 *flags,
! 7643: BOOL inassert)
1.1 misho 7644: {
1.1.1.2 ! misho 7645: register pcre_uint32 c = 0;
! 7646: int cflags = REQ_NONE;
! 7647:
! 7648: *flags = REQ_NONE;
1.1 misho 7649: do {
1.1.1.2 ! misho 7650: pcre_uint32 d;
! 7651: int dflags;
! 7652: int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
! 7653: *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
! 7654: const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl,
! 7655: TRUE);
! 7656: register pcre_uchar op = *scode;
1.1 misho 7657:
7658: switch(op)
7659: {
7660: default:
1.1.1.2 ! misho 7661: return 0;
1.1 misho 7662:
7663: case OP_BRA:
1.1.1.2 ! misho 7664: case OP_BRAPOS:
1.1 misho 7665: case OP_CBRA:
1.1.1.2 ! misho 7666: case OP_SCBRA:
! 7667: case OP_CBRAPOS:
! 7668: case OP_SCBRAPOS:
1.1 misho 7669: case OP_ASSERT:
7670: case OP_ONCE:
1.1.1.2 ! misho 7671: case OP_ONCE_NC:
1.1 misho 7672: case OP_COND:
1.1.1.2 ! misho 7673: d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);
! 7674: if (dflags < 0)
! 7675: return 0;
! 7676: if (cflags < 0) { c = d; cflags = dflags; } else if (c != d || cflags != dflags) return 0;
1.1 misho 7677: break;
7678:
1.1.1.2 ! misho 7679: case OP_EXACT:
! 7680: scode += IMM2_SIZE;
! 7681: /* Fall through */
1.1 misho 7682:
7683: case OP_CHAR:
7684: case OP_PLUS:
7685: case OP_MINPLUS:
7686: case OP_POSPLUS:
1.1.1.2 ! misho 7687: if (!inassert) return 0;
! 7688: if (cflags < 0) { c = scode[1]; cflags = 0; }
! 7689: else if (c != scode[1]) return 0;
! 7690: break;
! 7691:
! 7692: case OP_EXACTI:
! 7693: scode += IMM2_SIZE;
! 7694: /* Fall through */
! 7695:
! 7696: case OP_CHARI:
! 7697: case OP_PLUSI:
! 7698: case OP_MINPLUSI:
! 7699: case OP_POSPLUSI:
! 7700: if (!inassert) return 0;
! 7701: if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
! 7702: else if (c != scode[1]) return 0;
1.1 misho 7703: break;
7704: }
7705:
7706: code += GET(code, 1);
7707: }
7708: while (*code == OP_ALT);
1.1.1.2 ! misho 7709:
! 7710: *flags = cflags;
1.1 misho 7711: return c;
7712: }
7713:
7714:
7715:
7716: /*************************************************
7717: * Compile a Regular Expression *
7718: *************************************************/
7719:
7720: /* This function takes a string and returns a pointer to a block of store
7721: holding a compiled version of the expression. The original API for this
7722: function had no error code return variable; it is retained for backwards
7723: compatibility. The new function is given a new name.
7724:
7725: Arguments:
7726: pattern the regular expression
7727: options various option bits
7728: errorcodeptr pointer to error code variable (pcre_compile2() only)
7729: can be NULL if you don't want a code value
7730: errorptr pointer to pointer to error text
7731: erroroffset ptr offset in pattern where error was detected
7732: tables pointer to character tables or NULL
7733:
7734: Returns: pointer to compiled data block, or NULL on error,
7735: with errorptr and erroroffset set
7736: */
7737:
/* The function name and the pattern type are selected by whichever of
COMPILE_PCRE8, COMPILE_PCRE16 or COMPILE_PCRE32 is defined. There is no #else
branch, so if none of them is defined no function header is produced at all. */
1.1.1.2 ! misho 7738: #if defined COMPILE_PCRE8
1.1 misho 7739: PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
7740: pcre_compile(const char *pattern, int options, const char **errorptr,
7741: int *erroroffset, const unsigned char *tables)
1.1.1.2 ! misho 7742: #elif defined COMPILE_PCRE16
! 7743: PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
! 7744: pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
! 7745: int *erroroffset, const unsigned char *tables)
! 7746: #elif defined COMPILE_PCRE32
! 7747: PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
! 7748: pcre32_compile(PCRE_SPTR32 pattern, int options, const char **errorptr,
! 7749: int *erroroffset, const unsigned char *tables)
! 7750: #endif
1.1 misho 7751: {
/* Thin wrapper: forward every argument to the matching *_compile2() function,
passing NULL for errorcodeptr so that no numeric error code is requested. */
1.1.1.2 ! misho 7752: #if defined COMPILE_PCRE8
1.1 misho 7753: return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
1.1.1.2 ! misho 7754: #elif defined COMPILE_PCRE16
! 7755: return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
! 7756: #elif defined COMPILE_PCRE32
! 7757: return pcre32_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
! 7758: #endif
1.1 misho 7759: }
7760:
7761:
1.1.1.2 ! misho 7762: #if defined COMPILE_PCRE8
1.1 misho 7763: PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
7764: pcre_compile2(const char *pattern, int options, int *errorcodeptr,
7765: const char **errorptr, int *erroroffset, const unsigned char *tables)
1.1.1.2 ! misho 7766: #elif defined COMPILE_PCRE16
! 7767: PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
! 7768: pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
! 7769: const char **errorptr, int *erroroffset, const unsigned char *tables)
! 7770: #elif defined COMPILE_PCRE32
! 7771: PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
! 7772: pcre32_compile2(PCRE_SPTR32 pattern, int options, int *errorcodeptr,
! 7773: const char **errorptr, int *erroroffset, const unsigned char *tables)
! 7774: #endif
1.1 misho 7775: {
1.1.1.2 ! misho 7776: REAL_PCRE *re;
1.1 misho 7777: int length = 1; /* For final END opcode */
1.1.1.2 ! misho 7778: pcre_uint32 firstchar, reqchar;
! 7779: pcre_int32 firstcharflags, reqcharflags;
! 7780: int newline;
1.1 misho 7781: int errorcode = 0;
7782: int skipatstart = 0;
1.1.1.2 ! misho 7783: BOOL utf;
1.1 misho 7784: size_t size;
1.1.1.2 ! misho 7785: pcre_uchar *code;
! 7786: const pcre_uchar *codestart;
! 7787: const pcre_uchar *ptr;
1.1 misho 7788: compile_data compile_block;
7789: compile_data *cd = &compile_block;
7790:
7791: /* This space is used for "compiling" into during the first phase, when we are
7792: computing the amount of memory that is needed. Compiled items are thrown away
7793: as soon as possible, so that a fairly large buffer should be sufficient for
7794: this purpose. The same space is used in the second phase for remembering where
1.1.1.2 ! misho 7795: to fill in forward references to subpatterns. That may overflow, in which case
! 7796: new memory is obtained from malloc(). */
1.1 misho 7797:
1.1.1.2 ! misho 7798: pcre_uchar cworkspace[COMPILE_WORK_SIZE];
1.1 misho 7799:
7800: /* Set this early so that early errors get offset 0. */
7801:
1.1.1.2 ! misho 7802: ptr = (const pcre_uchar *)pattern;
1.1 misho 7803:
7804: /* We can't pass back an error message if errorptr is NULL; I guess the best we
7805: can do is just return NULL, but we can set a code value if there is a code
7806: pointer. */
7807:
7808: if (errorptr == NULL)
7809: {
7810: if (errorcodeptr != NULL) *errorcodeptr = 99;
7811: return NULL;
7812: }
7813:
7814: *errorptr = NULL;
7815: if (errorcodeptr != NULL) *errorcodeptr = ERR0;
7816:
7817: /* However, we can give a message for this error */
7818:
7819: if (erroroffset == NULL)
7820: {
7821: errorcode = ERR16;
7822: goto PCRE_EARLY_ERROR_RETURN2;
7823: }
7824:
7825: *erroroffset = 0;
7826:
7827: /* Set up pointers to the individual character tables */
7828:
1.1.1.2 ! misho 7829: if (tables == NULL) tables = PRIV(default_tables);
1.1 misho 7830: cd->lcc = tables + lcc_offset;
7831: cd->fcc = tables + fcc_offset;
7832: cd->cbits = tables + cbits_offset;
7833: cd->ctypes = tables + ctypes_offset;
7834:
7835: /* Check that all undefined public option bits are zero */
7836:
7837: if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
7838: {
7839: errorcode = ERR17;
7840: goto PCRE_EARLY_ERROR_RETURN;
7841: }
7842:
7843: /* Check for global one-time settings at the start of the pattern, and remember
7844: the offset for later. */
7845:
7846: while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
7847: ptr[skipatstart+1] == CHAR_ASTERISK)
7848: {
7849: int newnl = 0;
7850: int newbsr = 0;
7851:
1.1.1.2 ! misho 7852: /* For completeness and backward compatibility, (*UTFn) is supported in the
! 7853: relevant libraries, but (*UTF) is generic and always supported. Note that
! 7854: PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. */
! 7855:
! 7856: #ifdef COMPILE_PCRE8
! 7857: if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF8_RIGHTPAR, 5) == 0)
1.1 misho 7858: { skipatstart += 7; options |= PCRE_UTF8; continue; }
1.1.1.2 ! misho 7859: #endif
! 7860: #ifdef COMPILE_PCRE16
! 7861: if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF16_RIGHTPAR, 6) == 0)
! 7862: { skipatstart += 8; options |= PCRE_UTF16; continue; }
! 7863: #endif
! 7864: #ifdef COMPILE_PCRE32
! 7865: if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF32_RIGHTPAR, 6) == 0)
! 7866: { skipatstart += 8; options |= PCRE_UTF32; continue; }
! 7867: #endif
! 7868:
! 7869: else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 4) == 0)
! 7870: { skipatstart += 6; options |= PCRE_UTF8; continue; }
! 7871: else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
1.1 misho 7872: { skipatstart += 6; options |= PCRE_UCP; continue; }
1.1.1.2 ! misho 7873: else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
1.1 misho 7874: { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
7875:
1.1.1.2 ! misho 7876: if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0)
1.1 misho 7877: { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
1.1.1.2 ! misho 7878: else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3) == 0)
1.1 misho 7879: { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
1.1.1.2 ! misho 7880: else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5) == 0)
1.1 misho 7881: { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
1.1.1.2 ! misho 7882: else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0)
1.1 misho 7883: { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
1.1.1.2 ! misho 7884: else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0)
1.1 misho 7885: { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
7886:
1.1.1.2 ! misho 7887: else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
1.1 misho 7888: { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
1.1.1.2 ! misho 7889: else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
1.1 misho 7890: { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
7891:
7892: if (newnl != 0)
7893: options = (options & ~PCRE_NEWLINE_BITS) | newnl;
7894: else if (newbsr != 0)
7895: options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
7896: else break;
7897: }
7898:
1.1.1.2 ! misho 7899: /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
! 7900: utf = (options & PCRE_UTF8) != 0;
1.1 misho 7901:
1.1.1.2 ! misho 7902: /* Can't support UTF unless PCRE has been compiled to include the code. The
! 7903: return of an error code from PRIV(valid_utf)() is a new feature, introduced in
! 7904: release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
! 7905: not used here. */
! 7906:
! 7907: #ifdef SUPPORT_UTF
! 7908: if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
! 7909: (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
1.1 misho 7910: {
1.1.1.2 ! misho 7911: #if defined COMPILE_PCRE8
1.1 misho 7912: errorcode = ERR44;
1.1.1.2 ! misho 7913: #elif defined COMPILE_PCRE16
! 7914: errorcode = ERR74;
! 7915: #elif defined COMPILE_PCRE32
! 7916: errorcode = ERR77;
! 7917: #endif
1.1 misho 7918: goto PCRE_EARLY_ERROR_RETURN2;
7919: }
7920: #else
1.1.1.2 ! misho 7921: if (utf)
1.1 misho 7922: {
7923: errorcode = ERR32;
7924: goto PCRE_EARLY_ERROR_RETURN;
7925: }
7926: #endif
7927:
7928: /* Can't support UCP unless PCRE has been compiled to include the code. */
7929:
7930: #ifndef SUPPORT_UCP
7931: if ((options & PCRE_UCP) != 0)
7932: {
7933: errorcode = ERR67;
7934: goto PCRE_EARLY_ERROR_RETURN;
7935: }
7936: #endif
7937:
7938: /* Check validity of \R options. */
7939:
1.1.1.2 ! misho 7940: if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
! 7941: (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
1.1 misho 7942: {
1.1.1.2 ! misho 7943: errorcode = ERR56;
! 7944: goto PCRE_EARLY_ERROR_RETURN;
1.1 misho 7945: }
7946:
7947: /* Handle different types of newline. The three bits give seven cases. The
7948: current code allows for fixed one- or two-byte sequences, plus "any" and
7949: "anycrlf". */
7950:
7951: switch (options & PCRE_NEWLINE_BITS)
7952: {
7953: case 0: newline = NEWLINE; break; /* Build-time default */
7954: case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
7955: case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
7956: case PCRE_NEWLINE_CR+
7957: PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
7958: case PCRE_NEWLINE_ANY: newline = -1; break;
7959: case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
7960: default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
7961: }
7962:
7963: if (newline == -2)
7964: {
7965: cd->nltype = NLTYPE_ANYCRLF;
7966: }
7967: else if (newline < 0)
7968: {
7969: cd->nltype = NLTYPE_ANY;
7970: }
7971: else
7972: {
7973: cd->nltype = NLTYPE_FIXED;
7974: if (newline > 255)
7975: {
7976: cd->nllen = 2;
7977: cd->nl[0] = (newline >> 8) & 255;
7978: cd->nl[1] = newline & 255;
7979: }
7980: else
7981: {
7982: cd->nllen = 1;
7983: cd->nl[0] = newline;
7984: }
7985: }
7986:
7987: /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
7988: references to help in deciding whether (.*) can be treated as anchored or not.
7989: */
7990:
7991: cd->top_backref = 0;
7992: cd->backref_map = 0;
7993:
7994: /* Reflect pattern for debugging output */
7995:
7996: DPRINTF(("------------------------------------------------------------------\n"));
1.1.1.2 ! misho 7997: #ifdef PCRE_DEBUG
! 7998: print_puchar(stdout, (PCRE_PUCHAR)pattern);
! 7999: #endif
! 8000: DPRINTF(("\n"));
1.1 misho 8001:
8002: /* Pretend to compile the pattern while actually just accumulating the length
8003: of memory required. This behaviour is triggered by passing a non-NULL final
8004: argument to compile_regex(). We pass a block of workspace (cworkspace) for it
8005: to compile parts of the pattern into; the compiled code is discarded when it is
8006: no longer needed, so hopefully this workspace will never overflow, though there
8007: is a test for its doing so. */
8008:
8009: cd->bracount = cd->final_bracount = 0;
8010: cd->names_found = 0;
8011: cd->name_entry_size = 0;
8012: cd->name_table = NULL;
8013: cd->start_code = cworkspace;
8014: cd->hwm = cworkspace;
1.1.1.2 ! misho 8015: cd->start_workspace = cworkspace;
! 8016: cd->workspace_size = COMPILE_WORK_SIZE;
! 8017: cd->start_pattern = (const pcre_uchar *)pattern;
! 8018: cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
1.1 misho 8019: cd->req_varyopt = 0;
1.1.1.2 ! misho 8020: cd->assert_depth = 0;
! 8021: cd->max_lookbehind = 0;
1.1 misho 8022: cd->external_options = options;
8023: cd->external_flags = 0;
8024: cd->open_caps = NULL;
8025:
8026: /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
8027: don't need to look at the result of the function here. The initial options have
8028: been put into the cd block so that they can be changed if an option setting is
8029: found within the regex right at the beginning. Bringing initial option settings
8030: outside can help speed up starting point checks. */
8031:
8032: ptr += skipatstart;
8033: code = cworkspace;
8034: *code = OP_BRA;
1.1.1.2 ! misho 8035: (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
! 8036: FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,
! 8037: cd, &length);
1.1 misho 8038: if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
8039:
8040: DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
1.1.1.2 ! misho 8041: (int)(cd->hwm - cworkspace)));
1.1 misho 8042:
8043: if (length > MAX_PATTERN_SIZE)
8044: {
8045: errorcode = ERR20;
8046: goto PCRE_EARLY_ERROR_RETURN;
8047: }
8048:
8049: /* Compute the size of data block needed and get it, either from malloc or
8050: externally provided function. Integer overflow should no longer be possible
8051: because nowadays we limit the maximum value of cd->names_found and
8052: cd->name_entry_size. */
8053:
1.1.1.2 ! misho 8054: size = sizeof(REAL_PCRE) + (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
! 8055: re = (REAL_PCRE *)(PUBL(malloc))(size);
1.1 misho 8056:
8057: if (re == NULL)
8058: {
8059: errorcode = ERR21;
8060: goto PCRE_EARLY_ERROR_RETURN;
8061: }
8062:
8063: /* Put in the magic number, and save the sizes, initial options, internal
8064: flags, and character table pointer. NULL is used for the default character
8065: tables. The nullpad field is at the end; it's there to help in the case when a
8066: regex compiled on a system with 4-byte pointers is run on another with 8-byte
8067: pointers. */
8068:
8069: re->magic_number = MAGIC_NUMBER;
8070: re->size = (int)size;
8071: re->options = cd->external_options;
8072: re->flags = cd->external_flags;
1.1.1.2 ! misho 8073: re->first_char = 0;
! 8074: re->req_char = 0;
! 8075: re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
1.1 misho 8076: re->name_entry_size = cd->name_entry_size;
8077: re->name_count = cd->names_found;
8078: re->ref_count = 0;
1.1.1.2 ! misho 8079: re->tables = (tables == PRIV(default_tables))? NULL : tables;
1.1 misho 8080: re->nullpad = NULL;
1.1.1.2 ! misho 8081: #ifdef COMPILE_PCRE32
! 8082: re->dummy1 = re->dummy2 = 0;
! 8083: #endif
1.1 misho 8084:
8085: /* The starting points of the name/number translation table and of the code are
8086: passed around in the compile data block. The start/end pattern and initial
8087: options are already set from the pre-compile phase, as is the name_entry_size
8088: field. Reset the bracket count and the names_found field. Also reset the hwm
8089: field; this time it's used for remembering forward references to subpatterns.
8090: */
8091:
8092: cd->final_bracount = cd->bracount; /* Save for checking forward references */
1.1.1.2 ! misho 8093: cd->assert_depth = 0;
1.1 misho 8094: cd->bracount = 0;
1.1.1.2 ! misho 8095: cd->max_lookbehind = 0;
1.1 misho 8096: cd->names_found = 0;
1.1.1.2 ! misho 8097: cd->name_table = (pcre_uchar *)re + re->name_table_offset;
1.1 misho 8098: codestart = cd->name_table + re->name_entry_size * re->name_count;
8099: cd->start_code = codestart;
1.1.1.2 ! misho 8100: cd->hwm = (pcre_uchar *)(cd->start_workspace);
1.1 misho 8101: cd->req_varyopt = 0;
8102: cd->had_accept = FALSE;
1.1.1.2 ! misho 8103: cd->had_pruneorskip = FALSE;
1.1 misho 8104: cd->check_lookbehind = FALSE;
8105: cd->open_caps = NULL;
8106:
8107: /* Set up a starting, non-extracting bracket, then compile the expression. On
8108: error, errorcode will be set non-zero, so we don't need to look at the result
8109: of the function here. */
8110:
1.1.1.2 ! misho 8111: ptr = (const pcre_uchar *)pattern + skipatstart;
! 8112: code = (pcre_uchar *)codestart;
1.1 misho 8113: *code = OP_BRA;
1.1.1.2 ! misho 8114: (void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
! 8115: &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL, cd, NULL);
1.1 misho 8116: re->top_bracket = cd->bracount;
8117: re->top_backref = cd->top_backref;
1.1.1.2 ! misho 8118: re->max_lookbehind = cd->max_lookbehind;
! 8119: re->flags = cd->external_flags | PCRE_MODE;
1.1 misho 8120:
1.1.1.2 ! misho 8121: if (cd->had_accept)
! 8122: {
! 8123: reqchar = 0; /* Must disable after (*ACCEPT) */
! 8124: reqcharflags = REQ_NONE;
! 8125: }
1.1 misho 8126:
8127: /* If not reached end of pattern on success, there's an excess bracket. */
8128:
1.1.1.2 ! misho 8129: if (errorcode == 0 && *ptr != CHAR_NULL) errorcode = ERR22;
1.1 misho 8130:
8131: /* Fill in the terminating state and check for disastrous overflow, but
8132: if debugging, leave the test till after things are printed out. */
8133:
8134: *code++ = OP_END;
8135:
8136: #ifndef PCRE_DEBUG
8137: if (code - codestart > length) errorcode = ERR23;
8138: #endif
8139:
1.1.1.2 ! misho 8140: #ifdef SUPPORT_VALGRIND
! 8141: /* If the estimated length exceeds the really used length, mark the extra
! 8142: allocated memory as unaddressable, so that any out-of-bound reads can be
! 8143: detected. */
! 8144: VALGRIND_MAKE_MEM_NOACCESS(code, (length - (code - codestart)) * sizeof(pcre_uchar));
! 8145: #endif
! 8146:
! 8147: /* Fill in any forward references that are required. There may be repeated
! 8148: references; optimize for them, as searching a large regex takes time. */
1.1 misho 8149:
1.1.1.2 ! misho 8150: if (cd->hwm > cd->start_workspace)
1.1 misho 8151: {
1.1.1.2 ! misho 8152: int prev_recno = -1;
! 8153: const pcre_uchar *groupptr = NULL;
! 8154: while (errorcode == 0 && cd->hwm > cd->start_workspace)
! 8155: {
! 8156: int offset, recno;
! 8157: cd->hwm -= LINK_SIZE;
! 8158: offset = GET(cd->hwm, 0);
! 8159: recno = GET(codestart, offset);
! 8160: if (recno != prev_recno)
! 8161: {
! 8162: groupptr = PRIV(find_bracket)(codestart, utf, recno);
! 8163: prev_recno = recno;
! 8164: }
! 8165: if (groupptr == NULL) errorcode = ERR53;
! 8166: else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
! 8167: }
1.1 misho 8168: }
8169:
1.1.1.2 ! misho 8170: /* If the workspace had to be expanded, free the new memory. */
! 8171:
! 8172: if (cd->workspace_size > COMPILE_WORK_SIZE)
! 8173: (PUBL(free))((void *)cd->start_workspace);
! 8174:
1.1 misho 8175: /* Give an error if there's back reference to a non-existent capturing
8176: subpattern. */
8177:
8178: if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
8179:
8180: /* If there were any lookbehind assertions that contained OP_RECURSE
8181: (recursions or subroutine calls), a flag is set for them to be checked here,
1.1.1.2 ! misho 8182: because they may contain forward references. Actual recursions cannot be fixed
1.1 misho 8183: length, but subroutine calls can. It is done like this so that those without
8184: OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
8185: exceptional ones forgo this. We scan the pattern to check that they are fixed
8186: length, and set their lengths. */
8187:
8188: if (cd->check_lookbehind)
8189: {
1.1.1.2 ! misho 8190: pcre_uchar *cc = (pcre_uchar *)codestart;
1.1 misho 8191:
8192: /* Loop, searching for OP_REVERSE items, and process those that do not have
8193: their length set. (Actually, it will also re-process any that have a length
8194: of zero, but that is a pathological case, and it does no harm.) When we find
8195: one, we temporarily terminate the branch it is in while we scan it. */
8196:
1.1.1.2 ! misho 8197: for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
1.1 misho 8198: cc != NULL;
1.1.1.2 ! misho 8199: cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
1.1 misho 8200: {
8201: if (GET(cc, 1) == 0)
8202: {
8203: int fixed_length;
1.1.1.2 ! misho 8204: pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
1.1 misho 8205: int end_op = *be;
8206: *be = OP_END;
1.1.1.2 ! misho 8207: fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
! 8208: cd);
1.1 misho 8209: *be = end_op;
8210: DPRINTF(("fixed length = %d\n", fixed_length));
8211: if (fixed_length < 0)
8212: {
1.1.1.2 ! misho 8213: errorcode = (fixed_length == -2)? ERR36 :
! 8214: (fixed_length == -4)? ERR70 : ERR25;
1.1 misho 8215: break;
8216: }
1.1.1.2 ! misho 8217: if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
1.1 misho 8218: PUT(cc, 1, fixed_length);
8219: }
8220: cc += 1 + LINK_SIZE;
8221: }
8222: }
8223:
8224: /* Failed to compile, or error while post-processing */
8225:
8226: if (errorcode != 0)
8227: {
1.1.1.2 ! misho 8228: (PUBL(free))(re);
1.1 misho 8229: PCRE_EARLY_ERROR_RETURN:
1.1.1.2 ! misho 8230: *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
1.1 misho 8231: PCRE_EARLY_ERROR_RETURN2:
8232: *errorptr = find_error_text(errorcode);
8233: if (errorcodeptr != NULL) *errorcodeptr = errorcode;
8234: return NULL;
8235: }
8236:
8237: /* If the anchored option was not passed, set the flag if we can determine that
1.1.1.2 ! misho 8238: the pattern is anchored by virtue of ^ characters or \A or anything else, such
! 8239: as starting with non-atomic .* when DOTALL is set and there are no occurrences
! 8240: of *PRUNE or *SKIP.
1.1 misho 8241:
8242: Otherwise, if we know what the first byte has to be, save it, because that
8243: speeds up unanchored matches no end. If not, see if we can set the
8244: PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
1.1.1.2 ! misho 8245: start with ^, and also when all branches start with non-atomic .* for
! 8246: non-DOTALL matches when *PRUNE and *SKIP are not present. */
1.1 misho 8247:
8248: if ((re->options & PCRE_ANCHORED) == 0)
8249: {
1.1.1.2 ! misho 8250: if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED;
1.1 misho 8251: else
8252: {
1.1.1.2 ! misho 8253: if (firstcharflags < 0)
! 8254: firstchar = find_firstassertedchar(codestart, &firstcharflags, FALSE);
! 8255: if (firstcharflags >= 0) /* Remove caseless flag for non-caseable chars */
! 8256: {
! 8257: #if defined COMPILE_PCRE8
! 8258: re->first_char = firstchar & 0xff;
! 8259: #elif defined COMPILE_PCRE16
! 8260: re->first_char = firstchar & 0xffff;
! 8261: #elif defined COMPILE_PCRE32
! 8262: re->first_char = firstchar;
! 8263: #endif
! 8264: if ((firstcharflags & REQ_CASELESS) != 0)
! 8265: {
! 8266: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
! 8267: /* We ignore non-ASCII first chars in 8 bit mode. */
! 8268: if (utf)
! 8269: {
! 8270: if (re->first_char < 128)
! 8271: {
! 8272: if (cd->fcc[re->first_char] != re->first_char)
! 8273: re->flags |= PCRE_FCH_CASELESS;
! 8274: }
! 8275: else if (UCD_OTHERCASE(re->first_char) != re->first_char)
! 8276: re->flags |= PCRE_FCH_CASELESS;
! 8277: }
! 8278: else
! 8279: #endif
! 8280: if (MAX_255(re->first_char)
! 8281: && cd->fcc[re->first_char] != re->first_char)
! 8282: re->flags |= PCRE_FCH_CASELESS;
! 8283: }
! 8284:
1.1 misho 8285: re->flags |= PCRE_FIRSTSET;
8286: }
1.1.1.2 ! misho 8287:
! 8288: else if (is_startline(codestart, 0, cd, 0)) re->flags |= PCRE_STARTLINE;
1.1 misho 8289: }
8290: }
8291:
8292: /* For an anchored pattern, we use the "required byte" only if it follows a
8293: variable length item in the regex. Remove the caseless flag for non-caseable
8294: bytes. */
8295:
1.1.1.2 ! misho 8296: if (reqcharflags >= 0 &&
! 8297: ((re->options & PCRE_ANCHORED) == 0 || (reqcharflags & REQ_VARY) != 0))
1.1 misho 8298: {
1.1.1.2 ! misho 8299: #if defined COMPILE_PCRE8
! 8300: re->req_char = reqchar & 0xff;
! 8301: #elif defined COMPILE_PCRE16
! 8302: re->req_char = reqchar & 0xffff;
! 8303: #elif defined COMPILE_PCRE32
! 8304: re->req_char = reqchar;
! 8305: #endif
! 8306: if ((reqcharflags & REQ_CASELESS) != 0)
! 8307: {
! 8308: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
! 8309: /* We ignore non-ASCII required chars in 8 bit mode. */
! 8310: if (utf)
! 8311: {
! 8312: if (re->req_char < 128)
! 8313: {
! 8314: if (cd->fcc[re->req_char] != re->req_char)
! 8315: re->flags |= PCRE_RCH_CASELESS;
! 8316: }
! 8317: else if (UCD_OTHERCASE(re->req_char) != re->req_char)
! 8318: re->flags |= PCRE_RCH_CASELESS;
! 8319: }
! 8320: else
! 8321: #endif
! 8322: if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
! 8323: re->flags |= PCRE_RCH_CASELESS;
! 8324: }
! 8325:
1.1 misho 8326: re->flags |= PCRE_REQCHSET;
8327: }
8328:
8329: /* Print out the compiled data if debugging is enabled. This is never the
8330: case when building a production library. */
8331:
8332: #ifdef PCRE_DEBUG
8333: printf("Length = %d top_bracket = %d top_backref = %d\n",
8334: length, re->top_bracket, re->top_backref);
8335:
8336: printf("Options=%08x\n", re->options);
8337:
8338: if ((re->flags & PCRE_FIRSTSET) != 0)
8339: {
1.1.1.2 ! misho 8340: pcre_uchar ch = re->first_char;
! 8341: const char *caseless =
! 8342: ((re->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)";
! 8343: if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless);
1.1 misho 8344: else printf("First char = \\x%02x%s\n", ch, caseless);
8345: }
8346:
8347: if ((re->flags & PCRE_REQCHSET) != 0)
8348: {
1.1.1.2 ! misho 8349: pcre_uchar ch = re->req_char;
! 8350: const char *caseless =
! 8351: ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)";
! 8352: if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless);
1.1 misho 8353: else printf("Req char = \\x%02x%s\n", ch, caseless);
8354: }
8355:
1.1.1.2 ! misho 8356: #if defined COMPILE_PCRE8
! 8357: pcre_printint((pcre *)re, stdout, TRUE);
! 8358: #elif defined COMPILE_PCRE16
! 8359: pcre16_printint((pcre *)re, stdout, TRUE);
! 8360: #elif defined COMPILE_PCRE32
! 8361: pcre32_printint((pcre *)re, stdout, TRUE);
! 8362: #endif
1.1 misho 8363:
8364: /* This check is done here in the debugging case so that the code that
8365: was compiled can be seen. */
8366:
8367: if (code - codestart > length)
8368: {
1.1.1.2 ! misho 8369: (PUBL(free))(re);
1.1 misho 8370: *errorptr = find_error_text(ERR23);
1.1.1.2 ! misho 8371: *erroroffset = ptr - (pcre_uchar *)pattern;
1.1 misho 8372: if (errorcodeptr != NULL) *errorcodeptr = ERR23;
8373: return NULL;
8374: }
8375: #endif /* PCRE_DEBUG */
8376:
1.1.1.2 ! misho 8377: #if defined COMPILE_PCRE8
1.1 misho 8378: return (pcre *)re;
1.1.1.2 ! misho 8379: #elif defined COMPILE_PCRE16
! 8380: return (pcre16 *)re;
! 8381: #elif defined COMPILE_PCRE32
! 8382: return (pcre32 *)re;
! 8383: #endif
1.1 misho 8384: }
8385:
8386: /* End of pcre_compile.c */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>