Annotation of embedaddon/pcre/pcre_dfa_exec.c, revision 1.1.1.5
1.1 misho 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /* PCRE is a library of functions to support regular expressions whose syntax
6: and semantics are as close as possible to those of the Perl 5 language (but see
7: below for why this module is different).
8:
9: Written by Philip Hazel
1.1.1.4 misho 10: Copyright (c) 1997-2013 University of Cambridge
1.1 misho 11:
12: -----------------------------------------------------------------------------
13: Redistribution and use in source and binary forms, with or without
14: modification, are permitted provided that the following conditions are met:
15:
16: * Redistributions of source code must retain the above copyright notice,
17: this list of conditions and the following disclaimer.
18:
19: * Redistributions in binary form must reproduce the above copyright
20: notice, this list of conditions and the following disclaimer in the
21: documentation and/or other materials provided with the distribution.
22:
23: * Neither the name of the University of Cambridge nor the names of its
24: contributors may be used to endorse or promote products derived from
25: this software without specific prior written permission.
26:
27: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37: POSSIBILITY OF SUCH DAMAGE.
38: -----------------------------------------------------------------------------
39: */
40:
41: /* This module contains the external function pcre_dfa_exec(), which is an
42: alternative matching function that uses a sort of DFA algorithm (not a true
1.1.1.3 misho 43: FSM). This is NOT Perl-compatible, but it has advantages in certain
1.1 misho 44: applications. */
45:
46:
47: /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
48: the performance of his patterns greatly. I could not use it as it stood, as it
49: was not thread safe, and made assumptions about pattern sizes. Also, it caused
50: test 7 to loop, and test 9 to crash with a segfault.
51:
52: The issue is the check for duplicate states, which is done by a simple linear
53: search up the state list. (Grep for "duplicate" below to find the code.) For
54: many patterns, there will never be many states active at one time, so a simple
55: linear search is fine. In patterns that have many active states, it might be a
56: bottleneck. The suggested code used an indexing scheme to remember which states
57: had previously been used for each character, and avoided the linear search when
58: it knew there was no chance of a duplicate. This was implemented when adding
59: states to the state lists.
60:
61: I wrote some thread-safe, not-limited code to try something similar at the time
62: of checking for duplicates (instead of when adding states), using index vectors
63: on the stack. It did give a 13% improvement with one specially constructed
64: pattern for certain subject strings, but on other strings and on many of the
65: simpler patterns in the test suite it did worse. The major problem, I think,
66: was the extra time to initialize the index. This had to be done for each call
67: of internal_dfa_exec(). (The supplied patch used a static vector, initialized
68: only once - I suspect this was the cause of the problems with the tests.)
69:
70: Overall, I concluded that the gains in some cases did not outweigh the losses
71: in others, so I abandoned this code. */
72:
73:
74:
75: #ifdef HAVE_CONFIG_H
76: #include "config.h"
77: #endif
78:
79: #define NLBLOCK md /* Block containing newline information */
80: #define PSSTART start_subject /* Field containing processed string start */
81: #define PSEND end_subject /* Field containing processed string end */
82:
83: #include "pcre_internal.h"
84:
85:
86: /* For use to indent debugging output */
87:
88: #define SP " "
89:
90:
91: /*************************************************
92: * Code parameters and static tables *
93: *************************************************/
94:
95: /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
96: into others, under special conditions. A gap of 20 between the blocks should be
97: enough. The resulting opcodes don't have to be less than 256 because they are
98: never stored, so we push them well clear of the normal opcodes. */
99:
100: #define OP_PROP_EXTRA 300
101: #define OP_EXTUNI_EXTRA 320
102: #define OP_ANYNL_EXTRA 340
103: #define OP_HSPACE_EXTRA 360
104: #define OP_VSPACE_EXTRA 380
105:
106:
107: /* This table identifies those opcodes that are followed immediately by a
108: character that is to be tested in some way. This makes it possible to
109: centralize the loading of these characters. In the case of Type * etc, the
110: "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111: small value. Non-zero values in the table are the offsets from the opcode where
112: the character is to be found. ***NOTE*** If the start of this table is
113: modified, the three tables that follow must also be modified. */
114:
/* coptable is indexed by opcode value. A zero entry means the opcode is not
followed by an inline character that needs preloading. A non-zero entry is the
offset, counted from the opcode itself, at which that character (or, for the
"Type" repeats, the small type opcode) is stored; internal_dfa_exec() uses it
as code[coptable[codevalue]] to load the value before dispatching on the
opcode. The upto/minupto/exact/upto+ forms use 1+IMM2_SIZE instead of 1
(presumably to step over an IMM2_SIZE-wide repeat count that precedes the
character -- confirm against the compiler). The number of entries must equal
OP_TABLE_LENGTH; this is enforced by the dummy case labels at the top of the
opcode switch in internal_dfa_exec(), so every new opcode needs an entry here
in the matching position. */
1.1.1.2 misho 115: static const pcre_uint8 coptable[] = {
1.1 misho 116: 0, /* End */
117: 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
118: 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
119: 0, 0, 0, /* Any, AllAny, Anybyte */
120: 0, 0, /* \P, \p */
121: 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
122: 0, /* \X */
1.1.1.5 ! misho 123: 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
1.1 misho 124: 1, /* Char */
125: 1, /* Chari */
126: 1, /* not */
127: 1, /* noti */
128: /* Positive single-char repeats */
129: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
1.1.1.2 misho 130: 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
131: 1+IMM2_SIZE, /* exact */
132: 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
1.1 misho 133: 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
1.1.1.2 misho 134: 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
135: 1+IMM2_SIZE, /* exact I */
136: 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
1.1 misho 137: /* Negative single-char repeats - only for chars < 256 */
138: 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
1.1.1.2 misho 139: 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
140: 1+IMM2_SIZE, /* NOT exact */
141: 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
1.1 misho 142: 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
1.1.1.2 misho 143: 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
144: 1+IMM2_SIZE, /* NOT exact I */
145: 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
1.1 misho 146: /* Positive type repeats */
147: 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
1.1.1.2 misho 148: 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
149: 1+IMM2_SIZE, /* Type exact */
150: 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
1.1 misho 151: /* Character class & ref repeats */
152: 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
153: 0, 0, /* CRRANGE, CRMINRANGE */
1.1.1.5 ! misho 154: 0, 0, 0, 0, /* Possessive *+, ++, ?+, CRPOSRANGE */
1.1 misho 155: 0, /* CLASS */
156: 0, /* NCLASS */
157: 0, /* XCLASS - variable length */
158: 0, /* REF */
159: 0, /* REFI */
1.1.1.5 ! misho 160: 0, /* DNREF */
! 161: 0, /* DNREFI */
1.1 misho 162: 0, /* RECURSE */
163: 0, /* CALLOUT */
164: 0, /* Alt */
165: 0, /* Ket */
166: 0, /* KetRmax */
167: 0, /* KetRmin */
168: 0, /* KetRpos */
169: 0, /* Reverse */
170: 0, /* Assert */
171: 0, /* Assert not */
172: 0, /* Assert behind */
173: 0, /* Assert behind not */
174: 0, 0, /* ONCE, ONCE_NC */
175: 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
176: 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
1.1.1.5 ! misho 177: 0, 0, /* CREF, DNCREF */
! 178: 0, 0, /* RREF, DNRREF */
1.1 misho 179: 0, /* DEF */
180: 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
181: 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
182: 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
183: 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
184: 0, 0 /* CLOSE, SKIPZERO */
185: };
186:
187: /* This table identifies those opcodes that inspect a character. It is used to
188: remember the fact that a character could have been inspected when the end of
189: the subject is reached. ***NOTE*** If the start of this table is modified, the
190: two tables that follow must also be modified. */
191:
/* poptable is indexed by opcode value, in the same order as coptable. An
entry of 1 marks an opcode that inspects a subject character; 0 marks one that
does not (anchors such as \Z, \z, $ and ^, back references, and the structural
and control opcodes). internal_dfa_exec() consults it only when the end of the
subject has been reached (clen == 0): if poptable[codevalue] is non-zero it
sets could_continue, which is later used when testing for a partial match.
The number of entries must equal OP_TABLE_LENGTH; the dummy case labels at the
top of the opcode switch in internal_dfa_exec() check this at compile time. */
1.1.1.2 misho 192: static const pcre_uint8 poptable[] = {
1.1 misho 193: 0, /* End */
194: 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
195: 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
196: 1, 1, 1, /* Any, AllAny, Anybyte */
197: 1, 1, /* \P, \p */
198: 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
199: 1, /* \X */
1.1.1.5 ! misho 200: 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
1.1 misho 201: 1, /* Char */
202: 1, /* Chari */
203: 1, /* not */
204: 1, /* noti */
205: /* Positive single-char repeats */
206: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
207: 1, 1, 1, /* upto, minupto, exact */
208: 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
209: 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
210: 1, 1, 1, /* upto I, minupto I, exact I */
211: 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
212: /* Negative single-char repeats - only for chars < 256 */
213: 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
214: 1, 1, 1, /* NOT upto, minupto, exact */
215: 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
216: 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
217: 1, 1, 1, /* NOT upto I, minupto I, exact I */
218: 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
219: /* Positive type repeats */
220: 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
221: 1, 1, 1, /* Type upto, minupto, exact */
222: 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
223: /* Character class & ref repeats */
224: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
225: 1, 1, /* CRRANGE, CRMINRANGE */
1.1.1.5 ! misho 226: 1, 1, 1, 1, /* Possessive *+, ++, ?+, CRPOSRANGE */
1.1 misho 227: 1, /* CLASS */
228: 1, /* NCLASS */
229: 1, /* XCLASS - variable length */
230: 0, /* REF */
231: 0, /* REFI */
1.1.1.5 ! misho 232: 0, /* DNREF */
! 233: 0, /* DNREFI */
1.1 misho 234: 0, /* RECURSE */
235: 0, /* CALLOUT */
236: 0, /* Alt */
237: 0, /* Ket */
238: 0, /* KetRmax */
239: 0, /* KetRmin */
240: 0, /* KetRpos */
241: 0, /* Reverse */
242: 0, /* Assert */
243: 0, /* Assert not */
244: 0, /* Assert behind */
245: 0, /* Assert behind not */
246: 0, 0, /* ONCE, ONCE_NC */
247: 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
248: 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
1.1.1.5 ! misho 249: 0, 0, /* CREF, DNCREF */
! 250: 0, 0, /* RREF, DNRREF */
1.1 misho 251: 0, /* DEF */
252: 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
253: 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
254: 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
255: 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
256: 0, 0 /* CLOSE, SKIPZERO */
257: };
258:
259: /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
260: and \w */
261:
/* toptable1 has 14 entries. Going by the opcode order documented in coptable,
slots 0-5 are End, \A, \G, \K, \B, \b (unused here, hence 0), slots 6-11 are
\D, \d, \S, \s, \W, \w, and slots 12-13 are OP_ANY and OP_ALLANY. Each of the
six type escapes carries the ctype_* bit of its character class, the same bit
for the negated and the positive form.
NOTE(review): the code that reads this table is outside this excerpt;
presumably it is used as a mask on the ctypes[] entry of the subject character,
in combination with toptable2 -- confirm at the use site. */
1.1.1.2 misho 262: static const pcre_uint8 toptable1[] = {
1.1 misho 263: 0, 0, 0, 0, 0, 0,
264: ctype_digit, ctype_digit,
265: ctype_space, ctype_space,
266: ctype_word, ctype_word,
267: 0, 0 /* OP_ANY, OP_ALLANY */
268: };
269:
/* toptable2 is laid out exactly like toptable1 (14 entries, same slot order).
For the negated escapes \D, \S, \W it repeats the ctype_* bit held in
toptable1; for the positive escapes \d, \s, \w it is 0; for OP_ANY and
OP_ALLANY it is 1 (where toptable1 is 0).
NOTE(review): the consumer is outside this excerpt; presumably the value is
XORed with the masked ctypes[] bit so that a non-zero result means "no match
for a positive escape / match for a negated one", with OP_ANY/OP_ALLANY always
non-zero -- confirm the exact sense at the use site. */
1.1.1.2 misho 270: static const pcre_uint8 toptable2[] = {
1.1 misho 271: 0, 0, 0, 0, 0, 0,
272: ctype_digit, 0,
273: ctype_space, 0,
274: ctype_word, 0,
275: 1, 1 /* OP_ANY, OP_ALLANY */
276: };
277:
278:
279: /* Structure for holding data about a particular state, which is in effect the
280: current data for an active path through the match tree. It must consist
281: entirely of ints because the working vector we are passed, and which we put
282: these structures in, is a vector of ints. */
283:
typedef struct stateblock {
  int offset;   /* Offset to opcode, relative to the start of the compiled
                   code. A negative value is the negated offset of a state
                   that is being held back until "data" more characters have
                   been skipped. */
  int count;    /* Count for repeats */
  int data;     /* Extra data used by some states only (for example the number
                   of characters still to skip for a held-back state); the
                   ADD_ACTIVE and ADD_NEW macros leave it unset. */
} stateblock;

/* Number of ints occupied by one stateblock in the int workspace vector. The
replacement list is fully parenthesized so that the macro behaves as a single
primary expression wherever it is expanded (for example as the operand of
sizeof or next to a postfix operator). */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
1.1 misho 291:
292:
293: #ifdef PCRE_DEBUG
294: /*************************************************
295: * Print character string *
296: *************************************************/
297:
298: /* Character string printing function for debugging.
299:
300: Arguments:
301: p points to string
302: length number of bytes
303: f where to print
304:
305: Returns: nothing
306: */
307:
308: static void
1.1.1.2 misho 309: pchars(const pcre_uchar *p, int length, FILE *f)
1.1 misho 310: {
1.1.1.4 misho 311: pcre_uint32 c;
1.1 misho 312: while (length-- > 0)
313: {
314: if (isprint(c = *(p++)))
315: fprintf(f, "%c", c);
316: else
1.1.1.4 misho 317: fprintf(f, "\\x{%02x}", c);
1.1 misho 318: }
319: }
320: #endif
321:
322:
323:
324: /*************************************************
325: * Execute a Regular Expression - DFA engine *
326: *************************************************/
327:
328: /* This internal function applies a compiled pattern to a subject string,
329: starting at a given point, using a DFA engine. This function is called from the
330: external one, possibly multiple times if the pattern is not anchored. The
331: function calls itself recursively for some kinds of subpattern.
332:
333: Arguments:
334: md the match_data block with fixed information
335: this_start_code the opening bracket of this subexpression's code
336: current_subject where we currently are in the subject string
337: start_offset start offset in the subject string
338: offsets vector to contain the matching string offsets
339: offsetcount size of same
340: workspace vector of workspace
341: wscount size of same
342: rlevel function call recursion level
343:
344: Returns: > 0 => number of match offset pairs placed in offsets
345: = 0 => offsets overflowed; longest matches are present
346: -1 => failed to match
347: < -1 => some kind of unexpected problem
348:
349: The following macros are used for adding states to the two state vectors (one
350: for the current character, one for the following character). */
351:
/* All four macros below expand inside internal_dfa_exec() and use its locals
directly: the counters active_count / new_count, the capacity wscount (in
stateblocks), the write cursors next_active_state / next_new_state, and rlevel
for indenting the debug trace. Each expands to an if/else statement without a
trailing semicolon, so the call site supplies the ';'. When the relevant list
is already full the macro makes the ENCLOSING FUNCTION return
PCRE_ERROR_DFA_WSSIZE; note that the counter is post-incremented in the test,
so it is bumped on that failure path as well. */

/* ADD_ACTIVE(x,y): append a state with offset x and count y to the list of
states for the current character. The data field of the slot is not written. */
352: #define ADD_ACTIVE(x,y) \
353: if (active_count++ < wscount) \
354: { \
355: next_active_state->offset = (x); \
356: next_active_state->count = (y); \
357: next_active_state++; \
358: DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
359: } \
360: else return PCRE_ERROR_DFA_WSSIZE
361:
/* ADD_ACTIVE_DATA(x,y,z): as ADD_ACTIVE, but also stores z in the data field. */
362: #define ADD_ACTIVE_DATA(x,y,z) \
363: if (active_count++ < wscount) \
364: { \
365: next_active_state->offset = (x); \
366: next_active_state->count = (y); \
367: next_active_state->data = (z); \
368: next_active_state++; \
369: DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
370: } \
371: else return PCRE_ERROR_DFA_WSSIZE
372:
/* ADD_NEW(x,y): append a state with offset x and count y to the list of
states for the following character. The data field of the slot is not
written. */
373: #define ADD_NEW(x,y) \
374: if (new_count++ < wscount) \
375: { \
376: next_new_state->offset = (x); \
377: next_new_state->count = (y); \
378: next_new_state++; \
379: DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
380: } \
381: else return PCRE_ERROR_DFA_WSSIZE
382:
/* ADD_NEW_DATA(x,y,z): as ADD_NEW, but also stores z in the data field. The
debug trace for this variant additionally reports the source line of the call
site via __LINE__. */
383: #define ADD_NEW_DATA(x,y,z) \
384: if (new_count++ < wscount) \
385: { \
386: next_new_state->offset = (x); \
387: next_new_state->count = (y); \
388: next_new_state->data = (z); \
389: next_new_state++; \
1.1.1.3 misho 390: DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
391: (x), (y), (z), __LINE__)); \
1.1 misho 392: } \
393: else return PCRE_ERROR_DFA_WSSIZE
394:
395: /* And now, here is the code */
396:
397: static int
398: internal_dfa_exec(
399: dfa_match_data *md,
1.1.1.2 misho 400: const pcre_uchar *this_start_code,
401: const pcre_uchar *current_subject,
1.1 misho 402: int start_offset,
403: int *offsets,
404: int offsetcount,
405: int *workspace,
406: int wscount,
407: int rlevel)
408: {
409: stateblock *active_states, *new_states, *temp_states;
410: stateblock *next_active_state, *next_new_state;
411:
1.1.1.2 misho 412: const pcre_uint8 *ctypes, *lcc, *fcc;
413: const pcre_uchar *ptr;
414: const pcre_uchar *end_code, *first_op;
1.1 misho 415:
416: dfa_recursion_info new_recursive;
417:
418: int active_count, new_count, match_count;
419:
420: /* Some fields in the md block are frequently referenced, so we load them into
421: independent variables in the hope that this will perform better. */
422:
1.1.1.2 misho 423: const pcre_uchar *start_subject = md->start_subject;
424: const pcre_uchar *end_subject = md->end_subject;
425: const pcre_uchar *start_code = md->start_code;
1.1 misho 426:
1.1.1.2 misho 427: #ifdef SUPPORT_UTF
428: BOOL utf = (md->poptions & PCRE_UTF8) != 0;
1.1 misho 429: #else
1.1.1.2 misho 430: BOOL utf = FALSE;
1.1 misho 431: #endif
432:
1.1.1.3 misho 433: BOOL reset_could_continue = FALSE;
434:
1.1 misho 435: rlevel++;
436: offsetcount &= (-2);
437:
438: wscount -= 2;
439: wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
440: (2 * INTS_PER_STATEBLOCK);
441:
442: DPRINTF(("\n%.*s---------------------\n"
443: "%.*sCall to internal_dfa_exec f=%d\n",
444: rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
445:
446: ctypes = md->tables + ctypes_offset;
447: lcc = md->tables + lcc_offset;
448: fcc = md->tables + fcc_offset;
449:
450: match_count = PCRE_ERROR_NOMATCH; /* A negative number */
451:
452: active_states = (stateblock *)(workspace + 2);
453: next_new_state = new_states = active_states + wscount;
454: new_count = 0;
455:
456: first_op = this_start_code + 1 + LINK_SIZE +
457: ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
1.1.1.2 misho 458: *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
459: ? IMM2_SIZE:0);
1.1 misho 460:
461: /* The first thing in any (sub) pattern is a bracket of some sort. Push all
462: the alternative states onto the list, and find out where the end is. This
463: makes it possible to use this function recursively, when we want to stop at a
464: matching internal ket rather than at the end.
465:
466: If the first opcode in the first alternative is OP_REVERSE, we are dealing with
467: a backward assertion. In that case, we have to find out the maximum amount to
468: move back, and set up each alternative appropriately. */
469:
470: if (*first_op == OP_REVERSE)
471: {
472: int max_back = 0;
473: int gone_back;
474:
475: end_code = this_start_code;
476: do
477: {
478: int back = GET(end_code, 2+LINK_SIZE);
479: if (back > max_back) max_back = back;
480: end_code += GET(end_code, 1);
481: }
482: while (*end_code == OP_ALT);
483:
484: /* If we can't go back the amount required for the longest lookbehind
485: pattern, go back as far as we can; some alternatives may still be viable. */
486:
1.1.1.2 misho 487: #ifdef SUPPORT_UTF
1.1 misho 488: /* In character mode we have to step back character by character */
489:
1.1.1.2 misho 490: if (utf)
1.1 misho 491: {
492: for (gone_back = 0; gone_back < max_back; gone_back++)
493: {
494: if (current_subject <= start_subject) break;
495: current_subject--;
1.1.1.2 misho 496: ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
1.1 misho 497: }
498: }
499: else
500: #endif
501:
502: /* In byte-mode we can do this quickly. */
503:
504: {
505: gone_back = (current_subject - max_back < start_subject)?
506: (int)(current_subject - start_subject) : max_back;
507: current_subject -= gone_back;
508: }
509:
510: /* Save the earliest consulted character */
511:
512: if (current_subject < md->start_used_ptr)
513: md->start_used_ptr = current_subject;
514:
515: /* Now we can process the individual branches. */
516:
517: end_code = this_start_code;
518: do
519: {
520: int back = GET(end_code, 2+LINK_SIZE);
521: if (back <= gone_back)
522: {
523: int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
524: ADD_NEW_DATA(-bstate, 0, gone_back - back);
525: }
526: end_code += GET(end_code, 1);
527: }
528: while (*end_code == OP_ALT);
529: }
530:
531: /* This is the code for a "normal" subpattern (not a backward assertion). The
532: start of a whole pattern is always one of these. If we are at the top level,
533: we may be asked to restart matching from the same point that we reached for a
534: previous partial match. We still have to scan through the top-level branches to
535: find the end state. */
536:
537: else
538: {
539: end_code = this_start_code;
540:
541: /* Restarting */
542:
543: if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
544: {
545: do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
546: new_count = workspace[1];
547: if (!workspace[0])
548: memcpy(new_states, active_states, new_count * sizeof(stateblock));
549: }
550:
551: /* Not restarting */
552:
553: else
554: {
555: int length = 1 + LINK_SIZE +
556: ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
1.1.1.2 misho 557: *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
558: ? IMM2_SIZE:0);
1.1 misho 559: do
560: {
561: ADD_NEW((int)(end_code - start_code + length), 0);
562: end_code += GET(end_code, 1);
563: length = 1 + LINK_SIZE;
564: }
565: while (*end_code == OP_ALT);
566: }
567: }
568:
569: workspace[0] = 0; /* Bit indicating which vector is current */
570:
1.1.1.2 misho 571: DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
1.1 misho 572:
573: /* Loop for scanning the subject */
574:
575: ptr = current_subject;
576: for (;;)
577: {
578: int i, j;
579: int clen, dlen;
1.1.1.4 misho 580: pcre_uint32 c, d;
1.1 misho 581: int forced_fail = 0;
1.1.1.3 misho 582: BOOL partial_newline = FALSE;
583: BOOL could_continue = reset_could_continue;
584: reset_could_continue = FALSE;
1.1 misho 585:
586: /* Make the new state list into the active state list and empty the
587: new state list. */
588:
589: temp_states = active_states;
590: active_states = new_states;
591: new_states = temp_states;
592: active_count = new_count;
593: new_count = 0;
594:
595: workspace[0] ^= 1; /* Remember for the restarting feature */
596: workspace[1] = active_count;
597:
598: #ifdef PCRE_DEBUG
599: printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
1.1.1.2 misho 600: pchars(ptr, STRLEN_UC(ptr), stdout);
1.1 misho 601: printf("\"\n");
602:
603: printf("%.*sActive states: ", rlevel*2-2, SP);
604: for (i = 0; i < active_count; i++)
605: printf("%d/%d ", active_states[i].offset, active_states[i].count);
606: printf("\n");
607: #endif
608:
609: /* Set the pointers for adding new states */
610:
611: next_active_state = active_states + active_count;
612: next_new_state = new_states;
613:
614: /* Load the current character from the subject outside the loop, as many
615: different states may want to look at it, and we assume that at least one
616: will. */
617:
618: if (ptr < end_subject)
619: {
1.1.1.3 misho 620: clen = 1; /* Number of data items in the character */
1.1.1.2 misho 621: #ifdef SUPPORT_UTF
1.1.1.4 misho 622: GETCHARLENTEST(c, ptr, clen);
623: #else
1.1 misho 624: c = *ptr;
1.1.1.4 misho 625: #endif /* SUPPORT_UTF */
1.1 misho 626: }
627: else
628: {
629: clen = 0; /* This indicates the end of the subject */
630: c = NOTACHAR; /* This value should never actually be used */
631: }
632:
633: /* Scan up the active states and act on each one. The result of an action
634: may be to add more states to the currently active list (e.g. on hitting a
635: parenthesis) or it may be to put states on the new list, for considering
636: when we move the character pointer on. */
637:
638: for (i = 0; i < active_count; i++)
639: {
640: stateblock *current_state = active_states + i;
641: BOOL caseless = FALSE;
1.1.1.2 misho 642: const pcre_uchar *code;
1.1 misho 643: int state_offset = current_state->offset;
1.1.1.4 misho 644: int codevalue, rrc;
645: int count;
1.1 misho 646:
647: #ifdef PCRE_DEBUG
648: printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
649: if (clen == 0) printf("EOL\n");
650: else if (c > 32 && c < 127) printf("'%c'\n", c);
651: else printf("0x%02x\n", c);
652: #endif
653:
654: /* A negative offset is a special case meaning "hold off going to this
655: (negated) state until the number of characters in the data field have
1.1.1.3 misho 656: been skipped". If the could_continue flag was passed over from a previous
657: state, arrange for it to be passed on. */
1.1 misho 658:
659: if (state_offset < 0)
660: {
661: if (current_state->data > 0)
662: {
663: DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
664: ADD_NEW_DATA(state_offset, current_state->count,
665: current_state->data - 1);
1.1.1.3 misho 666: if (could_continue) reset_could_continue = TRUE;
1.1 misho 667: continue;
668: }
669: else
670: {
671: current_state->offset = state_offset = -state_offset;
672: }
673: }
674:
675: /* Check for a duplicate state with the same count, and skip if found.
676: See the note at the head of this module about the possibility of improving
677: performance here. */
678:
679: for (j = 0; j < i; j++)
680: {
681: if (active_states[j].offset == state_offset &&
682: active_states[j].count == current_state->count)
683: {
684: DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
685: goto NEXT_ACTIVE_STATE;
686: }
687: }
688:
689: /* The state offset is the offset to the opcode */
690:
691: code = start_code + state_offset;
692: codevalue = *code;
693:
694: /* If this opcode inspects a character, but we are at the end of the
695: subject, remember the fact for use when testing for a partial match. */
696:
697: if (clen == 0 && poptable[codevalue] != 0)
698: could_continue = TRUE;
699:
700: /* If this opcode is followed by an inline character, load it. It is
701: tempting to test for the presence of a subject character here, but that
702: is wrong, because sometimes zero repetitions of the subject are
703: permitted.
704:
705: We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
1.1.1.3 misho 706: argument that is not a data character - but is always one byte long because
707: the values are small. We have to take special action to deal with \P, \p,
708: \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
709: these ones to new opcodes. */
1.1 misho 710:
711: if (coptable[codevalue] > 0)
712: {
713: dlen = 1;
1.1.1.2 misho 714: #ifdef SUPPORT_UTF
715: if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
716: #endif /* SUPPORT_UTF */
1.1 misho 717: d = code[coptable[codevalue]];
718: if (codevalue >= OP_TYPESTAR)
719: {
720: switch(d)
721: {
722: case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
723: case OP_NOTPROP:
724: case OP_PROP: codevalue += OP_PROP_EXTRA; break;
725: case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
726: case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
727: case OP_NOT_HSPACE:
728: case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
729: case OP_NOT_VSPACE:
730: case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
731: default: break;
732: }
733: }
734: }
735: else
736: {
737: dlen = 0; /* Not strictly necessary, but compilers moan */
738: d = NOTACHAR; /* if these variables are not set. */
739: }
740:
741:
742: /* Now process the individual opcodes */
743:
744: switch (codevalue)
745: {
746: /* ========================================================================== */
747: /* These cases are never obeyed. This is a fudge that causes a compile-
748: time error if the vectors coptable or poptable, which are indexed by
749: opcode, are not the correct length. It seems to be the only way to do
750: such a check at compile time, as the sizeof() operator does not work
751: in the C preprocessor. */
752:
753: case OP_TABLE_LENGTH:
754: case OP_TABLE_LENGTH +
755: ((sizeof(coptable) == OP_TABLE_LENGTH) &&
756: (sizeof(poptable) == OP_TABLE_LENGTH)):
757: break;
758:
759: /* ========================================================================== */
760: /* Reached a closing bracket. If not at the end of the pattern, carry
761: on with the next opcode. For repeating opcodes, also add the repeat
762: state. Note that KETRPOS will always be encountered at the end of the
763: subpattern, because the possessive subpattern repeats are always handled
764: using recursive calls. Thus, it never adds any new states.
765:
766: At the end of the (sub)pattern, unless we have an empty string and
767: PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
768: start of the subject, save the match data, shifting up all previous
769: matches so we always have the longest first. */
770:
771: case OP_KET:
772: case OP_KETRMIN:
773: case OP_KETRMAX:
774: case OP_KETRPOS:
775: if (code != end_code)
776: {
777: ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
778: if (codevalue != OP_KET)
779: {
780: ADD_ACTIVE(state_offset - GET(code, 1), 0);
781: }
782: }
783: else
784: {
785: if (ptr > current_subject ||
786: ((md->moptions & PCRE_NOTEMPTY) == 0 &&
787: ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
788: current_subject > start_subject + md->start_offset)))
789: {
790: if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
791: else if (match_count > 0 && ++match_count * 2 > offsetcount)
792: match_count = 0;
793: count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
794: if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
795: if (offsetcount >= 2)
796: {
797: offsets[0] = (int)(current_subject - start_subject);
798: offsets[1] = (int)(ptr - start_subject);
799: DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
1.1.1.3 misho 800: offsets[1] - offsets[0], (char *)current_subject));
1.1 misho 801: }
802: if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
803: {
804: DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
805: "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
806: match_count, rlevel*2-2, SP));
807: return match_count;
808: }
809: }
810: }
811: break;
812:
813: /* ========================================================================== */
814: /* These opcodes add to the current list of states without looking
815: at the current character. */
816:
817: /*-----------------------------------------------------------------*/
818: case OP_ALT:
819: do { code += GET(code, 1); } while (*code == OP_ALT);
820: ADD_ACTIVE((int)(code - start_code), 0);
821: break;
822:
823: /*-----------------------------------------------------------------*/
824: case OP_BRA:
825: case OP_SBRA:
826: do
827: {
828: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
829: code += GET(code, 1);
830: }
831: while (*code == OP_ALT);
832: break;
833:
834: /*-----------------------------------------------------------------*/
835: case OP_CBRA:
836: case OP_SCBRA:
1.1.1.2 misho 837: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
1.1 misho 838: code += GET(code, 1);
839: while (*code == OP_ALT)
840: {
841: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
842: code += GET(code, 1);
843: }
844: break;
845:
846: /*-----------------------------------------------------------------*/
847: case OP_BRAZERO:
848: case OP_BRAMINZERO:
849: ADD_ACTIVE(state_offset + 1, 0);
850: code += 1 + GET(code, 2);
851: while (*code == OP_ALT) code += GET(code, 1);
852: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
853: break;
854:
855: /*-----------------------------------------------------------------*/
856: case OP_SKIPZERO:
857: code += 1 + GET(code, 2);
858: while (*code == OP_ALT) code += GET(code, 1);
859: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
860: break;
861:
862: /*-----------------------------------------------------------------*/
863: case OP_CIRC:
864: if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
865: { ADD_ACTIVE(state_offset + 1, 0); }
866: break;
867:
868: /*-----------------------------------------------------------------*/
869: case OP_CIRCM:
870: if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
871: (ptr != end_subject && WAS_NEWLINE(ptr)))
872: { ADD_ACTIVE(state_offset + 1, 0); }
873: break;
874:
875: /*-----------------------------------------------------------------*/
876: case OP_EOD:
877: if (ptr >= end_subject)
878: {
879: if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
880: could_continue = TRUE;
881: else { ADD_ACTIVE(state_offset + 1, 0); }
882: }
883: break;
884:
885: /*-----------------------------------------------------------------*/
886: case OP_SOD:
887: if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
888: break;
889:
890: /*-----------------------------------------------------------------*/
891: case OP_SOM:
892: if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
893: break;
894:
895:
896: /* ========================================================================== */
897: /* These opcodes inspect the next subject character, and sometimes
898: the previous one as well, but do not have an argument. The variable
899: clen contains the length of the current character and is zero if we are
900: at the end of the subject. */
901:
902: /*-----------------------------------------------------------------*/
903: case OP_ANY:
904: if (clen > 0 && !IS_NEWLINE(ptr))
1.1.1.3 misho 905: {
906: if (ptr + 1 >= md->end_subject &&
907: (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
908: NLBLOCK->nltype == NLTYPE_FIXED &&
909: NLBLOCK->nllen == 2 &&
910: c == NLBLOCK->nl[0])
911: {
912: could_continue = partial_newline = TRUE;
913: }
914: else
915: {
916: ADD_NEW(state_offset + 1, 0);
917: }
918: }
1.1 misho 919: break;
920:
921: /*-----------------------------------------------------------------*/
922: case OP_ALLANY:
923: if (clen > 0)
924: { ADD_NEW(state_offset + 1, 0); }
925: break;
926:
927: /*-----------------------------------------------------------------*/
928: case OP_EODN:
929: if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
930: could_continue = TRUE;
931: else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
932: { ADD_ACTIVE(state_offset + 1, 0); }
933: break;
934:
935: /*-----------------------------------------------------------------*/
936: case OP_DOLL:
937: if ((md->moptions & PCRE_NOTEOL) == 0)
938: {
939: if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
940: could_continue = TRUE;
941: else if (clen == 0 ||
942: ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
943: (ptr == end_subject - md->nllen)
944: ))
945: { ADD_ACTIVE(state_offset + 1, 0); }
1.1.1.3 misho 946: else if (ptr + 1 >= md->end_subject &&
947: (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
948: NLBLOCK->nltype == NLTYPE_FIXED &&
949: NLBLOCK->nllen == 2 &&
950: c == NLBLOCK->nl[0])
951: {
952: if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
953: {
954: reset_could_continue = TRUE;
955: ADD_NEW_DATA(-(state_offset + 1), 0, 1);
956: }
957: else could_continue = partial_newline = TRUE;
958: }
1.1 misho 959: }
960: break;
961:
962: /*-----------------------------------------------------------------*/
963: case OP_DOLLM:
964: if ((md->moptions & PCRE_NOTEOL) == 0)
965: {
966: if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
967: could_continue = TRUE;
968: else if (clen == 0 ||
969: ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
970: { ADD_ACTIVE(state_offset + 1, 0); }
1.1.1.3 misho 971: else if (ptr + 1 >= md->end_subject &&
972: (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
973: NLBLOCK->nltype == NLTYPE_FIXED &&
974: NLBLOCK->nllen == 2 &&
975: c == NLBLOCK->nl[0])
976: {
977: if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
978: {
979: reset_could_continue = TRUE;
980: ADD_NEW_DATA(-(state_offset + 1), 0, 1);
981: }
982: else could_continue = partial_newline = TRUE;
983: }
1.1 misho 984: }
985: else if (IS_NEWLINE(ptr))
986: { ADD_ACTIVE(state_offset + 1, 0); }
987: break;
988:
989: /*-----------------------------------------------------------------*/
990:
991: case OP_DIGIT:
992: case OP_WHITESPACE:
993: case OP_WORDCHAR:
994: if (clen > 0 && c < 256 &&
995: ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
996: { ADD_NEW(state_offset + 1, 0); }
997: break;
998:
999: /*-----------------------------------------------------------------*/
1000: case OP_NOT_DIGIT:
1001: case OP_NOT_WHITESPACE:
1002: case OP_NOT_WORDCHAR:
1003: if (clen > 0 && (c >= 256 ||
1004: ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1005: { ADD_NEW(state_offset + 1, 0); }
1006: break;
1007:
1008: /*-----------------------------------------------------------------*/
1009: case OP_WORD_BOUNDARY:
1010: case OP_NOT_WORD_BOUNDARY:
1011: {
1012: int left_word, right_word;
1013:
1014: if (ptr > start_subject)
1015: {
1.1.1.2 misho 1016: const pcre_uchar *temp = ptr - 1;
1.1 misho 1017: if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1.1.1.4 misho 1018: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1.1.1.2 misho 1019: if (utf) { BACKCHAR(temp); }
1.1 misho 1020: #endif
1021: GETCHARTEST(d, temp);
1022: #ifdef SUPPORT_UCP
1023: if ((md->poptions & PCRE_UCP) != 0)
1024: {
1025: if (d == '_') left_word = TRUE; else
1026: {
1027: int cat = UCD_CATEGORY(d);
1028: left_word = (cat == ucp_L || cat == ucp_N);
1029: }
1030: }
1031: else
1032: #endif
1033: left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1034: }
1035: else left_word = FALSE;
1036:
1037: if (clen > 0)
1038: {
1039: #ifdef SUPPORT_UCP
1040: if ((md->poptions & PCRE_UCP) != 0)
1041: {
1042: if (c == '_') right_word = TRUE; else
1043: {
1044: int cat = UCD_CATEGORY(c);
1045: right_word = (cat == ucp_L || cat == ucp_N);
1046: }
1047: }
1048: else
1049: #endif
1050: right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1051: }
1052: else right_word = FALSE;
1053:
1054: if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1055: { ADD_ACTIVE(state_offset + 1, 0); }
1056: }
1057: break;
1058:
1059:
1060: /*-----------------------------------------------------------------*/
1061: /* Check the next character by Unicode property. We will get here only
1062: if the support is in the binary; otherwise a compile-time error occurs.
1063: */
1064:
1065: #ifdef SUPPORT_UCP
1066: case OP_PROP:
1067: case OP_NOTPROP:
1068: if (clen > 0)
1069: {
1070: BOOL OK;
1.1.1.4 misho 1071: const pcre_uint32 *cp;
1.1 misho 1072: const ucd_record * prop = GET_UCD(c);
1073: switch(code[1])
1074: {
1075: case PT_ANY:
1076: OK = TRUE;
1077: break;
1078:
1079: case PT_LAMP:
1080: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1081: prop->chartype == ucp_Lt;
1082: break;
1083:
1084: case PT_GC:
1.1.1.2 misho 1085: OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1.1 misho 1086: break;
1087:
1088: case PT_PC:
1089: OK = prop->chartype == code[2];
1090: break;
1091:
1092: case PT_SC:
1093: OK = prop->script == code[2];
1094: break;
1095:
1096: /* These are specials for combination cases. */
1097:
1098: case PT_ALNUM:
1.1.1.2 misho 1099: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1100: PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1.1 misho 1101: break;
1102:
1.1.1.5 ! misho 1103: /* Perl space used to exclude VT, but from Perl 5.18 it is included,
! 1104: which means that Perl space and POSIX space are now identical. PCRE
! 1105: was changed at release 8.34. */
1.1 misho 1106:
1.1.1.5 ! misho 1107: case PT_SPACE: /* Perl space */
1.1 misho 1108: case PT_PXSPACE: /* POSIX space */
1.1.1.5 ! misho 1109: switch(c)
! 1110: {
! 1111: HSPACE_CASES:
! 1112: VSPACE_CASES:
! 1113: OK = TRUE;
! 1114: break;
! 1115:
! 1116: default:
! 1117: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
! 1118: break;
! 1119: }
1.1 misho 1120: break;
1121:
1122: case PT_WORD:
1.1.1.2 misho 1123: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1124: PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1.1 misho 1125: c == CHAR_UNDERSCORE;
1126: break;
1127:
1.1.1.4 misho 1128: case PT_CLIST:
1129: cp = PRIV(ucd_caseless_sets) + code[2];
1130: for (;;)
1131: {
1132: if (c < *cp) { OK = FALSE; break; }
1133: if (c == *cp++) { OK = TRUE; break; }
1134: }
1135: break;
1136:
1137: case PT_UCNC:
1138: OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1139: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1140: c >= 0xe000;
1141: break;
1142:
1.1 misho 1143: /* Should never occur, but keep compilers from grumbling. */
1144:
1145: default:
1146: OK = codevalue != OP_PROP;
1147: break;
1148: }
1149:
1150: if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1151: }
1152: break;
1153: #endif
1154:
1155:
1156:
1157: /* ========================================================================== */
1158: /* These opcodes likewise inspect the subject character, but have an
1159: argument that is not a data character. It is one of these opcodes:
1160: OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1161: OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1162:
1163: case OP_TYPEPLUS:
1164: case OP_TYPEMINPLUS:
1165: case OP_TYPEPOSPLUS:
1166: count = current_state->count; /* Already matched */
1167: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1168: if (clen > 0)
1169: {
1.1.1.3 misho 1170: if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1171: (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1172: NLBLOCK->nltype == NLTYPE_FIXED &&
1173: NLBLOCK->nllen == 2 &&
1174: c == NLBLOCK->nl[0])
1175: {
1176: could_continue = partial_newline = TRUE;
1177: }
1178: else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1.1 misho 1179: (c < 256 &&
1180: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1181: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1182: {
1183: if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1184: {
1185: active_count--; /* Remove non-match possibility */
1186: next_active_state--;
1187: }
1188: count++;
1189: ADD_NEW(state_offset, count);
1190: }
1191: }
1192: break;
1193:
1194: /*-----------------------------------------------------------------*/
1195: case OP_TYPEQUERY:
1196: case OP_TYPEMINQUERY:
1197: case OP_TYPEPOSQUERY:
1198: ADD_ACTIVE(state_offset + 2, 0);
1199: if (clen > 0)
1200: {
1.1.1.3 misho 1201: if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1202: (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1203: NLBLOCK->nltype == NLTYPE_FIXED &&
1204: NLBLOCK->nllen == 2 &&
1205: c == NLBLOCK->nl[0])
1206: {
1207: could_continue = partial_newline = TRUE;
1208: }
1209: else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1.1 misho 1210: (c < 256 &&
1211: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1212: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1213: {
1214: if (codevalue == OP_TYPEPOSQUERY)
1215: {
1216: active_count--; /* Remove non-match possibility */
1217: next_active_state--;
1218: }
1219: ADD_NEW(state_offset + 2, 0);
1220: }
1221: }
1222: break;
1223:
1224: /*-----------------------------------------------------------------*/
1225: case OP_TYPESTAR:
1226: case OP_TYPEMINSTAR:
1227: case OP_TYPEPOSSTAR:
1228: ADD_ACTIVE(state_offset + 2, 0);
1229: if (clen > 0)
1230: {
1.1.1.3 misho 1231: if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1232: (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1233: NLBLOCK->nltype == NLTYPE_FIXED &&
1234: NLBLOCK->nllen == 2 &&
1235: c == NLBLOCK->nl[0])
1236: {
1237: could_continue = partial_newline = TRUE;
1238: }
1239: else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1.1 misho 1240: (c < 256 &&
1241: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1242: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1243: {
1244: if (codevalue == OP_TYPEPOSSTAR)
1245: {
1246: active_count--; /* Remove non-match possibility */
1247: next_active_state--;
1248: }
1249: ADD_NEW(state_offset, 0);
1250: }
1251: }
1252: break;
1253:
1254: /*-----------------------------------------------------------------*/
1255: case OP_TYPEEXACT:
1256: count = current_state->count; /* Number already matched */
1257: if (clen > 0)
1258: {
1.1.1.3 misho 1259: if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1260: (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1261: NLBLOCK->nltype == NLTYPE_FIXED &&
1262: NLBLOCK->nllen == 2 &&
1263: c == NLBLOCK->nl[0])
1264: {
1265: could_continue = partial_newline = TRUE;
1266: }
1267: else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1.1 misho 1268: (c < 256 &&
1269: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1270: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1271: {
1.1.1.4 misho 1272: if (++count >= (int)GET2(code, 1))
1.1.1.2 misho 1273: { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1.1 misho 1274: else
1275: { ADD_NEW(state_offset, count); }
1276: }
1277: }
1278: break;
1279:
1280: /*-----------------------------------------------------------------*/
1281: case OP_TYPEUPTO:
1282: case OP_TYPEMINUPTO:
1283: case OP_TYPEPOSUPTO:
1.1.1.2 misho 1284: ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1.1 misho 1285: count = current_state->count; /* Number already matched */
1286: if (clen > 0)
1287: {
1.1.1.3 misho 1288: if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1289: (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1290: NLBLOCK->nltype == NLTYPE_FIXED &&
1291: NLBLOCK->nllen == 2 &&
1292: c == NLBLOCK->nl[0])
1293: {
1294: could_continue = partial_newline = TRUE;
1295: }
1296: else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1.1 misho 1297: (c < 256 &&
1298: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1299: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1300: {
1301: if (codevalue == OP_TYPEPOSUPTO)
1302: {
1303: active_count--; /* Remove non-match possibility */
1304: next_active_state--;
1305: }
1.1.1.4 misho 1306: if (++count >= (int)GET2(code, 1))
1.1.1.2 misho 1307: { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1.1 misho 1308: else
1309: { ADD_NEW(state_offset, count); }
1310: }
1311: }
1312: break;
1313:
1314: /* ========================================================================== */
1315: /* These are virtual opcodes that are used when something like
1316: OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1317: argument. It keeps the code above fast for the other cases. The argument
1318: is in the d variable. */
1319:
1320: #ifdef SUPPORT_UCP
1321: case OP_PROP_EXTRA + OP_TYPEPLUS:
1322: case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1323: case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1324: count = current_state->count; /* Already matched */
1325: if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1326: if (clen > 0)
1327: {
1328: BOOL OK;
1.1.1.4 misho 1329: const pcre_uint32 *cp;
1.1 misho 1330: const ucd_record * prop = GET_UCD(c);
1331: switch(code[2])
1332: {
1333: case PT_ANY:
1334: OK = TRUE;
1335: break;
1336:
1337: case PT_LAMP:
1338: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1339: prop->chartype == ucp_Lt;
1340: break;
1341:
1342: case PT_GC:
1.1.1.2 misho 1343: OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1.1 misho 1344: break;
1345:
1346: case PT_PC:
1347: OK = prop->chartype == code[3];
1348: break;
1349:
1350: case PT_SC:
1351: OK = prop->script == code[3];
1352: break;
1353:
1354: /* These are specials for combination cases. */
1355:
1356: case PT_ALNUM:
1.1.1.2 misho 1357: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1358: PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1.1 misho 1359: break;
1360:
1.1.1.5 ! misho 1361: /* Perl space used to exclude VT, but from Perl 5.18 it is included,
! 1362: which means that Perl space and POSIX space are now identical. PCRE
! 1363: was changed at release 8.34. */
1.1 misho 1364:
1.1.1.5 ! misho 1365: case PT_SPACE: /* Perl space */
1.1 misho 1366: case PT_PXSPACE: /* POSIX space */
1.1.1.5 ! misho 1367: switch(c)
! 1368: {
! 1369: HSPACE_CASES:
! 1370: VSPACE_CASES:
! 1371: OK = TRUE;
! 1372: break;
! 1373:
! 1374: default:
! 1375: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
! 1376: break;
! 1377: }
1.1 misho 1378: break;
1379:
1380: case PT_WORD:
1.1.1.2 misho 1381: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1382: PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1.1 misho 1383: c == CHAR_UNDERSCORE;
1384: break;
1385:
1.1.1.4 misho 1386: case PT_CLIST:
1387: cp = PRIV(ucd_caseless_sets) + code[3];
1388: for (;;)
1389: {
1390: if (c < *cp) { OK = FALSE; break; }
1391: if (c == *cp++) { OK = TRUE; break; }
1392: }
1393: break;
1394:
1395: case PT_UCNC:
1396: OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1397: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1398: c >= 0xe000;
1399: break;
1400:
1.1 misho 1401: /* Should never occur, but keep compilers from grumbling. */
1402:
1403: default:
1404: OK = codevalue != OP_PROP;
1405: break;
1406: }
1407:
1408: if (OK == (d == OP_PROP))
1409: {
1410: if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1411: {
1412: active_count--; /* Remove non-match possibility */
1413: next_active_state--;
1414: }
1415: count++;
1416: ADD_NEW(state_offset, count);
1417: }
1418: }
1419: break;
1420:
1421: /*-----------------------------------------------------------------*/
1422: case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1423: case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1424: case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1425: count = current_state->count; /* Already matched */
1426: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1.1.1.4 misho 1427: if (clen > 0)
1.1 misho 1428: {
1.1.1.4 misho 1429: int lgb, rgb;
1.1.1.2 misho 1430: const pcre_uchar *nptr = ptr + clen;
1.1 misho 1431: int ncount = 0;
1432: if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1433: {
1434: active_count--; /* Remove non-match possibility */
1435: next_active_state--;
1436: }
1.1.1.4 misho 1437: lgb = UCD_GRAPHBREAK(c);
1.1 misho 1438: while (nptr < end_subject)
1439: {
1.1.1.4 misho 1440: dlen = 1;
1441: if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1442: rgb = UCD_GRAPHBREAK(d);
1443: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1.1 misho 1444: ncount++;
1.1.1.4 misho 1445: lgb = rgb;
1446: nptr += dlen;
1.1 misho 1447: }
1448: count++;
1449: ADD_NEW_DATA(-state_offset, count, ncount);
1450: }
1451: break;
1452: #endif
1453:
1454: /*-----------------------------------------------------------------*/
1455: case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1456: case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1457: case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1458: count = current_state->count; /* Already matched */
1459: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1460: if (clen > 0)
1461: {
1462: int ncount = 0;
1463: switch (c)
1464: {
1.1.1.4 misho 1465: case CHAR_VT:
1466: case CHAR_FF:
1467: case CHAR_NEL:
1468: #ifndef EBCDIC
1.1 misho 1469: case 0x2028:
1470: case 0x2029:
1.1.1.4 misho 1471: #endif /* Not EBCDIC */
1.1 misho 1472: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1473: goto ANYNL01;
1474:
1.1.1.4 misho 1475: case CHAR_CR:
1476: if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1.1 misho 1477: /* Fall through */
1478:
1479: ANYNL01:
1.1.1.4 misho 1480: case CHAR_LF:
1.1 misho 1481: if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1482: {
1483: active_count--; /* Remove non-match possibility */
1484: next_active_state--;
1485: }
1486: count++;
1487: ADD_NEW_DATA(-state_offset, count, ncount);
1488: break;
1489:
1490: default:
1491: break;
1492: }
1493: }
1494: break;
1495:
1496: /*-----------------------------------------------------------------*/
1497: case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1498: case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1499: case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1500: count = current_state->count; /* Already matched */
1501: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1502: if (clen > 0)
1503: {
1504: BOOL OK;
1505: switch (c)
1506: {
1.1.1.4 misho 1507: VSPACE_CASES:
1.1 misho 1508: OK = TRUE;
1509: break;
1510:
1511: default:
1512: OK = FALSE;
1513: break;
1514: }
1515:
1516: if (OK == (d == OP_VSPACE))
1517: {
1518: if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1519: {
1520: active_count--; /* Remove non-match possibility */
1521: next_active_state--;
1522: }
1523: count++;
1524: ADD_NEW_DATA(-state_offset, count, 0);
1525: }
1526: }
1527: break;
1528:
1529: /*-----------------------------------------------------------------*/
1530: case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1531: case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1532: case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1533: count = current_state->count; /* Already matched */
1534: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1535: if (clen > 0)
1536: {
1537: BOOL OK;
1538: switch (c)
1539: {
1.1.1.4 misho 1540: HSPACE_CASES:
1.1 misho 1541: OK = TRUE;
1542: break;
1543:
1544: default:
1545: OK = FALSE;
1546: break;
1547: }
1548:
1549: if (OK == (d == OP_HSPACE))
1550: {
1551: if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1552: {
1553: active_count--; /* Remove non-match possibility */
1554: next_active_state--;
1555: }
1556: count++;
1557: ADD_NEW_DATA(-state_offset, count, 0);
1558: }
1559: }
1560: break;
1561:
1562: /*-----------------------------------------------------------------*/
1563: #ifdef SUPPORT_UCP
1564: case OP_PROP_EXTRA + OP_TYPEQUERY:
1565: case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1566: case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1567: count = 4;
1568: goto QS1;
1569:
1570: case OP_PROP_EXTRA + OP_TYPESTAR:
1571: case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1572: case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1573: count = 0;
1574:
1575: QS1:
1576:
1577: ADD_ACTIVE(state_offset + 4, 0);
1578: if (clen > 0)
1579: {
1580: BOOL OK;
1.1.1.4 misho 1581: const pcre_uint32 *cp;
1.1 misho 1582: const ucd_record * prop = GET_UCD(c);
1583: switch(code[2])
1584: {
1585: case PT_ANY:
1586: OK = TRUE;
1587: break;
1588:
1589: case PT_LAMP:
1590: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1591: prop->chartype == ucp_Lt;
1592: break;
1593:
1594: case PT_GC:
1.1.1.2 misho 1595: OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1.1 misho 1596: break;
1597:
1598: case PT_PC:
1599: OK = prop->chartype == code[3];
1600: break;
1601:
1602: case PT_SC:
1603: OK = prop->script == code[3];
1604: break;
1605:
1606: /* These are specials for combination cases. */
1607:
1608: case PT_ALNUM:
1.1.1.2 misho 1609: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1610: PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1.1 misho 1611: break;
1612:
1.1.1.5 ! misho 1613: /* Perl space used to exclude VT, but from Perl 5.18 it is included,
! 1614: which means that Perl space and POSIX space are now identical. PCRE
! 1615: was changed at release 8.34. */
1.1 misho 1616:
1.1.1.5 ! misho 1617: case PT_SPACE: /* Perl space */
1.1 misho 1618: case PT_PXSPACE: /* POSIX space */
1.1.1.5 ! misho 1619: switch(c)
! 1620: {
! 1621: HSPACE_CASES:
! 1622: VSPACE_CASES:
! 1623: OK = TRUE;
! 1624: break;
! 1625:
! 1626: default:
! 1627: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
! 1628: break;
! 1629: }
1.1 misho 1630: break;
1631:
1632: case PT_WORD:
1.1.1.2 misho 1633: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1634: PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1.1 misho 1635: c == CHAR_UNDERSCORE;
1636: break;
1637:
1.1.1.4 misho 1638: case PT_CLIST:
1639: cp = PRIV(ucd_caseless_sets) + code[3];
1640: for (;;)
1641: {
1642: if (c < *cp) { OK = FALSE; break; }
1643: if (c == *cp++) { OK = TRUE; break; }
1644: }
1645: break;
1646:
1647: case PT_UCNC:
1648: OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1649: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1650: c >= 0xe000;
1651: break;
1652:
1.1 misho 1653: /* Should never occur, but keep compilers from grumbling. */
1654:
1655: default:
1656: OK = codevalue != OP_PROP;
1657: break;
1658: }
1659:
1660: if (OK == (d == OP_PROP))
1661: {
1662: if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1663: codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1664: {
1665: active_count--; /* Remove non-match possibility */
1666: next_active_state--;
1667: }
1668: ADD_NEW(state_offset + count, 0);
1669: }
1670: }
1671: break;
1672:
1673: /*-----------------------------------------------------------------*/
1674: case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1675: case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1676: case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1677: count = 2;
1678: goto QS2;
1679:
1680: case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1681: case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1682: case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1683: count = 0;
1684:
1685: QS2:
1686:
1687: ADD_ACTIVE(state_offset + 2, 0);
1.1.1.4 misho 1688: if (clen > 0)
1.1 misho 1689: {
1.1.1.4 misho 1690: int lgb, rgb;
1.1.1.2 misho 1691: const pcre_uchar *nptr = ptr + clen;
1.1 misho 1692: int ncount = 0;
1693: if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1694: codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1695: {
1696: active_count--; /* Remove non-match possibility */
1697: next_active_state--;
1698: }
1.1.1.4 misho 1699: lgb = UCD_GRAPHBREAK(c);
1.1 misho 1700: while (nptr < end_subject)
1701: {
1.1.1.4 misho 1702: dlen = 1;
1703: if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1704: rgb = UCD_GRAPHBREAK(d);
1705: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1.1 misho 1706: ncount++;
1.1.1.4 misho 1707: lgb = rgb;
1708: nptr += dlen;
1.1 misho 1709: }
1710: ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1711: }
1712: break;
1713: #endif
1714:
1715: /*-----------------------------------------------------------------*/
1716: case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1717: case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1718: case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1719: count = 2;
1720: goto QS3;
1721:
1722: case OP_ANYNL_EXTRA + OP_TYPESTAR:
1723: case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1724: case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1725: count = 0;
1726:
1727: QS3:
1728: ADD_ACTIVE(state_offset + 2, 0);
1729: if (clen > 0)
1730: {
1731: int ncount = 0;
1732: switch (c)
1733: {
1.1.1.4 misho 1734: case CHAR_VT:
1735: case CHAR_FF:
1736: case CHAR_NEL:
1737: #ifndef EBCDIC
1.1 misho 1738: case 0x2028:
1739: case 0x2029:
1.1.1.4 misho 1740: #endif /* Not EBCDIC */
1.1 misho 1741: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1742: goto ANYNL02;
1743:
1.1.1.4 misho 1744: case CHAR_CR:
1745: if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1.1 misho 1746: /* Fall through */
1747:
1748: ANYNL02:
1.1.1.4 misho 1749: case CHAR_LF:
1.1 misho 1750: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1751: codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1752: {
1753: active_count--; /* Remove non-match possibility */
1754: next_active_state--;
1755: }
1.1.1.4 misho 1756: ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1.1 misho 1757: break;
1758:
1759: default:
1760: break;
1761: }
1762: }
1763: break;
1764:
1765: /*-----------------------------------------------------------------*/
1766: case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1767: case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1768: case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1769: count = 2;
1770: goto QS4;
1771:
1772: case OP_VSPACE_EXTRA + OP_TYPESTAR:
1773: case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1774: case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1775: count = 0;
1776:
1777: QS4:
1778: ADD_ACTIVE(state_offset + 2, 0);
1779: if (clen > 0)
1780: {
1781: BOOL OK;
1782: switch (c)
1783: {
1.1.1.4 misho 1784: VSPACE_CASES:
1.1 misho 1785: OK = TRUE;
1786: break;
1787:
1788: default:
1789: OK = FALSE;
1790: break;
1791: }
1792: if (OK == (d == OP_VSPACE))
1793: {
1794: if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1795: codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1796: {
1797: active_count--; /* Remove non-match possibility */
1798: next_active_state--;
1799: }
1.1.1.4 misho 1800: ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1.1 misho 1801: }
1802: }
1803: break;
1804:
1805: /*-----------------------------------------------------------------*/
1806: case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1807: case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1808: case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1809: count = 2;
1810: goto QS5;
1811:
1812: case OP_HSPACE_EXTRA + OP_TYPESTAR:
1813: case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1814: case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1815: count = 0;
1816:
1817: QS5:
1818: ADD_ACTIVE(state_offset + 2, 0);
1819: if (clen > 0)
1820: {
1821: BOOL OK;
1822: switch (c)
1823: {
1.1.1.4 misho 1824: HSPACE_CASES:
1.1 misho 1825: OK = TRUE;
1826: break;
1827:
1828: default:
1829: OK = FALSE;
1830: break;
1831: }
1832:
1833: if (OK == (d == OP_HSPACE))
1834: {
1835: if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1836: codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1837: {
1838: active_count--; /* Remove non-match possibility */
1839: next_active_state--;
1840: }
1.1.1.4 misho 1841: ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1.1 misho 1842: }
1843: }
1844: break;
1845:
1846: /*-----------------------------------------------------------------*/
1847: #ifdef SUPPORT_UCP
1848: case OP_PROP_EXTRA + OP_TYPEEXACT:
1849: case OP_PROP_EXTRA + OP_TYPEUPTO:
1850: case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1851: case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1852: if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1.1.1.2 misho 1853: { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1.1 misho 1854: count = current_state->count; /* Number already matched */
1855: if (clen > 0)
1856: {
1857: BOOL OK;
1.1.1.4 misho 1858: const pcre_uint32 *cp;
1.1 misho 1859: const ucd_record * prop = GET_UCD(c);
1.1.1.2 misho 1860: switch(code[1 + IMM2_SIZE + 1])
1.1 misho 1861: {
1862: case PT_ANY:
1863: OK = TRUE;
1864: break;
1865:
1866: case PT_LAMP:
1867: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1868: prop->chartype == ucp_Lt;
1869: break;
1870:
1871: case PT_GC:
1.1.1.2 misho 1872: OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1.1 misho 1873: break;
1874:
1875: case PT_PC:
1.1.1.2 misho 1876: OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1.1 misho 1877: break;
1878:
1879: case PT_SC:
1.1.1.2 misho 1880: OK = prop->script == code[1 + IMM2_SIZE + 2];
1.1 misho 1881: break;
1882:
1883: /* These are specials for combination cases. */
1884:
1885: case PT_ALNUM:
1.1.1.2 misho 1886: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1887: PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1.1 misho 1888: break;
1889:
1.1.1.5 ! misho 1890: /* Perl space used to exclude VT, but from Perl 5.18 it is included,
! 1891: which means that Perl space and POSIX space are now identical. PCRE
! 1892: was changed at release 8.34. */
1.1 misho 1893:
1.1.1.5 ! misho 1894: case PT_SPACE: /* Perl space */
1.1 misho 1895: case PT_PXSPACE: /* POSIX space */
1.1.1.5 ! misho 1896: switch(c)
! 1897: {
! 1898: HSPACE_CASES:
! 1899: VSPACE_CASES:
! 1900: OK = TRUE;
! 1901: break;
! 1902:
! 1903: default:
! 1904: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
! 1905: break;
! 1906: }
1.1 misho 1907: break;
1908:
1909: case PT_WORD:
1.1.1.2 misho 1910: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1911: PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1.1 misho 1912: c == CHAR_UNDERSCORE;
1913: break;
1914:
1.1.1.4 misho 1915: case PT_CLIST:
1916: cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1917: for (;;)
1918: {
1919: if (c < *cp) { OK = FALSE; break; }
1920: if (c == *cp++) { OK = TRUE; break; }
1921: }
1922: break;
1923:
1924: case PT_UCNC:
1925: OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1926: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1927: c >= 0xe000;
1928: break;
1929:
1.1 misho 1930: /* Should never occur, but keep compilers from grumbling. */
1931:
1932: default:
1933: OK = codevalue != OP_PROP;
1934: break;
1935: }
1936:
1937: if (OK == (d == OP_PROP))
1938: {
1939: if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1940: {
1941: active_count--; /* Remove non-match possibility */
1942: next_active_state--;
1943: }
1.1.1.4 misho 1944: if (++count >= (int)GET2(code, 1))
1.1.1.2 misho 1945: { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1.1 misho 1946: else
1947: { ADD_NEW(state_offset, count); }
1948: }
1949: }
1950: break;
1951:
1952: /*-----------------------------------------------------------------*/
1953: case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1954: case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1955: case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1956: case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1957: if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1.1.1.2 misho 1958: { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1.1 misho 1959: count = current_state->count; /* Number already matched */
1.1.1.4 misho 1960: if (clen > 0)
1.1 misho 1961: {
1.1.1.4 misho 1962: int lgb, rgb;
1.1.1.2 misho 1963: const pcre_uchar *nptr = ptr + clen;
1.1 misho 1964: int ncount = 0;
1965: if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1966: {
1967: active_count--; /* Remove non-match possibility */
1968: next_active_state--;
1969: }
1.1.1.4 misho 1970: lgb = UCD_GRAPHBREAK(c);
1.1 misho 1971: while (nptr < end_subject)
1972: {
1.1.1.4 misho 1973: dlen = 1;
1974: if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1975: rgb = UCD_GRAPHBREAK(d);
1976: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1.1 misho 1977: ncount++;
1.1.1.4 misho 1978: lgb = rgb;
1979: nptr += dlen;
1.1 misho 1980: }
1.1.1.3 misho 1981: if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1982: reset_could_continue = TRUE;
1.1.1.4 misho 1983: if (++count >= (int)GET2(code, 1))
1.1.1.2 misho 1984: { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1.1 misho 1985: else
1986: { ADD_NEW_DATA(-state_offset, count, ncount); }
1987: }
1988: break;
1989: #endif
1990:
1991: /*-----------------------------------------------------------------*/
1992: case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1993: case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1994: case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1995: case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1996: if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1.1.1.2 misho 1997: { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1.1 misho 1998: count = current_state->count; /* Number already matched */
1999: if (clen > 0)
2000: {
2001: int ncount = 0;
2002: switch (c)
2003: {
1.1.1.4 misho 2004: case CHAR_VT:
2005: case CHAR_FF:
2006: case CHAR_NEL:
2007: #ifndef EBCDIC
1.1 misho 2008: case 0x2028:
2009: case 0x2029:
1.1.1.4 misho 2010: #endif /* Not EBCDIC */
1.1 misho 2011: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2012: goto ANYNL03;
2013:
1.1.1.4 misho 2014: case CHAR_CR:
2015: if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1.1 misho 2016: /* Fall through */
2017:
2018: ANYNL03:
1.1.1.4 misho 2019: case CHAR_LF:
1.1 misho 2020: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2021: {
2022: active_count--; /* Remove non-match possibility */
2023: next_active_state--;
2024: }
1.1.1.4 misho 2025: if (++count >= (int)GET2(code, 1))
1.1.1.2 misho 2026: { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1.1 misho 2027: else
2028: { ADD_NEW_DATA(-state_offset, count, ncount); }
2029: break;
2030:
2031: default:
2032: break;
2033: }
2034: }
2035: break;
2036:
2037: /*-----------------------------------------------------------------*/
2038: case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2039: case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2040: case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2041: case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2042: if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1.1.1.2 misho 2043: { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1.1 misho 2044: count = current_state->count; /* Number already matched */
2045: if (clen > 0)
2046: {
2047: BOOL OK;
2048: switch (c)
2049: {
1.1.1.4 misho 2050: VSPACE_CASES:
1.1 misho 2051: OK = TRUE;
2052: break;
2053:
2054: default:
2055: OK = FALSE;
2056: }
2057:
2058: if (OK == (d == OP_VSPACE))
2059: {
2060: if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2061: {
2062: active_count--; /* Remove non-match possibility */
2063: next_active_state--;
2064: }
1.1.1.4 misho 2065: if (++count >= (int)GET2(code, 1))
1.1.1.2 misho 2066: { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1.1 misho 2067: else
2068: { ADD_NEW_DATA(-state_offset, count, 0); }
2069: }
2070: }
2071: break;
2072:
2073: /*-----------------------------------------------------------------*/
2074: case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2075: case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2076: case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2077: case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2078: if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1.1.1.2 misho 2079: { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1.1 misho 2080: count = current_state->count; /* Number already matched */
2081: if (clen > 0)
2082: {
2083: BOOL OK;
2084: switch (c)
2085: {
1.1.1.4 misho 2086: HSPACE_CASES:
1.1 misho 2087: OK = TRUE;
2088: break;
2089:
2090: default:
2091: OK = FALSE;
2092: break;
2093: }
2094:
2095: if (OK == (d == OP_HSPACE))
2096: {
2097: if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2098: {
2099: active_count--; /* Remove non-match possibility */
2100: next_active_state--;
2101: }
1.1.1.4 misho 2102: if (++count >= (int)GET2(code, 1))
1.1.1.2 misho 2103: { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1.1 misho 2104: else
2105: { ADD_NEW_DATA(-state_offset, count, 0); }
2106: }
2107: }
2108: break;
2109:
2110: /* ========================================================================== */
2111: /* These opcodes are followed by a character that is usually compared
2112: to the current subject character; it is loaded into d. We still get
2113: here even if there is no subject character, because in some cases zero
2114: repetitions are permitted. */
2115:
2116: /*-----------------------------------------------------------------*/
2117: case OP_CHAR:
2118: if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2119: break;
2120:
2121: /*-----------------------------------------------------------------*/
2122: case OP_CHARI:
2123: if (clen == 0) break;
2124:
1.1.1.2 misho 2125: #ifdef SUPPORT_UTF
2126: if (utf)
1.1 misho 2127: {
2128: if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2129: {
2130: unsigned int othercase;
1.1.1.2 misho 2131: if (c < 128)
2132: othercase = fcc[c];
2133: else
2134: /* If we have Unicode property support, we can use it to test the
2135: other case of the character. */
1.1 misho 2136: #ifdef SUPPORT_UCP
1.1.1.2 misho 2137: othercase = UCD_OTHERCASE(c);
1.1 misho 2138: #else
1.1.1.2 misho 2139: othercase = NOTACHAR;
1.1 misho 2140: #endif
2141:
2142: if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2143: }
2144: }
2145: else
1.1.1.2 misho 2146: #endif /* SUPPORT_UTF */
2147: /* Not UTF mode */
1.1 misho 2148: {
1.1.1.2 misho 2149: if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2150: { ADD_NEW(state_offset + 2, 0); }
1.1 misho 2151: }
2152: break;
2153:
2154:
2155: #ifdef SUPPORT_UCP
2156: /*-----------------------------------------------------------------*/
2157: /* This is a tricky one because it can match more than one character.
2158: Find out how many characters to skip, and then set up a negative state
2159: to wait for them to pass before continuing. */
2160:
2161: case OP_EXTUNI:
1.1.1.4 misho 2162: if (clen > 0)
1.1 misho 2163: {
1.1.1.4 misho 2164: int lgb, rgb;
1.1.1.2 misho 2165: const pcre_uchar *nptr = ptr + clen;
1.1 misho 2166: int ncount = 0;
1.1.1.4 misho 2167: lgb = UCD_GRAPHBREAK(c);
1.1 misho 2168: while (nptr < end_subject)
2169: {
1.1.1.4 misho 2170: dlen = 1;
2171: if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2172: rgb = UCD_GRAPHBREAK(d);
2173: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1.1 misho 2174: ncount++;
1.1.1.4 misho 2175: lgb = rgb;
2176: nptr += dlen;
1.1 misho 2177: }
1.1.1.3 misho 2178: if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2179: reset_could_continue = TRUE;
1.1 misho 2180: ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2181: }
2182: break;
2183: #endif
2184:
2185: /*-----------------------------------------------------------------*/
2186: /* This is tricky, like EXTUNI, because it too can match more than one
2187: character (when CR is followed by LF). In this case, set up a negative
2188: state to wait for one character to pass before continuing. */
2189:
2190: case OP_ANYNL:
2191: if (clen > 0) switch(c)
2192: {
1.1.1.4 misho 2193: case CHAR_VT:
2194: case CHAR_FF:
2195: case CHAR_NEL:
2196: #ifndef EBCDIC
1.1 misho 2197: case 0x2028:
2198: case 0x2029:
1.1.1.4 misho 2199: #endif /* Not EBCDIC */
1.1 misho 2200: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2201:
1.1.1.4 misho 2202: case CHAR_LF:
1.1 misho 2203: ADD_NEW(state_offset + 1, 0);
2204: break;
2205:
1.1.1.4 misho 2206: case CHAR_CR:
1.1.1.3 misho 2207: if (ptr + 1 >= end_subject)
2208: {
2209: ADD_NEW(state_offset + 1, 0);
2210: if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2211: reset_could_continue = TRUE;
2212: }
1.1.1.4 misho 2213: else if (RAWUCHARTEST(ptr + 1) == CHAR_LF)
1.1 misho 2214: {
2215: ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2216: }
2217: else
2218: {
2219: ADD_NEW(state_offset + 1, 0);
2220: }
2221: break;
2222: }
2223: break;
2224:
2225: /*-----------------------------------------------------------------*/
2226: case OP_NOT_VSPACE:
2227: if (clen > 0) switch(c)
2228: {
1.1.1.4 misho 2229: VSPACE_CASES:
1.1 misho 2230: break;
2231:
2232: default:
2233: ADD_NEW(state_offset + 1, 0);
2234: break;
2235: }
2236: break;
2237:
2238: /*-----------------------------------------------------------------*/
2239: case OP_VSPACE:
2240: if (clen > 0) switch(c)
2241: {
1.1.1.4 misho 2242: VSPACE_CASES:
1.1 misho 2243: ADD_NEW(state_offset + 1, 0);
2244: break;
2245:
1.1.1.4 misho 2246: default:
2247: break;
1.1 misho 2248: }
2249: break;
2250:
2251: /*-----------------------------------------------------------------*/
2252: case OP_NOT_HSPACE:
2253: if (clen > 0) switch(c)
2254: {
1.1.1.4 misho 2255: HSPACE_CASES:
1.1 misho 2256: break;
2257:
2258: default:
2259: ADD_NEW(state_offset + 1, 0);
2260: break;
2261: }
2262: break;
2263:
2264: /*-----------------------------------------------------------------*/
2265: case OP_HSPACE:
2266: if (clen > 0) switch(c)
2267: {
1.1.1.4 misho 2268: HSPACE_CASES:
1.1 misho 2269: ADD_NEW(state_offset + 1, 0);
2270: break;
1.1.1.4 misho 2271:
2272: default:
2273: break;
1.1 misho 2274: }
2275: break;
2276:
2277: /*-----------------------------------------------------------------*/
1.1.1.3 misho 2278: /* Match a negated single character casefully. */
1.1 misho 2279:
2280: case OP_NOT:
2281: if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2282: break;
2283:
2284: /*-----------------------------------------------------------------*/
1.1.1.3 misho 2285: /* Match a negated single character caselessly. */
1.1 misho 2286:
2287: case OP_NOTI:
1.1.1.3 misho 2288: if (clen > 0)
2289: {
2290: unsigned int otherd;
2291: #ifdef SUPPORT_UTF
2292: if (utf && d >= 128)
2293: {
2294: #ifdef SUPPORT_UCP
2295: otherd = UCD_OTHERCASE(d);
2296: #endif /* SUPPORT_UCP */
2297: }
2298: else
2299: #endif /* SUPPORT_UTF */
2300: otherd = TABLE_GET(d, fcc, d);
2301: if (c != d && c != otherd)
2302: { ADD_NEW(state_offset + dlen + 1, 0); }
2303: }
1.1 misho 2304: break;
2305:
2306: /*-----------------------------------------------------------------*/
2307: case OP_PLUSI:
2308: case OP_MINPLUSI:
2309: case OP_POSPLUSI:
2310: case OP_NOTPLUSI:
2311: case OP_NOTMINPLUSI:
2312: case OP_NOTPOSPLUSI:
2313: caseless = TRUE;
2314: codevalue -= OP_STARI - OP_STAR;
2315:
2316: /* Fall through */
2317: case OP_PLUS:
2318: case OP_MINPLUS:
2319: case OP_POSPLUS:
2320: case OP_NOTPLUS:
2321: case OP_NOTMINPLUS:
2322: case OP_NOTPOSPLUS:
2323: count = current_state->count; /* Already matched */
2324: if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2325: if (clen > 0)
2326: {
1.1.1.4 misho 2327: pcre_uint32 otherd = NOTACHAR;
1.1 misho 2328: if (caseless)
2329: {
1.1.1.2 misho 2330: #ifdef SUPPORT_UTF
2331: if (utf && d >= 128)
1.1 misho 2332: {
2333: #ifdef SUPPORT_UCP
2334: otherd = UCD_OTHERCASE(d);
2335: #endif /* SUPPORT_UCP */
2336: }
2337: else
1.1.1.2 misho 2338: #endif /* SUPPORT_UTF */
2339: otherd = TABLE_GET(d, fcc, d);
1.1 misho 2340: }
2341: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2342: {
2343: if (count > 0 &&
2344: (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2345: {
2346: active_count--; /* Remove non-match possibility */
2347: next_active_state--;
2348: }
2349: count++;
2350: ADD_NEW(state_offset, count);
2351: }
2352: }
2353: break;
2354:
2355: /*-----------------------------------------------------------------*/
2356: case OP_QUERYI:
2357: case OP_MINQUERYI:
2358: case OP_POSQUERYI:
2359: case OP_NOTQUERYI:
2360: case OP_NOTMINQUERYI:
2361: case OP_NOTPOSQUERYI:
2362: caseless = TRUE;
2363: codevalue -= OP_STARI - OP_STAR;
2364: /* Fall through */
2365: case OP_QUERY:
2366: case OP_MINQUERY:
2367: case OP_POSQUERY:
2368: case OP_NOTQUERY:
2369: case OP_NOTMINQUERY:
2370: case OP_NOTPOSQUERY:
2371: ADD_ACTIVE(state_offset + dlen + 1, 0);
2372: if (clen > 0)
2373: {
1.1.1.4 misho 2374: pcre_uint32 otherd = NOTACHAR;
1.1 misho 2375: if (caseless)
2376: {
1.1.1.2 misho 2377: #ifdef SUPPORT_UTF
2378: if (utf && d >= 128)
1.1 misho 2379: {
2380: #ifdef SUPPORT_UCP
2381: otherd = UCD_OTHERCASE(d);
2382: #endif /* SUPPORT_UCP */
2383: }
2384: else
1.1.1.2 misho 2385: #endif /* SUPPORT_UTF */
2386: otherd = TABLE_GET(d, fcc, d);
1.1 misho 2387: }
2388: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2389: {
2390: if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2391: {
2392: active_count--; /* Remove non-match possibility */
2393: next_active_state--;
2394: }
2395: ADD_NEW(state_offset + dlen + 1, 0);
2396: }
2397: }
2398: break;
2399:
2400: /*-----------------------------------------------------------------*/
2401: case OP_STARI:
2402: case OP_MINSTARI:
2403: case OP_POSSTARI:
2404: case OP_NOTSTARI:
2405: case OP_NOTMINSTARI:
2406: case OP_NOTPOSSTARI:
2407: caseless = TRUE;
2408: codevalue -= OP_STARI - OP_STAR;
2409: /* Fall through */
2410: case OP_STAR:
2411: case OP_MINSTAR:
2412: case OP_POSSTAR:
2413: case OP_NOTSTAR:
2414: case OP_NOTMINSTAR:
2415: case OP_NOTPOSSTAR:
2416: ADD_ACTIVE(state_offset + dlen + 1, 0);
2417: if (clen > 0)
2418: {
1.1.1.4 misho 2419: pcre_uint32 otherd = NOTACHAR;
1.1 misho 2420: if (caseless)
2421: {
1.1.1.2 misho 2422: #ifdef SUPPORT_UTF
2423: if (utf && d >= 128)
1.1 misho 2424: {
2425: #ifdef SUPPORT_UCP
2426: otherd = UCD_OTHERCASE(d);
2427: #endif /* SUPPORT_UCP */
2428: }
2429: else
1.1.1.2 misho 2430: #endif /* SUPPORT_UTF */
2431: otherd = TABLE_GET(d, fcc, d);
1.1 misho 2432: }
2433: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2434: {
2435: if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2436: {
2437: active_count--; /* Remove non-match possibility */
2438: next_active_state--;
2439: }
2440: ADD_NEW(state_offset, 0);
2441: }
2442: }
2443: break;
2444:
2445: /*-----------------------------------------------------------------*/
2446: case OP_EXACTI:
2447: case OP_NOTEXACTI:
2448: caseless = TRUE;
2449: codevalue -= OP_STARI - OP_STAR;
2450: /* Fall through */
2451: case OP_EXACT:
2452: case OP_NOTEXACT:
2453: count = current_state->count; /* Number already matched */
2454: if (clen > 0)
2455: {
1.1.1.4 misho 2456: pcre_uint32 otherd = NOTACHAR;
1.1 misho 2457: if (caseless)
2458: {
1.1.1.2 misho 2459: #ifdef SUPPORT_UTF
2460: if (utf && d >= 128)
1.1 misho 2461: {
2462: #ifdef SUPPORT_UCP
2463: otherd = UCD_OTHERCASE(d);
2464: #endif /* SUPPORT_UCP */
2465: }
2466: else
1.1.1.2 misho 2467: #endif /* SUPPORT_UTF */
2468: otherd = TABLE_GET(d, fcc, d);
1.1 misho 2469: }
2470: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2471: {
1.1.1.4 misho 2472: if (++count >= (int)GET2(code, 1))
1.1.1.2 misho 2473: { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
1.1 misho 2474: else
2475: { ADD_NEW(state_offset, count); }
2476: }
2477: }
2478: break;
2479:
2480: /*-----------------------------------------------------------------*/
2481: case OP_UPTOI:
2482: case OP_MINUPTOI:
2483: case OP_POSUPTOI:
2484: case OP_NOTUPTOI:
2485: case OP_NOTMINUPTOI:
2486: case OP_NOTPOSUPTOI:
2487: caseless = TRUE;
2488: codevalue -= OP_STARI - OP_STAR;
2489: /* Fall through */
2490: case OP_UPTO:
2491: case OP_MINUPTO:
2492: case OP_POSUPTO:
2493: case OP_NOTUPTO:
2494: case OP_NOTMINUPTO:
2495: case OP_NOTPOSUPTO:
1.1.1.2 misho 2496: ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
1.1 misho 2497: count = current_state->count; /* Number already matched */
2498: if (clen > 0)
2499: {
1.1.1.4 misho 2500: pcre_uint32 otherd = NOTACHAR;
1.1 misho 2501: if (caseless)
2502: {
1.1.1.2 misho 2503: #ifdef SUPPORT_UTF
2504: if (utf && d >= 128)
1.1 misho 2505: {
2506: #ifdef SUPPORT_UCP
2507: otherd = UCD_OTHERCASE(d);
2508: #endif /* SUPPORT_UCP */
2509: }
2510: else
1.1.1.2 misho 2511: #endif /* SUPPORT_UTF */
2512: otherd = TABLE_GET(d, fcc, d);
1.1 misho 2513: }
2514: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2515: {
2516: if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2517: {
2518: active_count--; /* Remove non-match possibility */
2519: next_active_state--;
2520: }
1.1.1.4 misho 2521: if (++count >= (int)GET2(code, 1))
1.1.1.2 misho 2522: { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
1.1 misho 2523: else
2524: { ADD_NEW(state_offset, count); }
2525: }
2526: }
2527: break;
2528:
2529:
2530: /* ========================================================================== */
2531: /* These are the class-handling opcodes */
2532:
2533: case OP_CLASS:
2534: case OP_NCLASS:
2535: case OP_XCLASS:
2536: {
2537: BOOL isinclass = FALSE;
2538: int next_state_offset;
1.1.1.2 misho 2539: const pcre_uchar *ecode;
1.1 misho 2540:
2541: /* For a simple class, there is always just a 32-byte table, and we
2542: can set isinclass from it. */
2543:
2544: if (codevalue != OP_XCLASS)
2545: {
1.1.1.2 misho 2546: ecode = code + 1 + (32 / sizeof(pcre_uchar));
1.1 misho 2547: if (clen > 0)
2548: {
2549: isinclass = (c > 255)? (codevalue == OP_NCLASS) :
1.1.1.2 misho 2550: ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
1.1 misho 2551: }
2552: }
2553:
2554: /* An extended class may have a table or a list of single characters,
2555: ranges, or both, and it may be positive or negative. There's a
2556: function that sorts all this out. */
2557:
2558: else
2559: {
2560: ecode = code + GET(code, 1);
1.1.1.2 misho 2561: if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
1.1 misho 2562: }
2563:
2564: /* At this point, isinclass is set for all kinds of class, and ecode
2565: points to the byte after the end of the class. If there is a
2566: quantifier, this is where it will be. */
2567:
2568: next_state_offset = (int)(ecode - start_code);
2569:
2570: switch (*ecode)
2571: {
2572: case OP_CRSTAR:
2573: case OP_CRMINSTAR:
1.1.1.5 ! misho 2574: case OP_CRPOSSTAR:
1.1 misho 2575: ADD_ACTIVE(next_state_offset + 1, 0);
1.1.1.5 ! misho 2576: if (isinclass)
! 2577: {
! 2578: if (*ecode == OP_CRPOSSTAR)
! 2579: {
! 2580: active_count--; /* Remove non-match possibility */
! 2581: next_active_state--;
! 2582: }
! 2583: ADD_NEW(state_offset, 0);
! 2584: }
1.1 misho 2585: break;
2586:
2587: case OP_CRPLUS:
2588: case OP_CRMINPLUS:
1.1.1.5 ! misho 2589: case OP_CRPOSPLUS:
1.1 misho 2590: count = current_state->count; /* Already matched */
2591: if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
1.1.1.5 ! misho 2592: if (isinclass)
! 2593: {
! 2594: if (count > 0 && *ecode == OP_CRPOSPLUS)
! 2595: {
! 2596: active_count--; /* Remove non-match possibility */
! 2597: next_active_state--;
! 2598: }
! 2599: count++;
! 2600: ADD_NEW(state_offset, count);
! 2601: }
1.1 misho 2602: break;
2603:
2604: case OP_CRQUERY:
2605: case OP_CRMINQUERY:
1.1.1.5 ! misho 2606: case OP_CRPOSQUERY:
1.1 misho 2607: ADD_ACTIVE(next_state_offset + 1, 0);
1.1.1.5 ! misho 2608: if (isinclass)
! 2609: {
! 2610: if (*ecode == OP_CRPOSQUERY)
! 2611: {
! 2612: active_count--; /* Remove non-match possibility */
! 2613: next_active_state--;
! 2614: }
! 2615: ADD_NEW(next_state_offset + 1, 0);
! 2616: }
1.1 misho 2617: break;
2618:
2619: case OP_CRRANGE:
2620: case OP_CRMINRANGE:
1.1.1.5 ! misho 2621: case OP_CRPOSRANGE:
1.1 misho 2622: count = current_state->count; /* Already matched */
1.1.1.4 misho 2623: if (count >= (int)GET2(ecode, 1))
1.1.1.2 misho 2624: { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
1.1 misho 2625: if (isinclass)
2626: {
1.1.1.4 misho 2627: int max = (int)GET2(ecode, 1 + IMM2_SIZE);
1.1.1.5 ! misho 2628: if (*ecode == OP_CRPOSRANGE)
! 2629: {
! 2630: active_count--; /* Remove non-match possibility */
! 2631: next_active_state--;
! 2632: }
1.1 misho 2633: if (++count >= max && max != 0) /* Max 0 => no limit */
1.1.1.2 misho 2634: { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
1.1 misho 2635: else
2636: { ADD_NEW(state_offset, count); }
2637: }
2638: break;
2639:
2640: default:
2641: if (isinclass) { ADD_NEW(next_state_offset, 0); }
2642: break;
2643: }
2644: }
2645: break;
2646:
2647: /* ========================================================================== */
2648: /* These are the opcodes for fancy brackets of various kinds. We have
2649: to use recursion in order to handle them. The "always failing" assertion
2650: (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2651: though the other "backtracking verbs" are not supported. */
2652:
2653: case OP_FAIL:
2654: forced_fail++; /* Count FAILs for multiple states */
2655: break;
2656:
2657: case OP_ASSERT:
2658: case OP_ASSERT_NOT:
2659: case OP_ASSERTBACK:
2660: case OP_ASSERTBACK_NOT:
2661: {
2662: int rc;
2663: int local_offsets[2];
2664: int local_workspace[1000];
1.1.1.2 misho 2665: const pcre_uchar *endasscode = code + GET(code, 1);
1.1 misho 2666:
2667: while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2668:
2669: rc = internal_dfa_exec(
2670: md, /* static match data */
2671: code, /* this subexpression's code */
2672: ptr, /* where we currently are */
2673: (int)(ptr - start_subject), /* start offset */
2674: local_offsets, /* offset vector */
2675: sizeof(local_offsets)/sizeof(int), /* size of same */
2676: local_workspace, /* workspace vector */
2677: sizeof(local_workspace)/sizeof(int), /* size of same */
2678: rlevel); /* function recursion level */
2679:
2680: if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2681: if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2682: { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2683: }
2684: break;
2685:
2686: /*-----------------------------------------------------------------*/
2687: case OP_COND:
2688: case OP_SCOND:
2689: {
2690: int local_offsets[1000];
2691: int local_workspace[1000];
2692: int codelink = GET(code, 1);
2693: int condcode;
2694:
2695: /* Because of the way auto-callout works during compile, a callout item
2696: is inserted between OP_COND and an assertion condition. This does not
2697: happen for the other conditions. */
2698:
2699: if (code[LINK_SIZE+1] == OP_CALLOUT)
2700: {
2701: rrc = 0;
1.1.1.2 misho 2702: if (PUBL(callout) != NULL)
1.1 misho 2703: {
1.1.1.2 misho 2704: PUBL(callout_block) cb;
1.1 misho 2705: cb.version = 1; /* Version 1 of the callout block */
2706: cb.callout_number = code[LINK_SIZE+2];
2707: cb.offset_vector = offsets;
1.1.1.4 misho 2708: #if defined COMPILE_PCRE8
1.1 misho 2709: cb.subject = (PCRE_SPTR)start_subject;
1.1.1.4 misho 2710: #elif defined COMPILE_PCRE16
1.1.1.2 misho 2711: cb.subject = (PCRE_SPTR16)start_subject;
1.1.1.4 misho 2712: #elif defined COMPILE_PCRE32
2713: cb.subject = (PCRE_SPTR32)start_subject;
1.1.1.2 misho 2714: #endif
1.1 misho 2715: cb.subject_length = (int)(end_subject - start_subject);
2716: cb.start_match = (int)(current_subject - start_subject);
2717: cb.current_position = (int)(ptr - start_subject);
2718: cb.pattern_position = GET(code, LINK_SIZE + 3);
2719: cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2720: cb.capture_top = 1;
2721: cb.capture_last = -1;
2722: cb.callout_data = md->callout_data;
2723: cb.mark = NULL; /* No (*MARK) support */
1.1.1.2 misho 2724: if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
1.1 misho 2725: }
2726: if (rrc > 0) break; /* Fail this thread */
1.1.1.2 misho 2727: code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
1.1 misho 2728: }
2729:
2730: condcode = code[LINK_SIZE+1];
2731:
1.1.1.5 ! misho 2732: /* Back reference conditions and duplicate named recursion conditions
! 2733: are not supported */
1.1 misho 2734:
1.1.1.5 ! misho 2735: if (condcode == OP_CREF || condcode == OP_DNCREF ||
! 2736: condcode == OP_DNRREF)
1.1 misho 2737: return PCRE_ERROR_DFA_UCOND;
2738:
2739: /* The DEFINE condition is always false */
2740:
2741: if (condcode == OP_DEF)
2742: { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2743:
2744: /* The only supported version of OP_RREF is for the value RREF_ANY,
2745: which means "test if in any recursion". We can't test for specifically
2746: recursed groups. */
2747:
1.1.1.5 ! misho 2748: else if (condcode == OP_RREF)
1.1 misho 2749: {
1.1.1.2 misho 2750: int value = GET2(code, LINK_SIZE + 2);
1.1 misho 2751: if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2752: if (md->recursive != NULL)
1.1.1.2 misho 2753: { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
1.1 misho 2754: else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2755: }
2756:
2757: /* Otherwise, the condition is an assertion */
2758:
2759: else
2760: {
2761: int rc;
1.1.1.2 misho 2762: const pcre_uchar *asscode = code + LINK_SIZE + 1;
2763: const pcre_uchar *endasscode = asscode + GET(asscode, 1);
1.1 misho 2764:
2765: while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2766:
2767: rc = internal_dfa_exec(
2768: md, /* fixed match data */
2769: asscode, /* this subexpression's code */
2770: ptr, /* where we currently are */
2771: (int)(ptr - start_subject), /* start offset */
2772: local_offsets, /* offset vector */
2773: sizeof(local_offsets)/sizeof(int), /* size of same */
2774: local_workspace, /* workspace vector */
2775: sizeof(local_workspace)/sizeof(int), /* size of same */
2776: rlevel); /* function recursion level */
2777:
2778: if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2779: if ((rc >= 0) ==
2780: (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2781: { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2782: else
2783: { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2784: }
2785: }
2786: break;
2787:
2788: /*-----------------------------------------------------------------*/
2789: case OP_RECURSE:
2790: {
2791: dfa_recursion_info *ri;
2792: int local_offsets[1000];
2793: int local_workspace[1000];
1.1.1.2 misho 2794: const pcre_uchar *callpat = start_code + GET(code, 1);
1.1 misho 2795: int recno = (callpat == md->start_code)? 0 :
2796: GET2(callpat, 1 + LINK_SIZE);
2797: int rc;
2798:
2799: DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2800:
2801: /* Check for repeating a recursion without advancing the subject
2802: pointer. This should catch convoluted mutual recursions. (Some simple
2803: cases are caught at compile time.) */
2804:
2805: for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2806: if (recno == ri->group_num && ptr == ri->subject_position)
2807: return PCRE_ERROR_RECURSELOOP;
2808:
2809: /* Remember this recursion and where we started it so as to
2810: catch infinite loops. */
2811:
2812: new_recursive.group_num = recno;
2813: new_recursive.subject_position = ptr;
2814: new_recursive.prevrec = md->recursive;
2815: md->recursive = &new_recursive;
2816:
2817: rc = internal_dfa_exec(
2818: md, /* fixed match data */
2819: callpat, /* this subexpression's code */
2820: ptr, /* where we currently are */
2821: (int)(ptr - start_subject), /* start offset */
2822: local_offsets, /* offset vector */
2823: sizeof(local_offsets)/sizeof(int), /* size of same */
2824: local_workspace, /* workspace vector */
2825: sizeof(local_workspace)/sizeof(int), /* size of same */
2826: rlevel); /* function recursion level */
2827:
2828: md->recursive = new_recursive.prevrec; /* Done this recursion */
2829:
2830: DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2831: rc));
2832:
2833: /* Ran out of internal offsets */
2834:
2835: if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2836:
2837: /* For each successful matched substring, set up the next state with a
2838: count of characters to skip before trying it. Note that the count is in
2839: characters, not bytes. */
2840:
2841: if (rc > 0)
2842: {
2843: for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2844: {
2845: int charcount = local_offsets[rc+1] - local_offsets[rc];
1.1.1.4 misho 2846: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1.1.1.3 misho 2847: if (utf)
2848: {
2849: const pcre_uchar *p = start_subject + local_offsets[rc];
2850: const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2851: while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2852: }
1.1.1.2 misho 2853: #endif
1.1 misho 2854: if (charcount > 0)
2855: {
2856: ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2857: }
2858: else
2859: {
2860: ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2861: }
2862: }
2863: }
2864: else if (rc != PCRE_ERROR_NOMATCH) return rc;
2865: }
2866: break;
2867:
2868: /*-----------------------------------------------------------------*/
2869: case OP_BRAPOS:
2870: case OP_SBRAPOS:
2871: case OP_CBRAPOS:
2872: case OP_SCBRAPOS:
2873: case OP_BRAPOSZERO:
2874: {
2875: int charcount, matched_count;
1.1.1.2 misho 2876: const pcre_uchar *local_ptr = ptr;
1.1 misho 2877: BOOL allow_zero;
2878:
2879: if (codevalue == OP_BRAPOSZERO)
2880: {
2881: allow_zero = TRUE;
2882: codevalue = *(++code); /* Codevalue will be one of above BRAs */
2883: }
2884: else allow_zero = FALSE;
2885:
2886: /* Loop to match the subpattern as many times as possible as if it were
2887: a complete pattern. */
2888:
2889: for (matched_count = 0;; matched_count++)
2890: {
2891: int local_offsets[2];
2892: int local_workspace[1000];
2893:
2894: int rc = internal_dfa_exec(
2895: md, /* fixed match data */
2896: code, /* this subexpression's code */
2897: local_ptr, /* where we currently are */
2898: (int)(ptr - start_subject), /* start offset */
2899: local_offsets, /* offset vector */
2900: sizeof(local_offsets)/sizeof(int), /* size of same */
2901: local_workspace, /* workspace vector */
2902: sizeof(local_workspace)/sizeof(int), /* size of same */
2903: rlevel); /* function recursion level */
2904:
2905: /* Failed to match */
2906:
2907: if (rc < 0)
2908: {
2909: if (rc != PCRE_ERROR_NOMATCH) return rc;
2910: break;
2911: }
2912:
2913: /* Matched: break the loop if zero characters matched. */
2914:
2915: charcount = local_offsets[1] - local_offsets[0];
2916: if (charcount == 0) break;
2917: local_ptr += charcount; /* Advance temporary position ptr */
2918: }
2919:
2920: /* At this point we have matched the subpattern matched_count
2921: times, and local_ptr is pointing to the character after the end of the
2922: last match. */
2923:
2924: if (matched_count > 0 || allow_zero)
2925: {
1.1.1.2 misho 2926: const pcre_uchar *end_subpattern = code;
1.1 misho 2927: int next_state_offset;
2928:
2929: do { end_subpattern += GET(end_subpattern, 1); }
2930: while (*end_subpattern == OP_ALT);
2931: next_state_offset =
2932: (int)(end_subpattern - start_code + LINK_SIZE + 1);
2933:
2934: /* Optimization: if there are no more active states, and there
2935: are no new states yet set up, then skip over the subject string
2936: right here, to save looping. Otherwise, set up the new state to swing
2937: into action when the end of the matched substring is reached. */
2938:
2939: if (i + 1 >= active_count && new_count == 0)
2940: {
2941: ptr = local_ptr;
2942: clen = 0;
2943: ADD_NEW(next_state_offset, 0);
2944: }
2945: else
2946: {
1.1.1.2 misho 2947: const pcre_uchar *p = ptr;
2948: const pcre_uchar *pp = local_ptr;
1.1 misho 2949: charcount = (int)(pp - p);
1.1.1.4 misho 2950: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1.1.1.3 misho 2951: if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
1.1.1.2 misho 2952: #endif
1.1 misho 2953: ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2954: }
2955: }
2956: }
2957: break;
2958:
2959: /*-----------------------------------------------------------------*/
2960: case OP_ONCE:
2961: case OP_ONCE_NC:
2962: {
2963: int local_offsets[2];
2964: int local_workspace[1000];
2965:
2966: int rc = internal_dfa_exec(
2967: md, /* fixed match data */
2968: code, /* this subexpression's code */
2969: ptr, /* where we currently are */
2970: (int)(ptr - start_subject), /* start offset */
2971: local_offsets, /* offset vector */
2972: sizeof(local_offsets)/sizeof(int), /* size of same */
2973: local_workspace, /* workspace vector */
2974: sizeof(local_workspace)/sizeof(int), /* size of same */
2975: rlevel); /* function recursion level */
2976:
2977: if (rc >= 0)
2978: {
1.1.1.2 misho 2979: const pcre_uchar *end_subpattern = code;
1.1 misho 2980: int charcount = local_offsets[1] - local_offsets[0];
2981: int next_state_offset, repeat_state_offset;
2982:
2983: do { end_subpattern += GET(end_subpattern, 1); }
2984: while (*end_subpattern == OP_ALT);
2985: next_state_offset =
2986: (int)(end_subpattern - start_code + LINK_SIZE + 1);
2987:
2988: /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2989: arrange for the repeat state also to be added to the relevant list.
2990: Calculate the offset, or set -1 for no repeat. */
2991:
2992: repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2993: *end_subpattern == OP_KETRMIN)?
2994: (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2995:
2996: /* If we have matched an empty string, add the next state at the
2997: current character pointer. This is important so that the duplicate
2998: checking kicks in, which is what breaks infinite loops that match an
2999: empty string. */
3000:
3001: if (charcount == 0)
3002: {
3003: ADD_ACTIVE(next_state_offset, 0);
3004: }
3005:
3006: /* Optimization: if there are no more active states, and there
3007: are no new states yet set up, then skip over the subject string
3008: right here, to save looping. Otherwise, set up the new state to swing
3009: into action when the end of the matched substring is reached. */
3010:
3011: else if (i + 1 >= active_count && new_count == 0)
3012: {
3013: ptr += charcount;
3014: clen = 0;
3015: ADD_NEW(next_state_offset, 0);
3016:
3017: /* If we are adding a repeat state at the new character position,
3018: we must fudge things so that it is the only current state.
3019: Otherwise, it might be a duplicate of one we processed before, and
3020: that would cause it to be skipped. */
3021:
3022: if (repeat_state_offset >= 0)
3023: {
3024: next_active_state = active_states;
3025: active_count = 0;
3026: i = -1;
3027: ADD_ACTIVE(repeat_state_offset, 0);
3028: }
3029: }
3030: else
3031: {
1.1.1.4 misho 3032: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1.1.1.3 misho 3033: if (utf)
3034: {
3035: const pcre_uchar *p = start_subject + local_offsets[0];
3036: const pcre_uchar *pp = start_subject + local_offsets[1];
3037: while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
3038: }
1.1.1.2 misho 3039: #endif
1.1 misho 3040: ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
3041: if (repeat_state_offset >= 0)
3042: { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
3043: }
3044: }
3045: else if (rc != PCRE_ERROR_NOMATCH) return rc;
3046: }
3047: break;
3048:
3049:
3050: /* ========================================================================== */
3051: /* Handle callouts */
3052:
3053: case OP_CALLOUT:
3054: rrc = 0;
1.1.1.2 misho 3055: if (PUBL(callout) != NULL)
1.1 misho 3056: {
1.1.1.2 misho 3057: PUBL(callout_block) cb;
1.1 misho 3058: cb.version = 1; /* Version 1 of the callout block */
3059: cb.callout_number = code[1];
3060: cb.offset_vector = offsets;
1.1.1.4 misho 3061: #if defined COMPILE_PCRE8
1.1 misho 3062: cb.subject = (PCRE_SPTR)start_subject;
1.1.1.4 misho 3063: #elif defined COMPILE_PCRE16
1.1.1.2 misho 3064: cb.subject = (PCRE_SPTR16)start_subject;
1.1.1.4 misho 3065: #elif defined COMPILE_PCRE32
3066: cb.subject = (PCRE_SPTR32)start_subject;
1.1.1.2 misho 3067: #endif
1.1 misho 3068: cb.subject_length = (int)(end_subject - start_subject);
3069: cb.start_match = (int)(current_subject - start_subject);
3070: cb.current_position = (int)(ptr - start_subject);
3071: cb.pattern_position = GET(code, 2);
3072: cb.next_item_length = GET(code, 2 + LINK_SIZE);
3073: cb.capture_top = 1;
3074: cb.capture_last = -1;
3075: cb.callout_data = md->callout_data;
3076: cb.mark = NULL; /* No (*MARK) support */
1.1.1.2 misho 3077: if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
1.1 misho 3078: }
3079: if (rrc == 0)
1.1.1.2 misho 3080: { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
1.1 misho 3081: break;
3082:
3083:
3084: /* ========================================================================== */
3085: default: /* Unsupported opcode */
3086: return PCRE_ERROR_DFA_UITEM;
3087: }
3088:
3089: NEXT_ACTIVE_STATE: continue;
3090:
3091: } /* End of loop scanning active states */
3092:
3093: /* We have finished the processing at the current subject character. If no
3094: new states have been set for the next character, we have found all the
3095: matches that we are going to find. If we are at the top level and partial
3096: matching has been requested, check for appropriate conditions.
3097:
3098: The "forced_fail" variable counts the number of (*F) encountered for the
3099: character. If it is equal to the original active_count (saved in
3100: workspace[1]) it means that (*F) was found on every active state. In this
3101: case we don't want to give a partial match.
3102:
3103: The "could_continue" variable is true if a state could have continued but
3104: for the fact that the end of the subject was reached. */
3105:
3106: if (new_count <= 0)
3107: {
3108: if (rlevel == 1 && /* Top level, and */
1.1.1.3 misho 3109: could_continue && /* Some could go on, and */
1.1 misho 3110: forced_fail != workspace[1] && /* Not all forced fail & */
3111: ( /* either... */
3112: (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
3113: || /* or... */
3114: ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
3115: match_count < 0) /* no matches */
3116: ) && /* And... */
1.1.1.3 misho 3117: (
3118: partial_newline || /* Either partial NL */
3119: ( /* or ... */
3120: ptr >= end_subject && /* End of subject and */
3121: ptr > md->start_used_ptr) /* Inspected non-empty string */
3122: )
3123: )
1.1 misho 3124: match_count = PCRE_ERROR_PARTIAL;
3125: DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3126: "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
3127: rlevel*2-2, SP));
3128: break; /* In effect, "return", but see the comment below */
3129: }
3130:
3131: /* One or more states are active for the next character. */
3132:
3133: ptr += clen; /* Advance to next subject character */
3134: } /* Loop to move along the subject string */
3135:
3136: /* Control gets here from "break" a few lines above. We do it this way because
3137: if we use "return" above, we have compiler trouble. Some compilers warn if
3138: there's nothing here because they think the function doesn't return a value. On
3139: the other hand, if we put a dummy statement here, some more clever compilers
3140: complain that it can't be reached. Sigh. */
3141:
3142: return match_count;
3143: }
3144:
3145:
3146:
3147:
3148: /*************************************************
3149: * Execute a Regular Expression - DFA engine *
3150: *************************************************/
3151:
3152: /* This external function applies a compiled re to a subject string using a DFA
3153: engine. This function calls the internal function multiple times if the pattern
3154: is not anchored.
3155:
3156: Arguments:
3157: argument_re points to the compiled expression
3158: extra_data points to extra data or is NULL
3159: subject points to the subject string
3160: length length of subject string (may contain binary zeros)
3161: start_offset where to start in the subject string
3162: options option bits
3163: offsets vector of match offsets
3164: offsetcount size of same
3165: workspace workspace vector
3166: wscount size of same
3167:
3168: Returns: > 0 => number of match offset pairs placed in offsets
3169: = 0 => offsets overflowed; longest matches are present
3170: -1 => failed to match
3171: < -1 => some kind of unexpected problem
3172: */
3173:
1.1.1.4 misho 3174: #if defined COMPILE_PCRE8
1.1 misho 3175: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3176: pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3177: const char *subject, int length, int start_offset, int options, int *offsets,
3178: int offsetcount, int *workspace, int wscount)
1.1.1.4 misho 3179: #elif defined COMPILE_PCRE16
1.1.1.2 misho 3180: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3181: pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3182: PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3183: int offsetcount, int *workspace, int wscount)
1.1.1.4 misho 3184: #elif defined COMPILE_PCRE32
3185: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3186: pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
3187: PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
3188: int offsetcount, int *workspace, int wscount)
1.1.1.2 misho 3189: #endif
1.1 misho 3190: {
1.1.1.2 misho 3191: REAL_PCRE *re = (REAL_PCRE *)argument_re;
1.1 misho 3192: dfa_match_data match_block;
3193: dfa_match_data *md = &match_block;
1.1.1.2 misho 3194: BOOL utf, anchored, startline, firstline;
3195: const pcre_uchar *current_subject, *end_subject;
1.1 misho 3196: const pcre_study_data *study = NULL;
3197:
1.1.1.2 misho 3198: const pcre_uchar *req_char_ptr;
3199: const pcre_uint8 *start_bits = NULL;
3200: BOOL has_first_char = FALSE;
3201: BOOL has_req_char = FALSE;
3202: pcre_uchar first_char = 0;
3203: pcre_uchar first_char2 = 0;
3204: pcre_uchar req_char = 0;
3205: pcre_uchar req_char2 = 0;
1.1 misho 3206: int newline;
3207:
3208: /* Plausibility checks on the arguments; each kind of failure has its own error code */
3209:
3210: if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3211: if (re == NULL || subject == NULL || workspace == NULL ||
3212: (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3213: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3214: if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
1.1.1.4 misho 3215: if (length < 0) return PCRE_ERROR_BADLENGTH;
1.1 misho 3216: if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3217:
1.1.1.3 misho 3218: /* Check that the first field in the block is the magic number. If it is not,
3219: return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3220: REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3221: means that the pattern is likely compiled with different endianness. */
3222:
3223: if (re->magic_number != MAGIC_NUMBER)
3224: return re->magic_number == REVERSED_MAGIC_NUMBER?
3225: PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3226: if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3227:
3228: /* If restarting after a partial match, do some sanity checks on the contents
3229: of the workspace. */
3230:
3231: if ((options & PCRE_DFA_RESTART) != 0)
3232: {
3233: if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3234: workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3235: return PCRE_ERROR_DFA_BADRESTART;
3236: }
3237:
3238: /* Set up study, callout, and table data. Match limits are not accepted here: if either match-limit flag is set in the extra data, PCRE_ERROR_DFA_UMLIMIT is returned. */
1.1 misho 3239:
3240: md->tables = re->tables;
3241: md->callout_data = NULL;
3242:
3243: if (extra_data != NULL)
3244: {
3245: unsigned int flags = extra_data->flags;
3246: if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3247: study = (const pcre_study_data *)extra_data->study_data;
3248: if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3249: if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3250: return PCRE_ERROR_DFA_UMLIMIT;
3251: if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3252: md->callout_data = extra_data->callout_data;
3253: if ((flags & PCRE_EXTRA_TABLES) != 0)
3254: md->tables = extra_data->tables;
3255: }
3256:
3257: /* Set some local values. req_char_ptr starts one before the match point so that the first "p > req_char_ptr" test below succeeds; a restart (PCRE_DFA_RESTART) is treated as anchored. */
3258:
1.1.1.2 misho 3259: current_subject = (const pcre_uchar *)subject + start_offset;
3260: end_subject = (const pcre_uchar *)subject + length;
3261: req_char_ptr = current_subject - 1;
3262:
3263: #ifdef SUPPORT_UTF
1.1.1.4 misho 3264: /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
1.1.1.2 misho 3265: utf = (re->options & PCRE_UTF8) != 0;
1.1 misho 3266: #else
1.1.1.2 misho 3267: utf = FALSE;
1.1 misho 3268: #endif
3269:
3270: anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3271: (re->options & PCRE_ANCHORED) != 0;
3272:
3273: /* The remaining fixed data for passing around. */
3274:
1.1.1.2 misho 3275: md->start_code = (const pcre_uchar *)argument_re +
1.1 misho 3276: re->name_table_offset + re->name_count * re->name_entry_size;
1.1.1.2 misho 3277: md->start_subject = (const pcre_uchar *)subject;
1.1 misho 3278: md->end_subject = end_subject;
3279: md->start_offset = start_offset;
3280: md->moptions = options;
3281: md->poptions = re->options;
3282:
3283: /* If the BSR option is not set at match time, copy what was set
3284: at compile time. */
3285:
3286: if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3287: {
3288: if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3289: md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3290: #ifdef BSR_ANYCRLF
3291: else md->moptions |= PCRE_BSR_ANYCRLF;
3292: #endif
3293: }
3294:
3295: /* Handle different types of newline. The three bits give eight cases. If
3296: nothing is set at run time, whatever was used at compile time applies. */
3297:
3298: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3299: PCRE_NEWLINE_BITS)
3300: {
3301: case 0: newline = NEWLINE; break; /* Compile-time default */
3302: case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3303: case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3304: case PCRE_NEWLINE_CR+
3305: PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3306: case PCRE_NEWLINE_ANY: newline = -1; break;
3307: case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3308: default: return PCRE_ERROR_BADNEWLINE;
3309: }
3310:
3311: if (newline == -2)
3312: {
3313: md->nltype = NLTYPE_ANYCRLF;
3314: }
3315: else if (newline < 0)
3316: {
3317: md->nltype = NLTYPE_ANY;
3318: }
3319: else
3320: {
3321: md->nltype = NLTYPE_FIXED;
3322: if (newline > 255)
3323: {
3324: md->nllen = 2;
3325: md->nl[0] = (newline >> 8) & 255;
3326: md->nl[1] = newline & 255;
3327: }
3328: else
3329: {
3330: md->nllen = 1;
3331: md->nl[0] = newline;
3332: }
3333: }
3334:
3335: /* Check a UTF string if required. If it is invalid and offsetcount >= 2, the
3336: error offset and error code are passed back in offsets[0] and offsets[1]. */
3337:
1.1.1.2 misho 3338: #ifdef SUPPORT_UTF
3339: if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
1.1 misho 3340: {
3341: int erroroffset;
1.1.1.2 misho 3342: int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
1.1 misho 3343: if (errorcode != 0)
3344: {
3345: if (offsetcount >= 2)
3346: {
3347: offsets[0] = erroroffset;
3348: offsets[1] = errorcode;
3349: }
1.1.1.4 misho 3350: #if defined COMPILE_PCRE8
3351: return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ?
1.1 misho 3352: PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
1.1.1.4 misho 3353: #elif defined COMPILE_PCRE16
3354: return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ?
3355: PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
3356: #elif defined COMPILE_PCRE32
3357: return PCRE_ERROR_BADUTF32;
3358: #endif
1.1 misho 3359: }
1.1.1.4 misho 3360: #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
1.1 misho 3361: if (start_offset > 0 && start_offset < length &&
1.1.1.2 misho 3362: NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
1.1 misho 3363: return PCRE_ERROR_BADUTF8_OFFSET;
1.1.1.4 misho 3364: #endif
1.1 misho 3365: }
3366: #endif
3367:
3368: /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3369: is a feature that makes it possible to save compiled regex and re-use them
3370: in other programs later. */
3371:
1.1.1.2 misho 3372: if (md->tables == NULL) md->tables = PRIV(default_tables);
1.1 misho 3373:
1.1.1.2 misho 3374: /* The "must be at the start of a line" flags are used in a loop when finding
3375: where to start. */
1.1 misho 3376:
3377: startline = (re->flags & PCRE_STARTLINE) != 0;
3378: firstline = (re->options & PCRE_FIRSTLINE) != 0;
3379:
3380: /* Set up the first character to match, if available. The first_char value is
3381: never set for an anchored regular expression, but the anchoring may be forced
3382: at run time, so we have to test for anchoring. The first char may be unset for
3383: an unanchored pattern, of course. If there's no first char and the pattern was
3384: studied, there may be a bitmap of possible first characters. */
3385:
3386: if (!anchored)
3387: {
3388: if ((re->flags & PCRE_FIRSTSET) != 0)
3389: {
1.1.1.2 misho 3390: has_first_char = TRUE;
3391: first_char = first_char2 = (pcre_uchar)(re->first_char);
3392: if ((re->flags & PCRE_FCH_CASELESS) != 0)
3393: {
3394: first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3395: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3396: if (utf && first_char > 127)
3397: first_char2 = UCD_OTHERCASE(first_char);
3398: #endif
3399: }
1.1 misho 3400: }
3401: else
3402: {
3403: if (!startline && study != NULL &&
3404: (study->flags & PCRE_STUDY_MAPPED) != 0)
3405: start_bits = study->start_bits;
3406: }
3407: }
3408:
3409: /* For anchored or unanchored matches, there may be a "last known required
3410: character" set. */
3411:
3412: if ((re->flags & PCRE_REQCHSET) != 0)
3413: {
1.1.1.2 misho 3414: has_req_char = TRUE;
3415: req_char = req_char2 = (pcre_uchar)(re->req_char);
3416: if ((re->flags & PCRE_RCH_CASELESS) != 0)
3417: {
3418: req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3419: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3420: if (utf && req_char > 127)
3421: req_char2 = UCD_OTHERCASE(req_char);
3422: #endif
3423: }
1.1 misho 3424: }
3425:
3426: /* Call the main matching function, looping for a non-anchored regex after a
3427: failed match. If not restarting, perform certain optimizations at the start of
3428: a match. */
3429:
3430: for (;;)
3431: {
3432: int rc;
3433:
3434: if ((options & PCRE_DFA_RESTART) == 0)
3435: {
1.1.1.2 misho 3436: const pcre_uchar *save_end_subject = end_subject;
1.1 misho 3437:
3438: /* If firstline is TRUE, the start of the match is constrained to the first
3439: line of a multiline string. Implement this by temporarily adjusting
3440: end_subject so that we stop scanning at a newline. If the match fails at
3441: the newline, later code breaks this loop. */
3442:
3443: if (firstline)
3444: {
1.1.1.2 misho 3445: PCRE_PUCHAR t = current_subject;
3446: #ifdef SUPPORT_UTF
3447: if (utf)
1.1 misho 3448: {
3449: while (t < md->end_subject && !IS_NEWLINE(t))
3450: {
3451: t++;
1.1.1.2 misho 3452: ACROSSCHAR(t < end_subject, *t, t++);
1.1 misho 3453: }
3454: }
3455: else
3456: #endif
3457: while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3458: end_subject = t;
3459: }
3460:
3461: /* There are some optimizations that avoid running the match if a known
3462: starting point is not found. However, there is an option that disables
3463: these, for testing and for ensuring that all callouts do actually occur.
3464: The option can be set in the regex by (*NO_START_OPT) or passed in
3465: match-time options. */
3466:
3467: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3468: {
1.1.1.2 misho 3469: /* Advance to a known first char. */
1.1 misho 3470:
1.1.1.2 misho 3471: if (has_first_char)
1.1 misho 3472: {
1.1.1.2 misho 3473: if (first_char != first_char2)
1.1.1.4 misho 3474: {
3475: pcre_uchar csc;
1.1 misho 3476: while (current_subject < end_subject &&
1.1.1.4 misho 3477: (csc = RAWUCHARTEST(current_subject)) != first_char && csc != first_char2)
1.1 misho 3478: current_subject++;
1.1.1.4 misho 3479: }
1.1 misho 3480: else
3481: while (current_subject < end_subject &&
1.1.1.4 misho 3482: RAWUCHARTEST(current_subject) != first_char)
1.1 misho 3483: current_subject++;
3484: }
3485:
3486: /* Or to just after a linebreak for a multiline match if possible */
3487:
3488: else if (startline)
3489: {
3490: if (current_subject > md->start_subject + start_offset)
3491: {
1.1.1.2 misho 3492: #ifdef SUPPORT_UTF
3493: if (utf)
1.1 misho 3494: {
3495: while (current_subject < end_subject &&
3496: !WAS_NEWLINE(current_subject))
3497: {
3498: current_subject++;
1.1.1.2 misho 3499: ACROSSCHAR(current_subject < end_subject, *current_subject,
3500: current_subject++);
1.1 misho 3501: }
3502: }
3503: else
3504: #endif
3505: while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3506: current_subject++;
3507:
3508: /* If we have just passed a CR and the newline option is ANY or
3509: ANYCRLF, and we are now at a LF, advance the match position by one
3510: more character. */
3511:
1.1.1.4 misho 3512: if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
1.1 misho 3513: (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3514: current_subject < end_subject &&
1.1.1.4 misho 3515: RAWUCHARTEST(current_subject) == CHAR_NL)
1.1 misho 3516: current_subject++;
3517: }
3518: }
3519:
3520: /* Or to a non-unique first char after study */
3521:
3522: else if (start_bits != NULL)
3523: {
3524: while (current_subject < end_subject)
3525: {
1.1.1.4 misho 3526: register pcre_uint32 c = RAWUCHARTEST(current_subject);
1.1.1.2 misho 3527: #ifndef COMPILE_PCRE8
3528: if (c > 255) c = 255;
3529: #endif
1.1 misho 3530: if ((start_bits[c/8] & (1 << (c&7))) == 0)
3531: {
3532: current_subject++;
1.1.1.2 misho 3533: #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3534: /* In non 8-bit mode, the iteration will stop for
3535: characters > 255 at the beginning or not stop at all. */
3536: if (utf)
3537: ACROSSCHAR(current_subject < end_subject, *current_subject,
3538: current_subject++);
1.1 misho 3539: #endif
3540: }
3541: else break;
3542: }
3543: }
3544: }
3545:
3546: /* Restore fudged end_subject */
3547:
3548: end_subject = save_end_subject;
3549:
3550: /* The following two optimizations are disabled for partial matching or if
3551: disabling is explicitly requested (and of course, by the test above, this
3552: code is not obeyed when restarting after a partial match). */
3553:
3554: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3555: (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3556: {
3557: /* If the pattern was studied, a minimum subject length may be set. This
3558: is a lower bound; no actual string of that length may actually match the
3559: pattern. Although the value is, strictly, in characters, we treat it as
3560: bytes to avoid spending too much time in this optimization. */
3561:
3562: if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3563: (pcre_uint32)(end_subject - current_subject) < study->minlength)
3564: return PCRE_ERROR_NOMATCH;
3565:
1.1.1.2 misho 3566: /* If req_char is set, we know that that character must appear in the
3567: subject for the match to succeed. If the first character is set, req_char
1.1 misho 3568: must be later in the subject; otherwise the test starts at the match
3569: point. This optimization can save a huge amount of work in patterns with
3570: nested unlimited repeats that aren't going to match. Writing separate
3571: code for cased/caseless versions makes it go faster, as does using an
3572: autoincrement and backing off on a match.
3573:
3574: HOWEVER: when the subject string is very, very long, searching to its end
3575: can take a long time, and give bad performance on quite ordinary
3576: patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3577: string... so we don't do this when the string is sufficiently long. */
3578:
1.1.1.2 misho 3579: if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
1.1 misho 3580: {
1.1.1.2 misho 3581: register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
1.1 misho 3582:
3583: /* We don't need to repeat the search if we haven't yet reached the
3584: place we found it at last time. */
3585:
1.1.1.2 misho 3586: if (p > req_char_ptr)
1.1 misho 3587: {
1.1.1.2 misho 3588: if (req_char != req_char2)
1.1 misho 3589: {
3590: while (p < end_subject)
3591: {
1.1.1.4 misho 3592: register pcre_uint32 pp = RAWUCHARINCTEST(p);
1.1.1.2 misho 3593: if (pp == req_char || pp == req_char2) { p--; break; }
1.1 misho 3594: }
3595: }
3596: else
3597: {
3598: while (p < end_subject)
3599: {
1.1.1.4 misho 3600: if (RAWUCHARINCTEST(p) == req_char) { p--; break; }
1.1 misho 3601: }
3602: }
3603:
3604: /* If we can't find the required character, break the matching loop,
3605: which will cause a return of PCRE_ERROR_NOMATCH. */
3606:
3607: if (p >= end_subject) break;
3608:
3609: /* If we have found the required character, save the point where we
3610: found it, so that we don't search again next time round the loop if
3611: the start hasn't passed this character yet. */
3612:
1.1.1.2 misho 3613: req_char_ptr = p;
1.1 misho 3614: }
3615: }
3616: }
3617: } /* End of optimizations that are done when not restarting */
3618:
3619: /* OK, now we can do the business */
3620:
3621: md->start_used_ptr = current_subject;
3622: md->recursive = NULL;
3623:
3624: rc = internal_dfa_exec(
3625: md, /* fixed match data */
3626: md->start_code, /* this subexpression's code */
3627: current_subject, /* where we currently are */
3628: start_offset, /* start offset in subject */
3629: offsets, /* offset vector */
3630: offsetcount, /* size of same */
3631: workspace, /* workspace vector */
3632: wscount, /* size of same */
3633: 0); /* function recurse level */
3634:
3635: /* Anything other than "no match" means we are done, always; otherwise, carry
3636: on only if not anchored. For a partial match, offsets are set if there is room. */
3637:
1.1.1.4 misho 3638: if (rc != PCRE_ERROR_NOMATCH || anchored)
3639: {
3640: if (rc == PCRE_ERROR_PARTIAL && offsetcount >= 2)
3641: {
3642: offsets[0] = (int)(md->start_used_ptr - (PCRE_PUCHAR)subject);
3643: offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
3644: if (offsetcount > 2)
3645: offsets[2] = (int)(current_subject - (PCRE_PUCHAR)subject);
3646: }
3647: return rc;
3648: }
1.1 misho 3649:
3650: /* Advance to the next subject character unless we are at the end of a line
3651: and firstline is set. */
3652:
3653: if (firstline && IS_NEWLINE(current_subject)) break;
3654: current_subject++;
1.1.1.2 misho 3655: #ifdef SUPPORT_UTF
3656: if (utf)
1.1 misho 3657: {
1.1.1.2 misho 3658: ACROSSCHAR(current_subject < end_subject, *current_subject,
3659: current_subject++);
1.1 misho 3660: }
1.1.1.2 misho 3661: #endif
1.1 misho 3662: if (current_subject > end_subject) break;
3663:
3664: /* If we have just passed a CR and we are now at a LF, and the pattern does
3665: not contain any explicit matches for \r or \n, and the newline option is CRLF
3666: or ANY or ANYCRLF, advance the match position by one more character. */
3667:
1.1.1.4 misho 3668: if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
1.1 misho 3669: current_subject < end_subject &&
1.1.1.4 misho 3670: RAWUCHARTEST(current_subject) == CHAR_NL &&
1.1 misho 3671: (re->flags & PCRE_HASCRORLF) == 0 &&
3672: (md->nltype == NLTYPE_ANY ||
3673: md->nltype == NLTYPE_ANYCRLF ||
3674: md->nllen == 2))
3675: current_subject++;
3676:
3677: } /* "Bumpalong" loop */
3678:
3679: return PCRE_ERROR_NOMATCH;
3680: }
3681:
3682: /* End of pcre_dfa_exec.c */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>