Annotation of embedaddon/pcre/pcre_dfa_exec.c, revision 1.1.1.2
1.1 misho 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /* PCRE is a library of functions to support regular expressions whose syntax
6: and semantics are as close as possible to those of the Perl 5 language (but see
7: below for why this module is different).
8:
9: Written by Philip Hazel
1.1.1.2 ! misho 10: Copyright (c) 1997-2012 University of Cambridge
1.1 misho 11:
12: -----------------------------------------------------------------------------
13: Redistribution and use in source and binary forms, with or without
14: modification, are permitted provided that the following conditions are met:
15:
16: * Redistributions of source code must retain the above copyright notice,
17: this list of conditions and the following disclaimer.
18:
19: * Redistributions in binary form must reproduce the above copyright
20: notice, this list of conditions and the following disclaimer in the
21: documentation and/or other materials provided with the distribution.
22:
23: * Neither the name of the University of Cambridge nor the names of its
24: contributors may be used to endorse or promote products derived from
25: this software without specific prior written permission.
26:
27: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37: POSSIBILITY OF SUCH DAMAGE.
38: -----------------------------------------------------------------------------
39: */
40:
41:
42: /* This module contains the external function pcre_dfa_exec(), which is an
43: alternative matching function that uses a sort of DFA algorithm (not a true
44: FSM). This is NOT Perl-compatible, but it has advantages in certain
45: applications. */
46:
47:
48: /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49: the performance of his patterns greatly. I could not use it as it stood, as it
50: was not thread safe, and made assumptions about pattern sizes. Also, it caused
51: test 7 to loop, and test 9 to crash with a segfault.
52:
53: The issue is the check for duplicate states, which is done by a simple linear
54: search up the state list. (Grep for "duplicate" below to find the code.) For
55: many patterns, there will never be many states active at one time, so a simple
56: linear search is fine. In patterns that have many active states, it might be a
57: bottleneck. The suggested code used an indexing scheme to remember which states
58: had previously been used for each character, and avoided the linear search when
59: it knew there was no chance of a duplicate. This was implemented when adding
60: states to the state lists.
61:
62: I wrote some thread-safe, not-limited code to try something similar at the time
63: of checking for duplicates (instead of when adding states), using index vectors
64: on the stack. It did give a 13% improvement with one specially constructed
65: pattern for certain subject strings, but on other strings and on many of the
66: simpler patterns in the test suite it did worse. The major problem, I think,
67: was the extra time to initialize the index. This had to be done for each call
68: of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69: only once - I suspect this was the cause of the problems with the tests.)
70:
71: Overall, I concluded that the gains in some cases did not outweigh the losses
72: in others, so I abandoned this code. */
73:
74:
75:
76: #ifdef HAVE_CONFIG_H
77: #include "config.h"
78: #endif
79:
80: #define NLBLOCK md /* Block containing newline information */
81: #define PSSTART start_subject /* Field containing processed string start */
82: #define PSEND end_subject /* Field containing processed string end */
83:
84: #include "pcre_internal.h"
85:
86:
87: /* For use to indent debugging output */
88:
/* SP is the source string for the "%.*s" conversions in the DPRINTF/printf
debug calls in this file: the precision argument (rlevel*2-2) selects how many
leading characters of SP are printed, giving indentation by recursion level.
NOTE(review): the literal shows as a single space in this listing; the
indentation can never be wider than the literal itself - confirm its length
against the original source. */
89: #define SP " "
90:
91:
92: /*************************************************
93: * Code parameters and static tables *
94: *************************************************/
95:
96: /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97: into others, under special conditions. A gap of 20 between the blocks should be
98: enough. The resulting opcodes don't have to be less than 256 because they are
99: never stored, so we push them well clear of the normal opcodes. */
100:
/* One of these is added to codevalue in internal_dfa_exec() when the opcode is
a type repeat (codevalue >= OP_TYPESTAR) and its type operand is the item named
in the trailing comment, so that the combination gets its own switch value. */
101: #define OP_PROP_EXTRA 300 /* operand is OP_PROP or OP_NOTPROP */
102: #define OP_EXTUNI_EXTRA 320 /* operand is OP_EXTUNI */
103: #define OP_ANYNL_EXTRA 340 /* operand is OP_ANYNL */
104: #define OP_HSPACE_EXTRA 360 /* operand is OP_HSPACE or OP_NOT_HSPACE */
105: #define OP_VSPACE_EXTRA 380 /* operand is OP_VSPACE or OP_NOT_VSPACE */
106:
107:
108: /* This table identifies those opcodes that are followed immediately by a
109: character that is to be tested in some way. This makes it possible to
110: centralize the loading of these characters. In the case of Type * etc, the
111: "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112: small value. Non-zero values in the table are the offsets from the opcode where
113: the character is to be found. ***NOTE*** If the start of this table is
114: modified, the three tables that follow must also be modified. */
115:
/* coptable is indexed by opcode value. A zero entry means the opcode is not
followed by an inline character (or type code) that needs loading; a non-zero
entry is the offset from the opcode at which that item is found, so the matcher
can fetch it as code[coptable[codevalue]]. The bounded and exact repeats use
1+IMM2_SIZE rather than 1 (presumably because an IMM2_SIZE-wide repeat count
sits between the opcode and the character - confirm against the opcode layout
in pcre_internal.h). The number of entries must equal OP_TABLE_LENGTH: a pair
of case labels in internal_dfa_exec() is written so that they collide, and the
compile fails, if sizeof(coptable) differs from OP_TABLE_LENGTH. */
1.1.1.2 ! misho 116: static const pcre_uint8 coptable[] = {
1.1 misho 117: 0, /* End */
118: 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119: 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120: 0, 0, 0, /* Any, AllAny, Anybyte */
121: 0, 0, /* \P, \p */
122: 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123: 0, /* \X */
124: 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
125: 1, /* Char */
126: 1, /* Chari */
127: 1, /* not */
128: 1, /* noti */
129: /* Positive single-char repeats */
130: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
1.1.1.2 ! misho 131: 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
! 132: 1+IMM2_SIZE, /* exact */
! 133: 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
1.1 misho 134: 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
1.1.1.2 ! misho 135: 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
! 136: 1+IMM2_SIZE, /* exact I */
! 137: 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
1.1 misho 138: /* Negative single-char repeats - only for chars < 256 */
139: 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
1.1.1.2 ! misho 140: 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
! 141: 1+IMM2_SIZE, /* NOT exact */
! 142: 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
1.1 misho 143: 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
1.1.1.2 ! misho 144: 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
! 145: 1+IMM2_SIZE, /* NOT exact I */
! 146: 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
1.1 misho 147: /* Positive type repeats */
148: 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
1.1.1.2 ! misho 149: 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
! 150: 1+IMM2_SIZE, /* Type exact */
! 151: 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
1.1 misho 152: /* Character class & ref repeats */
153: 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
154: 0, 0, /* CRRANGE, CRMINRANGE */
155: 0, /* CLASS */
156: 0, /* NCLASS */
157: 0, /* XCLASS - variable length */
158: 0, /* REF */
159: 0, /* REFI */
160: 0, /* RECURSE */
161: 0, /* CALLOUT */
162: 0, /* Alt */
163: 0, /* Ket */
164: 0, /* KetRmax */
165: 0, /* KetRmin */
166: 0, /* KetRpos */
167: 0, /* Reverse */
168: 0, /* Assert */
169: 0, /* Assert not */
170: 0, /* Assert behind */
171: 0, /* Assert behind not */
172: 0, 0, /* ONCE, ONCE_NC */
173: 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
174: 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
175: 0, 0, /* CREF, NCREF */
176: 0, 0, /* RREF, NRREF */
177: 0, /* DEF */
178: 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
179: 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
180: 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
181: 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
182: 0, 0 /* CLOSE, SKIPZERO */
183: };
184:
185: /* This table identifies those opcodes that inspect a character. It is used to
186: remember the fact that a character could have been inspected when the end of
187: the subject is reached. ***NOTE*** If the start of this table is modified, the
188: two tables that follow must also be modified. */
189:
/* poptable is indexed by opcode value, in the same order as coptable. A
non-zero entry marks an opcode that inspects a subject character. The matcher
consults it only when the end of the subject has been reached (clen == 0): if
the current opcode has a non-zero entry, could_continue is set, recording that
more subject data might have allowed the match to proceed. As with coptable,
the entry count must equal OP_TABLE_LENGTH; the same pair of case labels in
internal_dfa_exec() enforces this at compile time. */
1.1.1.2 ! misho 190: static const pcre_uint8 poptable[] = {
1.1 misho 191: 0, /* End */
192: 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
193: 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
194: 1, 1, 1, /* Any, AllAny, Anybyte */
195: 1, 1, /* \P, \p */
196: 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
197: 1, /* \X */
198: 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
199: 1, /* Char */
200: 1, /* Chari */
201: 1, /* not */
202: 1, /* noti */
203: /* Positive single-char repeats */
204: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
205: 1, 1, 1, /* upto, minupto, exact */
206: 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
207: 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
208: 1, 1, 1, /* upto I, minupto I, exact I */
209: 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
210: /* Negative single-char repeats - only for chars < 256 */
211: 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
212: 1, 1, 1, /* NOT upto, minupto, exact */
213: 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
214: 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
215: 1, 1, 1, /* NOT upto I, minupto I, exact I */
216: 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
217: /* Positive type repeats */
218: 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
219: 1, 1, 1, /* Type upto, minupto, exact */
220: 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
221: /* Character class & ref repeats */
222: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
223: 1, 1, /* CRRANGE, CRMINRANGE */
224: 1, /* CLASS */
225: 1, /* NCLASS */
226: 1, /* XCLASS - variable length */
227: 0, /* REF */
228: 0, /* REFI */
229: 0, /* RECURSE */
230: 0, /* CALLOUT */
231: 0, /* Alt */
232: 0, /* Ket */
233: 0, /* KetRmax */
234: 0, /* KetRmin */
235: 0, /* KetRpos */
236: 0, /* Reverse */
237: 0, /* Assert */
238: 0, /* Assert not */
239: 0, /* Assert behind */
240: 0, /* Assert behind not */
241: 0, 0, /* ONCE, ONCE_NC */
242: 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
243: 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
244: 0, 0, /* CREF, NCREF */
245: 0, 0, /* RREF, NRREF */
246: 0, /* DEF */
247: 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
248: 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
249: 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
250: 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
251: 0, 0 /* CLOSE, SKIPZERO */
252: };
253:
254: /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
255: and \w */
256:
/* toptable1: one ctype mask per opcode, laid out in the same order as the
start of the opcode tables above. The first six entries line up with End, \A,
\G, \K, \B, \b and are zero; the next six line up with \D, \d, \S, \s, \W, \w
and hold the ctype bit that the type tests; the last two are OP_ANY and
OP_ALLANY. NOTE(review): the code that reads this table is outside this
section - presumably the test is
((ctypes[c] & toptable1[op]) ^ toptable2[op]) != 0; confirm. */
1.1.1.2 ! misho 257: static const pcre_uint8 toptable1[] = {
1.1 misho 258: 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b - no ctype test */
259: ctype_digit, ctype_digit, /* \D, \d */
260: ctype_space, ctype_space, /* \S, \s */
261: ctype_word, ctype_word, /* \W, \w */
262: 0, 0 /* OP_ANY, OP_ALLANY */
263: };
264:
/* toptable2: companion to toptable1, with the same indexing. The negated types
(\D, \S, \W) carry the ctype bit here while the positive types (\d, \s, \w)
carry 0; OP_ANY and OP_ALLANY carry 1. NOTE(review): presumably this value is
XORed with the masked ctype bits so that negated types succeed when the bit is
absent and ANY/ALLANY always succeed - the use is not visible in this section;
confirm. */
1.1.1.2 ! misho 265: static const pcre_uint8 toptable2[] = {
1.1 misho 266: 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
267: ctype_digit, 0, /* \D, \d */
268: ctype_space, 0, /* \S, \s */
269: ctype_word, 0, /* \W, \w */
270: 1, 1 /* OP_ANY, OP_ALLANY */
271: };
272:
273:
274: /* Structure for holding data about a particular state, which is in effect the
275: current data for an active path through the match tree. It must consist
276: entirely of ints because the working vector we are passed, and which we put
277: these structures in, is a vector of ints. */
278:
/* One entry in the active or new state list. Every member must be an int,
because the lists are laid out directly in the caller's int workspace. */
279: typedef struct stateblock {
280: int offset; /* Offset of the opcode from start_code; a negative value
                  means "hold off entering state -offset" (see data) */
281: int count; /* Count for repeats; compared together with offset when
                 looking for duplicate states */
282: int data; /* Extra value used by some states; for a negative offset it
                is the number of characters still to be skipped */
283: } stateblock;
284:
/* Number of workspace ints occupied by one stateblock; used to turn the
workspace size (in ints) into a capacity in states. */
285: #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
286:
287:
288: #ifdef PCRE_DEBUG
289: /*************************************************
290: * Print character string *
291: *************************************************/
292:
293: /* Character string printing function for debugging.
294:
295: Arguments:
296: p points to string
297: length number of bytes
298: f where to print
299:
300: Returns: nothing
301: */
302:
303: static void
1.1.1.2 ! misho 304: pchars(const pcre_uchar *p, int length, FILE *f)
1.1 misho 305: {
306: int c;
307: while (length-- > 0)
308: {
309: if (isprint(c = *(p++)))
310: fprintf(f, "%c", c);
311: else
312: fprintf(f, "\\x%02x", c);
313: }
314: }
315: #endif
316:
317:
318:
319: /*************************************************
320: * Execute a Regular Expression - DFA engine *
321: *************************************************/
322:
323: /* This internal function applies a compiled pattern to a subject string,
324: starting at a given point, using a DFA engine. This function is called from the
325: external one, possibly multiple times if the pattern is not anchored. The
326: function calls itself recursively for some kinds of subpattern.
327:
328: Arguments:
329: md the match_data block with fixed information
330: this_start_code the opening bracket of this subexpression's code
331: current_subject where we currently are in the subject string
332: start_offset start offset in the subject string
333: offsets vector to contain the matching string offsets
334: offsetcount size of same
335: workspace vector of workspace
336: wscount size of same
337: rlevel function call recursion level
338:
339: Returns: > 0 => number of match offset pairs placed in offsets
340: = 0 => offsets overflowed; longest matches are present
341: -1 => failed to match
342: < -1 => some kind of unexpected problem
343:
344: The following macros are used for adding states to the two state vectors (one
345: for the current character, one for the following character). */
346:
/* Append state (offset x, count y) to the active list. If the list is already
full, the enclosing function returns PCRE_ERROR_DFA_WSSIZE. Must be used as a
statement followed by a semicolon; x and y are evaluated more than once when
debugging output is enabled. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
356:
/* Append state (offset x, count y, data z) to the active list. If the list is
already full, the enclosing function returns PCRE_ERROR_DFA_WSSIZE. Must be
used as a statement followed by a semicolon; the arguments are evaluated more
than once when debugging output is enabled. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->data = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
367:
/* Append state (offset x, count y) to the new list, i.e. the states to be
considered for the following character. If the list is already full, the
enclosing function returns PCRE_ERROR_DFA_WSSIZE. Must be used as a statement
followed by a semicolon; x and y are evaluated more than once when debugging
output is enabled. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
377:
/* Append state (offset x, count y, data z) to the new list, i.e. the states to
be considered for the following character. If the list is already full, the
enclosing function returns PCRE_ERROR_DFA_WSSIZE. Must be used as a statement
followed by a semicolon; the arguments are evaluated more than once when
debugging output is enabled. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->data = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
388:
389: /* And now, here is the code */
390:
391: static int
392: internal_dfa_exec(
393: dfa_match_data *md,
1.1.1.2 ! misho 394: const pcre_uchar *this_start_code,
! 395: const pcre_uchar *current_subject,
1.1 misho 396: int start_offset,
397: int *offsets,
398: int offsetcount,
399: int *workspace,
400: int wscount,
401: int rlevel)
402: {
403: stateblock *active_states, *new_states, *temp_states;
404: stateblock *next_active_state, *next_new_state;
405:
1.1.1.2 ! misho 406: const pcre_uint8 *ctypes, *lcc, *fcc;
! 407: const pcre_uchar *ptr;
! 408: const pcre_uchar *end_code, *first_op;
1.1 misho 409:
410: dfa_recursion_info new_recursive;
411:
412: int active_count, new_count, match_count;
413:
414: /* Some fields in the md block are frequently referenced, so we load them into
415: independent variables in the hope that this will perform better. */
416:
1.1.1.2 ! misho 417: const pcre_uchar *start_subject = md->start_subject;
! 418: const pcre_uchar *end_subject = md->end_subject;
! 419: const pcre_uchar *start_code = md->start_code;
1.1 misho 420:
1.1.1.2 ! misho 421: #ifdef SUPPORT_UTF
! 422: BOOL utf = (md->poptions & PCRE_UTF8) != 0;
1.1 misho 423: #else
1.1.1.2 ! misho 424: BOOL utf = FALSE;
1.1 misho 425: #endif
426:
427: rlevel++;
428: offsetcount &= (-2);
429:
430: wscount -= 2;
431: wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
432: (2 * INTS_PER_STATEBLOCK);
433:
434: DPRINTF(("\n%.*s---------------------\n"
435: "%.*sCall to internal_dfa_exec f=%d\n",
436: rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
437:
438: ctypes = md->tables + ctypes_offset;
439: lcc = md->tables + lcc_offset;
440: fcc = md->tables + fcc_offset;
441:
442: match_count = PCRE_ERROR_NOMATCH; /* A negative number */
443:
444: active_states = (stateblock *)(workspace + 2);
445: next_new_state = new_states = active_states + wscount;
446: new_count = 0;
447:
448: first_op = this_start_code + 1 + LINK_SIZE +
449: ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
1.1.1.2 ! misho 450: *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
! 451: ? IMM2_SIZE:0);
1.1 misho 452:
453: /* The first thing in any (sub) pattern is a bracket of some sort. Push all
454: the alternative states onto the list, and find out where the end is. This
455: makes it possible to use this function recursively, when we want to stop at a
456: matching internal ket rather than at the end.
457:
458: If the first opcode in the first alternative is OP_REVERSE, we are dealing with
459: a backward assertion. In that case, we have to find out the maximum amount to
460: move back, and set up each alternative appropriately. */
461:
462: if (*first_op == OP_REVERSE)
463: {
464: int max_back = 0;
465: int gone_back;
466:
467: end_code = this_start_code;
468: do
469: {
470: int back = GET(end_code, 2+LINK_SIZE);
471: if (back > max_back) max_back = back;
472: end_code += GET(end_code, 1);
473: }
474: while (*end_code == OP_ALT);
475:
476: /* If we can't go back the amount required for the longest lookbehind
477: pattern, go back as far as we can; some alternatives may still be viable. */
478:
1.1.1.2 ! misho 479: #ifdef SUPPORT_UTF
1.1 misho 480: /* In character mode we have to step back character by character */
481:
1.1.1.2 ! misho 482: if (utf)
1.1 misho 483: {
484: for (gone_back = 0; gone_back < max_back; gone_back++)
485: {
486: if (current_subject <= start_subject) break;
487: current_subject--;
1.1.1.2 ! misho 488: ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
1.1 misho 489: }
490: }
491: else
492: #endif
493:
494: /* In byte-mode we can do this quickly. */
495:
496: {
497: gone_back = (current_subject - max_back < start_subject)?
498: (int)(current_subject - start_subject) : max_back;
499: current_subject -= gone_back;
500: }
501:
502: /* Save the earliest consulted character */
503:
504: if (current_subject < md->start_used_ptr)
505: md->start_used_ptr = current_subject;
506:
507: /* Now we can process the individual branches. */
508:
509: end_code = this_start_code;
510: do
511: {
512: int back = GET(end_code, 2+LINK_SIZE);
513: if (back <= gone_back)
514: {
515: int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
516: ADD_NEW_DATA(-bstate, 0, gone_back - back);
517: }
518: end_code += GET(end_code, 1);
519: }
520: while (*end_code == OP_ALT);
521: }
522:
523: /* This is the code for a "normal" subpattern (not a backward assertion). The
524: start of a whole pattern is always one of these. If we are at the top level,
525: we may be asked to restart matching from the same point that we reached for a
526: previous partial match. We still have to scan through the top-level branches to
527: find the end state. */
528:
529: else
530: {
531: end_code = this_start_code;
532:
533: /* Restarting */
534:
535: if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
536: {
537: do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
538: new_count = workspace[1];
539: if (!workspace[0])
540: memcpy(new_states, active_states, new_count * sizeof(stateblock));
541: }
542:
543: /* Not restarting */
544:
545: else
546: {
547: int length = 1 + LINK_SIZE +
548: ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
1.1.1.2 ! misho 549: *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
! 550: ? IMM2_SIZE:0);
1.1 misho 551: do
552: {
553: ADD_NEW((int)(end_code - start_code + length), 0);
554: end_code += GET(end_code, 1);
555: length = 1 + LINK_SIZE;
556: }
557: while (*end_code == OP_ALT);
558: }
559: }
560:
561: workspace[0] = 0; /* Bit indicating which vector is current */
562:
1.1.1.2 ! misho 563: DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
1.1 misho 564:
565: /* Loop for scanning the subject */
566:
567: ptr = current_subject;
568: for (;;)
569: {
570: int i, j;
571: int clen, dlen;
572: unsigned int c, d;
573: int forced_fail = 0;
574: BOOL could_continue = FALSE;
575:
576: /* Make the new state list into the active state list and empty the
577: new state list. */
578:
579: temp_states = active_states;
580: active_states = new_states;
581: new_states = temp_states;
582: active_count = new_count;
583: new_count = 0;
584:
585: workspace[0] ^= 1; /* Remember for the restarting feature */
586: workspace[1] = active_count;
587:
588: #ifdef PCRE_DEBUG
589: printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
1.1.1.2 ! misho 590: pchars(ptr, STRLEN_UC(ptr), stdout);
1.1 misho 591: printf("\"\n");
592:
593: printf("%.*sActive states: ", rlevel*2-2, SP);
594: for (i = 0; i < active_count; i++)
595: printf("%d/%d ", active_states[i].offset, active_states[i].count);
596: printf("\n");
597: #endif
598:
599: /* Set the pointers for adding new states */
600:
601: next_active_state = active_states + active_count;
602: next_new_state = new_states;
603:
604: /* Load the current character from the subject outside the loop, as many
605: different states may want to look at it, and we assume that at least one
606: will. */
607:
608: if (ptr < end_subject)
609: {
610: clen = 1; /* Number of bytes in the character */
1.1.1.2 ! misho 611: #ifdef SUPPORT_UTF
! 612: if (utf) { GETCHARLEN(c, ptr, clen); } else
! 613: #endif /* SUPPORT_UTF */
1.1 misho 614: c = *ptr;
615: }
616: else
617: {
618: clen = 0; /* This indicates the end of the subject */
619: c = NOTACHAR; /* This value should never actually be used */
620: }
621:
622: /* Scan up the active states and act on each one. The result of an action
623: may be to add more states to the currently active list (e.g. on hitting a
624: parenthesis) or it may be to put states on the new list, for considering
625: when we move the character pointer on. */
626:
627: for (i = 0; i < active_count; i++)
628: {
629: stateblock *current_state = active_states + i;
630: BOOL caseless = FALSE;
1.1.1.2 ! misho 631: const pcre_uchar *code;
1.1 misho 632: int state_offset = current_state->offset;
633: int count, codevalue, rrc;
634:
635: #ifdef PCRE_DEBUG
636: printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
637: if (clen == 0) printf("EOL\n");
638: else if (c > 32 && c < 127) printf("'%c'\n", c);
639: else printf("0x%02x\n", c);
640: #endif
641:
642: /* A negative offset is a special case meaning "hold off going to this
643: (negated) state until the number of characters in the data field have
644: been skipped". */
645:
646: if (state_offset < 0)
647: {
648: if (current_state->data > 0)
649: {
650: DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
651: ADD_NEW_DATA(state_offset, current_state->count,
652: current_state->data - 1);
653: continue;
654: }
655: else
656: {
657: current_state->offset = state_offset = -state_offset;
658: }
659: }
660:
661: /* Check for a duplicate state with the same count, and skip if found.
662: See the note at the head of this module about the possibility of improving
663: performance here. */
664:
665: for (j = 0; j < i; j++)
666: {
667: if (active_states[j].offset == state_offset &&
668: active_states[j].count == current_state->count)
669: {
670: DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
671: goto NEXT_ACTIVE_STATE;
672: }
673: }
674:
675: /* The state offset is the offset to the opcode */
676:
677: code = start_code + state_offset;
678: codevalue = *code;
679:
680: /* If this opcode inspects a character, but we are at the end of the
681: subject, remember the fact for use when testing for a partial match. */
682:
683: if (clen == 0 && poptable[codevalue] != 0)
684: could_continue = TRUE;
685:
686: /* If this opcode is followed by an inline character, load it. It is
687: tempting to test for the presence of a subject character here, but that
688: is wrong, because sometimes zero repetitions of the subject are
689: permitted.
690:
691: We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
692: argument that is not a data character - but is always one byte long. We
693: have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
694: this case. To keep the other cases fast, convert these ones to new opcodes.
695: */
696:
697: if (coptable[codevalue] > 0)
698: {
699: dlen = 1;
1.1.1.2 ! misho 700: #ifdef SUPPORT_UTF
! 701: if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
! 702: #endif /* SUPPORT_UTF */
1.1 misho 703: d = code[coptable[codevalue]];
704: if (codevalue >= OP_TYPESTAR)
705: {
706: switch(d)
707: {
708: case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
709: case OP_NOTPROP:
710: case OP_PROP: codevalue += OP_PROP_EXTRA; break;
711: case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
712: case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
713: case OP_NOT_HSPACE:
714: case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
715: case OP_NOT_VSPACE:
716: case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
717: default: break;
718: }
719: }
720: }
721: else
722: {
723: dlen = 0; /* Not strictly necessary, but compilers moan */
724: d = NOTACHAR; /* if these variables are not set. */
725: }
726:
727:
728: /* Now process the individual opcodes */
729:
730: switch (codevalue)
731: {
732: /* ========================================================================== */
733: /* These cases are never obeyed. This is a fudge that causes a compile-
734: time error if the vectors coptable or poptable, which are indexed by
735: opcode, are not the correct length. It seems to be the only way to do
736: such a check at compile time, as the sizeof() operator does not work
737: in the C preprocessor. */
738:
739: case OP_TABLE_LENGTH:
740: case OP_TABLE_LENGTH +
741: ((sizeof(coptable) == OP_TABLE_LENGTH) &&
742: (sizeof(poptable) == OP_TABLE_LENGTH)):
743: break;
744:
745: /* ========================================================================== */
746: /* Reached a closing bracket. If not at the end of the pattern, carry
747: on with the next opcode. For repeating opcodes, also add the repeat
748: state. Note that KETRPOS will always be encountered at the end of the
749: subpattern, because the possessive subpattern repeats are always handled
750: using recursive calls. Thus, it never adds any new states.
751:
752: At the end of the (sub)pattern, unless we have an empty string and
753: PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
754: start of the subject, save the match data, shifting up all previous
755: matches so we always have the longest first. */
756:
757: case OP_KET:
758: case OP_KETRMIN:
759: case OP_KETRMAX:
760: case OP_KETRPOS:
761: if (code != end_code)
762: {
763: ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
764: if (codevalue != OP_KET)
765: {
766: ADD_ACTIVE(state_offset - GET(code, 1), 0);
767: }
768: }
769: else
770: {
771: if (ptr > current_subject ||
772: ((md->moptions & PCRE_NOTEMPTY) == 0 &&
773: ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
774: current_subject > start_subject + md->start_offset)))
775: {
776: if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
777: else if (match_count > 0 && ++match_count * 2 > offsetcount)
778: match_count = 0;
779: count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
780: if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
781: if (offsetcount >= 2)
782: {
783: offsets[0] = (int)(current_subject - start_subject);
784: offsets[1] = (int)(ptr - start_subject);
785: DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
786: offsets[1] - offsets[0], current_subject));
787: }
788: if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
789: {
790: DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
791: "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
792: match_count, rlevel*2-2, SP));
793: return match_count;
794: }
795: }
796: }
797: break;
798:
799: /* ========================================================================== */
800: /* These opcodes add to the current list of states without looking
801: at the current character. */
802:
803: /*-----------------------------------------------------------------*/
804: case OP_ALT:
805: do { code += GET(code, 1); } while (*code == OP_ALT);
806: ADD_ACTIVE((int)(code - start_code), 0);
807: break;
808:
809: /*-----------------------------------------------------------------*/
810: case OP_BRA:
811: case OP_SBRA:
812: do
813: {
814: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
815: code += GET(code, 1);
816: }
817: while (*code == OP_ALT);
818: break;
819:
820: /*-----------------------------------------------------------------*/
821: case OP_CBRA:
822: case OP_SCBRA:
1.1.1.2 ! misho 823: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
1.1 misho 824: code += GET(code, 1);
825: while (*code == OP_ALT)
826: {
827: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
828: code += GET(code, 1);
829: }
830: break;
831:
832: /*-----------------------------------------------------------------*/
833: case OP_BRAZERO:
834: case OP_BRAMINZERO:
835: ADD_ACTIVE(state_offset + 1, 0);
836: code += 1 + GET(code, 2);
837: while (*code == OP_ALT) code += GET(code, 1);
838: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
839: break;
840:
841: /*-----------------------------------------------------------------*/
842: case OP_SKIPZERO:
843: code += 1 + GET(code, 2);
844: while (*code == OP_ALT) code += GET(code, 1);
845: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
846: break;
847:
848: /*-----------------------------------------------------------------*/
849: case OP_CIRC:
850: if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
851: { ADD_ACTIVE(state_offset + 1, 0); }
852: break;
853:
854: /*-----------------------------------------------------------------*/
855: case OP_CIRCM:
856: if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
857: (ptr != end_subject && WAS_NEWLINE(ptr)))
858: { ADD_ACTIVE(state_offset + 1, 0); }
859: break;
860:
861: /*-----------------------------------------------------------------*/
862: case OP_EOD:
863: if (ptr >= end_subject)
864: {
865: if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
866: could_continue = TRUE;
867: else { ADD_ACTIVE(state_offset + 1, 0); }
868: }
869: break;
870:
871: /*-----------------------------------------------------------------*/
872: case OP_SOD:
873: if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
874: break;
875:
876: /*-----------------------------------------------------------------*/
877: case OP_SOM:
878: if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
879: break;
880:
881:
882: /* ========================================================================== */
883: /* These opcodes inspect the next subject character, and sometimes
884: the previous one as well, but do not have an argument. The variable
885: clen contains the length of the current character and is zero if we are
886: at the end of the subject. */
887:
888: /*-----------------------------------------------------------------*/
889: case OP_ANY:
890: if (clen > 0 && !IS_NEWLINE(ptr))
891: { ADD_NEW(state_offset + 1, 0); }
892: break;
893:
894: /*-----------------------------------------------------------------*/
895: case OP_ALLANY:
896: if (clen > 0)
897: { ADD_NEW(state_offset + 1, 0); }
898: break;
899:
900: /*-----------------------------------------------------------------*/
901: case OP_EODN:
902: if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
903: could_continue = TRUE;
904: else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
905: { ADD_ACTIVE(state_offset + 1, 0); }
906: break;
907:
908: /*-----------------------------------------------------------------*/
909: case OP_DOLL:
910: if ((md->moptions & PCRE_NOTEOL) == 0)
911: {
912: if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
913: could_continue = TRUE;
914: else if (clen == 0 ||
915: ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
916: (ptr == end_subject - md->nllen)
917: ))
918: { ADD_ACTIVE(state_offset + 1, 0); }
919: }
920: break;
921:
922: /*-----------------------------------------------------------------*/
923: case OP_DOLLM:
924: if ((md->moptions & PCRE_NOTEOL) == 0)
925: {
926: if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
927: could_continue = TRUE;
928: else if (clen == 0 ||
929: ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
930: { ADD_ACTIVE(state_offset + 1, 0); }
931: }
932: else if (IS_NEWLINE(ptr))
933: { ADD_ACTIVE(state_offset + 1, 0); }
934: break;
935:
936: /*-----------------------------------------------------------------*/
937:
938: case OP_DIGIT:
939: case OP_WHITESPACE:
940: case OP_WORDCHAR:
941: if (clen > 0 && c < 256 &&
942: ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
943: { ADD_NEW(state_offset + 1, 0); }
944: break;
945:
946: /*-----------------------------------------------------------------*/
947: case OP_NOT_DIGIT:
948: case OP_NOT_WHITESPACE:
949: case OP_NOT_WORDCHAR:
950: if (clen > 0 && (c >= 256 ||
951: ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
952: { ADD_NEW(state_offset + 1, 0); }
953: break;
954:
955: /*-----------------------------------------------------------------*/
956: case OP_WORD_BOUNDARY:
957: case OP_NOT_WORD_BOUNDARY:
958: {
959: int left_word, right_word;
960:
961: if (ptr > start_subject)
962: {
1.1.1.2 ! misho 963: const pcre_uchar *temp = ptr - 1;
1.1 misho 964: if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1.1.1.2 ! misho 965: #ifdef SUPPORT_UTF
! 966: if (utf) { BACKCHAR(temp); }
1.1 misho 967: #endif
968: GETCHARTEST(d, temp);
969: #ifdef SUPPORT_UCP
970: if ((md->poptions & PCRE_UCP) != 0)
971: {
972: if (d == '_') left_word = TRUE; else
973: {
974: int cat = UCD_CATEGORY(d);
975: left_word = (cat == ucp_L || cat == ucp_N);
976: }
977: }
978: else
979: #endif
980: left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
981: }
982: else left_word = FALSE;
983:
984: if (clen > 0)
985: {
986: #ifdef SUPPORT_UCP
987: if ((md->poptions & PCRE_UCP) != 0)
988: {
989: if (c == '_') right_word = TRUE; else
990: {
991: int cat = UCD_CATEGORY(c);
992: right_word = (cat == ucp_L || cat == ucp_N);
993: }
994: }
995: else
996: #endif
997: right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
998: }
999: else right_word = FALSE;
1000:
1001: if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1002: { ADD_ACTIVE(state_offset + 1, 0); }
1003: }
1004: break;
1005:
1006:
1007: /*-----------------------------------------------------------------*/
1008: /* Check the next character by Unicode property. We will get here only
1009: if the support is in the binary; otherwise a compile-time error occurs.
1010: */
1011:
1012: #ifdef SUPPORT_UCP
1013: case OP_PROP:
1014: case OP_NOTPROP:
1015: if (clen > 0)
1016: {
1017: BOOL OK;
1018: const ucd_record * prop = GET_UCD(c);
1019: switch(code[1])
1020: {
1021: case PT_ANY:
1022: OK = TRUE;
1023: break;
1024:
1025: case PT_LAMP:
1026: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1027: prop->chartype == ucp_Lt;
1028: break;
1029:
1030: case PT_GC:
1.1.1.2 ! misho 1031: OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1.1 misho 1032: break;
1033:
1034: case PT_PC:
1035: OK = prop->chartype == code[2];
1036: break;
1037:
1038: case PT_SC:
1039: OK = prop->script == code[2];
1040: break;
1041:
1042: /* These are specials for combination cases. */
1043:
1044: case PT_ALNUM:
1.1.1.2 ! misho 1045: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
! 1046: PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1.1 misho 1047: break;
1048:
1049: case PT_SPACE: /* Perl space */
1.1.1.2 ! misho 1050: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1 misho 1051: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1052: break;
1053:
1054: case PT_PXSPACE: /* POSIX space */
1.1.1.2 ! misho 1055: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1 misho 1056: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1057: c == CHAR_FF || c == CHAR_CR;
1058: break;
1059:
1060: case PT_WORD:
1.1.1.2 ! misho 1061: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
! 1062: PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1.1 misho 1063: c == CHAR_UNDERSCORE;
1064: break;
1065:
1066: /* Should never occur, but keep compilers from grumbling. */
1067:
1068: default:
1069: OK = codevalue != OP_PROP;
1070: break;
1071: }
1072:
1073: if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1074: }
1075: break;
1076: #endif
1077:
1078:
1079:
1080: /* ========================================================================== */
1081: /* These opcodes likewise inspect the subject character, but have an
1082: argument that is not a data character. It is one of these opcodes:
1083: OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1084: OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1085:
1086: case OP_TYPEPLUS:
1087: case OP_TYPEMINPLUS:
1088: case OP_TYPEPOSPLUS:
1089: count = current_state->count; /* Already matched */
1090: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1091: if (clen > 0)
1092: {
1093: if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1094: (c < 256 &&
1095: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1096: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1097: {
1098: if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1099: {
1100: active_count--; /* Remove non-match possibility */
1101: next_active_state--;
1102: }
1103: count++;
1104: ADD_NEW(state_offset, count);
1105: }
1106: }
1107: break;
1108:
1109: /*-----------------------------------------------------------------*/
1110: case OP_TYPEQUERY:
1111: case OP_TYPEMINQUERY:
1112: case OP_TYPEPOSQUERY:
1113: ADD_ACTIVE(state_offset + 2, 0);
1114: if (clen > 0)
1115: {
1116: if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1117: (c < 256 &&
1118: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1119: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1120: {
1121: if (codevalue == OP_TYPEPOSQUERY)
1122: {
1123: active_count--; /* Remove non-match possibility */
1124: next_active_state--;
1125: }
1126: ADD_NEW(state_offset + 2, 0);
1127: }
1128: }
1129: break;
1130:
1131: /*-----------------------------------------------------------------*/
1132: case OP_TYPESTAR:
1133: case OP_TYPEMINSTAR:
1134: case OP_TYPEPOSSTAR:
1135: ADD_ACTIVE(state_offset + 2, 0);
1136: if (clen > 0)
1137: {
1138: if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1139: (c < 256 &&
1140: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1141: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1142: {
1143: if (codevalue == OP_TYPEPOSSTAR)
1144: {
1145: active_count--; /* Remove non-match possibility */
1146: next_active_state--;
1147: }
1148: ADD_NEW(state_offset, 0);
1149: }
1150: }
1151: break;
1152:
1153: /*-----------------------------------------------------------------*/
1154: case OP_TYPEEXACT:
1155: count = current_state->count; /* Number already matched */
1156: if (clen > 0)
1157: {
1158: if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1159: (c < 256 &&
1160: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1161: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1162: {
1163: if (++count >= GET2(code, 1))
1.1.1.2 ! misho 1164: { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1.1 misho 1165: else
1166: { ADD_NEW(state_offset, count); }
1167: }
1168: }
1169: break;
1170:
1171: /*-----------------------------------------------------------------*/
1172: case OP_TYPEUPTO:
1173: case OP_TYPEMINUPTO:
1174: case OP_TYPEPOSUPTO:
1.1.1.2 ! misho 1175: ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1.1 misho 1176: count = current_state->count; /* Number already matched */
1177: if (clen > 0)
1178: {
1179: if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1180: (c < 256 &&
1181: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1182: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1183: {
1184: if (codevalue == OP_TYPEPOSUPTO)
1185: {
1186: active_count--; /* Remove non-match possibility */
1187: next_active_state--;
1188: }
1189: if (++count >= GET2(code, 1))
1.1.1.2 ! misho 1190: { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1.1 misho 1191: else
1192: { ADD_NEW(state_offset, count); }
1193: }
1194: }
1195: break;
1196:
1197: /* ========================================================================== */
1198: /* These are virtual opcodes that are used when something like
1199: OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1200: argument. It keeps the code above fast for the other cases. The argument
1201: is in the d variable. */
1202:
1203: #ifdef SUPPORT_UCP
1204: case OP_PROP_EXTRA + OP_TYPEPLUS:
1205: case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1206: case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1207: count = current_state->count; /* Already matched */
1208: if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1209: if (clen > 0)
1210: {
1211: BOOL OK;
1212: const ucd_record * prop = GET_UCD(c);
1213: switch(code[2])
1214: {
1215: case PT_ANY:
1216: OK = TRUE;
1217: break;
1218:
1219: case PT_LAMP:
1220: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1221: prop->chartype == ucp_Lt;
1222: break;
1223:
1224: case PT_GC:
1.1.1.2 ! misho 1225: OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1.1 misho 1226: break;
1227:
1228: case PT_PC:
1229: OK = prop->chartype == code[3];
1230: break;
1231:
1232: case PT_SC:
1233: OK = prop->script == code[3];
1234: break;
1235:
1236: /* These are specials for combination cases. */
1237:
1238: case PT_ALNUM:
1.1.1.2 ! misho 1239: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
! 1240: PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1.1 misho 1241: break;
1242:
1243: case PT_SPACE: /* Perl space */
1.1.1.2 ! misho 1244: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1 misho 1245: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1246: break;
1247:
1248: case PT_PXSPACE: /* POSIX space */
1.1.1.2 ! misho 1249: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1 misho 1250: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1251: c == CHAR_FF || c == CHAR_CR;
1252: break;
1253:
1254: case PT_WORD:
1.1.1.2 ! misho 1255: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
! 1256: PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1.1 misho 1257: c == CHAR_UNDERSCORE;
1258: break;
1259:
1260: /* Should never occur, but keep compilers from grumbling. */
1261:
1262: default:
1263: OK = codevalue != OP_PROP;
1264: break;
1265: }
1266:
1267: if (OK == (d == OP_PROP))
1268: {
1269: if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1270: {
1271: active_count--; /* Remove non-match possibility */
1272: next_active_state--;
1273: }
1274: count++;
1275: ADD_NEW(state_offset, count);
1276: }
1277: }
1278: break;
1279:
1280: /*-----------------------------------------------------------------*/
1281: case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1282: case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1283: case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1284: count = current_state->count; /* Already matched */
1285: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1286: if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1287: {
1.1.1.2 ! misho 1288: const pcre_uchar *nptr = ptr + clen;
1.1 misho 1289: int ncount = 0;
1290: if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1291: {
1292: active_count--; /* Remove non-match possibility */
1293: next_active_state--;
1294: }
1295: while (nptr < end_subject)
1296: {
1297: int nd;
1298: int ndlen = 1;
1299: GETCHARLEN(nd, nptr, ndlen);
1300: if (UCD_CATEGORY(nd) != ucp_M) break;
1301: ncount++;
1302: nptr += ndlen;
1303: }
1304: count++;
1305: ADD_NEW_DATA(-state_offset, count, ncount);
1306: }
1307: break;
1308: #endif
1309:
1310: /*-----------------------------------------------------------------*/
1311: case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1312: case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1313: case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1314: count = current_state->count; /* Already matched */
1315: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1316: if (clen > 0)
1317: {
1318: int ncount = 0;
1319: switch (c)
1320: {
1321: case 0x000b:
1322: case 0x000c:
1323: case 0x0085:
1324: case 0x2028:
1325: case 0x2029:
1326: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1327: goto ANYNL01;
1328:
1329: case 0x000d:
1330: if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1331: /* Fall through */
1332:
1333: ANYNL01:
1334: case 0x000a:
1335: if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1336: {
1337: active_count--; /* Remove non-match possibility */
1338: next_active_state--;
1339: }
1340: count++;
1341: ADD_NEW_DATA(-state_offset, count, ncount);
1342: break;
1343:
1344: default:
1345: break;
1346: }
1347: }
1348: break;
1349:
1350: /*-----------------------------------------------------------------*/
1351: case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1352: case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1353: case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1354: count = current_state->count; /* Already matched */
1355: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1356: if (clen > 0)
1357: {
1358: BOOL OK;
1359: switch (c)
1360: {
1361: case 0x000a:
1362: case 0x000b:
1363: case 0x000c:
1364: case 0x000d:
1365: case 0x0085:
1366: case 0x2028:
1367: case 0x2029:
1368: OK = TRUE;
1369: break;
1370:
1371: default:
1372: OK = FALSE;
1373: break;
1374: }
1375:
1376: if (OK == (d == OP_VSPACE))
1377: {
1378: if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1379: {
1380: active_count--; /* Remove non-match possibility */
1381: next_active_state--;
1382: }
1383: count++;
1384: ADD_NEW_DATA(-state_offset, count, 0);
1385: }
1386: }
1387: break;
1388:
1389: /*-----------------------------------------------------------------*/
1390: case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1391: case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1392: case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1393: count = current_state->count; /* Already matched */
1394: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1395: if (clen > 0)
1396: {
1397: BOOL OK;
1398: switch (c)
1399: {
1400: case 0x09: /* HT */
1401: case 0x20: /* SPACE */
1402: case 0xa0: /* NBSP */
1403: case 0x1680: /* OGHAM SPACE MARK */
1404: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1405: case 0x2000: /* EN QUAD */
1406: case 0x2001: /* EM QUAD */
1407: case 0x2002: /* EN SPACE */
1408: case 0x2003: /* EM SPACE */
1409: case 0x2004: /* THREE-PER-EM SPACE */
1410: case 0x2005: /* FOUR-PER-EM SPACE */
1411: case 0x2006: /* SIX-PER-EM SPACE */
1412: case 0x2007: /* FIGURE SPACE */
1413: case 0x2008: /* PUNCTUATION SPACE */
1414: case 0x2009: /* THIN SPACE */
1415: case 0x200A: /* HAIR SPACE */
1416: case 0x202f: /* NARROW NO-BREAK SPACE */
1417: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1418: case 0x3000: /* IDEOGRAPHIC SPACE */
1419: OK = TRUE;
1420: break;
1421:
1422: default:
1423: OK = FALSE;
1424: break;
1425: }
1426:
1427: if (OK == (d == OP_HSPACE))
1428: {
1429: if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1430: {
1431: active_count--; /* Remove non-match possibility */
1432: next_active_state--;
1433: }
1434: count++;
1435: ADD_NEW_DATA(-state_offset, count, 0);
1436: }
1437: }
1438: break;
1439:
1440: /*-----------------------------------------------------------------*/
1441: #ifdef SUPPORT_UCP
1442: case OP_PROP_EXTRA + OP_TYPEQUERY:
1443: case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1444: case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1445: count = 4;
1446: goto QS1;
1447:
1448: case OP_PROP_EXTRA + OP_TYPESTAR:
1449: case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1450: case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1451: count = 0;
1452:
1453: QS1:
1454:
1455: ADD_ACTIVE(state_offset + 4, 0);
1456: if (clen > 0)
1457: {
1458: BOOL OK;
1459: const ucd_record * prop = GET_UCD(c);
1460: switch(code[2])
1461: {
1462: case PT_ANY:
1463: OK = TRUE;
1464: break;
1465:
1466: case PT_LAMP:
1467: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1468: prop->chartype == ucp_Lt;
1469: break;
1470:
1471: case PT_GC:
1.1.1.2 ! misho 1472: OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1.1 misho 1473: break;
1474:
1475: case PT_PC:
1476: OK = prop->chartype == code[3];
1477: break;
1478:
1479: case PT_SC:
1480: OK = prop->script == code[3];
1481: break;
1482:
1483: /* These are specials for combination cases. */
1484:
1485: case PT_ALNUM:
1.1.1.2 ! misho 1486: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
! 1487: PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1.1 misho 1488: break;
1489:
1490: case PT_SPACE: /* Perl space */
1.1.1.2 ! misho 1491: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1 misho 1492: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1493: break;
1494:
1495: case PT_PXSPACE: /* POSIX space */
1.1.1.2 ! misho 1496: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1 misho 1497: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1498: c == CHAR_FF || c == CHAR_CR;
1499: break;
1500:
1501: case PT_WORD:
1.1.1.2 ! misho 1502: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
! 1503: PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1.1 misho 1504: c == CHAR_UNDERSCORE;
1505: break;
1506:
1507: /* Should never occur, but keep compilers from grumbling. */
1508:
1509: default:
1510: OK = codevalue != OP_PROP;
1511: break;
1512: }
1513:
1514: if (OK == (d == OP_PROP))
1515: {
1516: if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1517: codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1518: {
1519: active_count--; /* Remove non-match possibility */
1520: next_active_state--;
1521: }
1522: ADD_NEW(state_offset + count, 0);
1523: }
1524: }
1525: break;
1526:
1527: /*-----------------------------------------------------------------*/
1528: case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1529: case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1530: case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1531: count = 2;
1532: goto QS2;
1533:
1534: case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1535: case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1536: case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1537: count = 0;
1538:
1539: QS2:
1540:
1541: ADD_ACTIVE(state_offset + 2, 0);
1542: if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1543: {
1.1.1.2 ! misho 1544: const pcre_uchar *nptr = ptr + clen;
1.1 misho 1545: int ncount = 0;
1546: if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1547: codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1548: {
1549: active_count--; /* Remove non-match possibility */
1550: next_active_state--;
1551: }
1552: while (nptr < end_subject)
1553: {
1554: int nd;
1555: int ndlen = 1;
1556: GETCHARLEN(nd, nptr, ndlen);
1557: if (UCD_CATEGORY(nd) != ucp_M) break;
1558: ncount++;
1559: nptr += ndlen;
1560: }
1561: ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1562: }
1563: break;
1564: #endif
1565:
1566: /*-----------------------------------------------------------------*/
1567: case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1568: case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1569: case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1570: count = 2;
1571: goto QS3;
1572:
1573: case OP_ANYNL_EXTRA + OP_TYPESTAR:
1574: case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1575: case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1576: count = 0;
1577:
1578: QS3:
1579: ADD_ACTIVE(state_offset + 2, 0);
1580: if (clen > 0)
1581: {
1582: int ncount = 0;
1583: switch (c)
1584: {
1585: case 0x000b:
1586: case 0x000c:
1587: case 0x0085:
1588: case 0x2028:
1589: case 0x2029:
1590: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1591: goto ANYNL02;
1592:
1593: case 0x000d:
1594: if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1595: /* Fall through */
1596:
1597: ANYNL02:
1598: case 0x000a:
1599: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1600: codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1601: {
1602: active_count--; /* Remove non-match possibility */
1603: next_active_state--;
1604: }
1605: ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1606: break;
1607:
1608: default:
1609: break;
1610: }
1611: }
1612: break;
1613:
1614: /*-----------------------------------------------------------------*/
1615: case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1616: case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1617: case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1618: count = 2;
1619: goto QS4;
1620:
1621: case OP_VSPACE_EXTRA + OP_TYPESTAR:
1622: case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1623: case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1624: count = 0;
1625:
1626: QS4:
1627: ADD_ACTIVE(state_offset + 2, 0);
1628: if (clen > 0)
1629: {
1630: BOOL OK;
1631: switch (c)
1632: {
1633: case 0x000a:
1634: case 0x000b:
1635: case 0x000c:
1636: case 0x000d:
1637: case 0x0085:
1638: case 0x2028:
1639: case 0x2029:
1640: OK = TRUE;
1641: break;
1642:
1643: default:
1644: OK = FALSE;
1645: break;
1646: }
1647: if (OK == (d == OP_VSPACE))
1648: {
1649: if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1650: codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1651: {
1652: active_count--; /* Remove non-match possibility */
1653: next_active_state--;
1654: }
1655: ADD_NEW_DATA(-(state_offset + count), 0, 0);
1656: }
1657: }
1658: break;
1659:
1660: /*-----------------------------------------------------------------*/
1661: case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1662: case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1663: case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1664: count = 2;
1665: goto QS5;
1666:
1667: case OP_HSPACE_EXTRA + OP_TYPESTAR:
1668: case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1669: case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1670: count = 0;
1671:
1672: QS5:
1673: ADD_ACTIVE(state_offset + 2, 0);
1674: if (clen > 0)
1675: {
1676: BOOL OK;
1677: switch (c)
1678: {
1679: case 0x09: /* HT */
1680: case 0x20: /* SPACE */
1681: case 0xa0: /* NBSP */
1682: case 0x1680: /* OGHAM SPACE MARK */
1683: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1684: case 0x2000: /* EN QUAD */
1685: case 0x2001: /* EM QUAD */
1686: case 0x2002: /* EN SPACE */
1687: case 0x2003: /* EM SPACE */
1688: case 0x2004: /* THREE-PER-EM SPACE */
1689: case 0x2005: /* FOUR-PER-EM SPACE */
1690: case 0x2006: /* SIX-PER-EM SPACE */
1691: case 0x2007: /* FIGURE SPACE */
1692: case 0x2008: /* PUNCTUATION SPACE */
1693: case 0x2009: /* THIN SPACE */
1694: case 0x200A: /* HAIR SPACE */
1695: case 0x202f: /* NARROW NO-BREAK SPACE */
1696: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1697: case 0x3000: /* IDEOGRAPHIC SPACE */
1698: OK = TRUE;
1699: break;
1700:
1701: default:
1702: OK = FALSE;
1703: break;
1704: }
1705:
1706: if (OK == (d == OP_HSPACE))
1707: {
1708: if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1709: codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1710: {
1711: active_count--; /* Remove non-match possibility */
1712: next_active_state--;
1713: }
1714: ADD_NEW_DATA(-(state_offset + count), 0, 0);
1715: }
1716: }
1717: break;
1718:
1719: /*-----------------------------------------------------------------*/
1720: #ifdef SUPPORT_UCP
1721: case OP_PROP_EXTRA + OP_TYPEEXACT:
1722: case OP_PROP_EXTRA + OP_TYPEUPTO:
1723: case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1724: case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1725: if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1.1.1.2 ! misho 1726: { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1.1 misho 1727: count = current_state->count; /* Number already matched */
1728: if (clen > 0)
1729: {
1730: BOOL OK;
1731: const ucd_record * prop = GET_UCD(c);
1.1.1.2 ! misho 1732: switch(code[1 + IMM2_SIZE + 1])
1.1 misho 1733: {
1734: case PT_ANY:
1735: OK = TRUE;
1736: break;
1737:
1738: case PT_LAMP:
1739: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1740: prop->chartype == ucp_Lt;
1741: break;
1742:
1743: case PT_GC:
1.1.1.2 ! misho 1744: OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1.1 misho 1745: break;
1746:
1747: case PT_PC:
1.1.1.2 ! misho 1748: OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1.1 misho 1749: break;
1750:
1751: case PT_SC:
1.1.1.2 ! misho 1752: OK = prop->script == code[1 + IMM2_SIZE + 2];
1.1 misho 1753: break;
1754:
1755: /* These are specials for combination cases. */
1756:
1757: case PT_ALNUM:
1.1.1.2 ! misho 1758: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
! 1759: PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1.1 misho 1760: break;
1761:
1762: case PT_SPACE: /* Perl space */
1.1.1.2 ! misho 1763: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1 misho 1764: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1765: break;
1766:
1767: case PT_PXSPACE: /* POSIX space */
1.1.1.2 ! misho 1768: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1 misho 1769: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1770: c == CHAR_FF || c == CHAR_CR;
1771: break;
1772:
1773: case PT_WORD:
1.1.1.2 ! misho 1774: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
! 1775: PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1.1 misho 1776: c == CHAR_UNDERSCORE;
1777: break;
1778:
1779: /* Should never occur, but keep compilers from grumbling. */
1780:
1781: default:
1782: OK = codevalue != OP_PROP;
1783: break;
1784: }
1785:
1786: if (OK == (d == OP_PROP))
1787: {
1788: if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1789: {
1790: active_count--; /* Remove non-match possibility */
1791: next_active_state--;
1792: }
1793: if (++count >= GET2(code, 1))
1.1.1.2 ! misho 1794: { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1.1 misho 1795: else
1796: { ADD_NEW(state_offset, count); }
1797: }
1798: }
1799: break;
1800:
1801: /*-----------------------------------------------------------------*/
1802: case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1803: case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1804: case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1805: case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1806: if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1.1.1.2 ! misho 1807: { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1.1 misho 1808: count = current_state->count; /* Number already matched */
1809: if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1810: {
1.1.1.2 ! misho 1811: const pcre_uchar *nptr = ptr + clen;
1.1 misho 1812: int ncount = 0;
1813: if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1814: {
1815: active_count--; /* Remove non-match possibility */
1816: next_active_state--;
1817: }
1818: while (nptr < end_subject)
1819: {
1820: int nd;
1821: int ndlen = 1;
1822: GETCHARLEN(nd, nptr, ndlen);
1823: if (UCD_CATEGORY(nd) != ucp_M) break;
1824: ncount++;
1825: nptr += ndlen;
1826: }
1827: if (++count >= GET2(code, 1))
1.1.1.2 ! misho 1828: { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1.1 misho 1829: else
1830: { ADD_NEW_DATA(-state_offset, count, ncount); }
1831: }
1832: break;
1833: #endif
1834:
1835: /*-----------------------------------------------------------------*/
1836: case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1837: case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1838: case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1839: case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1840: if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1.1.1.2 ! misho 1841: { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1.1 misho 1842: count = current_state->count; /* Number already matched */
1843: if (clen > 0)
1844: {
1845: int ncount = 0;
1846: switch (c)
1847: {
1848: case 0x000b:
1849: case 0x000c:
1850: case 0x0085:
1851: case 0x2028:
1852: case 0x2029:
1853: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1854: goto ANYNL03;
1855:
1856: case 0x000d:
1857: if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1858: /* Fall through */
1859:
1860: ANYNL03:
1861: case 0x000a:
1862: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1863: {
1864: active_count--; /* Remove non-match possibility */
1865: next_active_state--;
1866: }
1867: if (++count >= GET2(code, 1))
1.1.1.2 ! misho 1868: { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1.1 misho 1869: else
1870: { ADD_NEW_DATA(-state_offset, count, ncount); }
1871: break;
1872:
1873: default:
1874: break;
1875: }
1876: }
1877: break;
1878:
1879: /*-----------------------------------------------------------------*/
1880: case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1881: case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1882: case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1883: case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1884: if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1.1.1.2 ! misho 1885: { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1.1 misho 1886: count = current_state->count; /* Number already matched */
1887: if (clen > 0)
1888: {
1889: BOOL OK;
1890: switch (c)
1891: {
1892: case 0x000a:
1893: case 0x000b:
1894: case 0x000c:
1895: case 0x000d:
1896: case 0x0085:
1897: case 0x2028:
1898: case 0x2029:
1899: OK = TRUE;
1900: break;
1901:
1902: default:
1903: OK = FALSE;
1904: }
1905:
1906: if (OK == (d == OP_VSPACE))
1907: {
1908: if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1909: {
1910: active_count--; /* Remove non-match possibility */
1911: next_active_state--;
1912: }
1913: if (++count >= GET2(code, 1))
1.1.1.2 ! misho 1914: { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1.1 misho 1915: else
1916: { ADD_NEW_DATA(-state_offset, count, 0); }
1917: }
1918: }
1919: break;
1920:
1921: /*-----------------------------------------------------------------*/
1922: case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1923: case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1924: case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1925: case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1926: if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1.1.1.2 ! misho 1927: { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1.1 misho 1928: count = current_state->count; /* Number already matched */
1929: if (clen > 0)
1930: {
1931: BOOL OK;
1932: switch (c)
1933: {
1934: case 0x09: /* HT */
1935: case 0x20: /* SPACE */
1936: case 0xa0: /* NBSP */
1937: case 0x1680: /* OGHAM SPACE MARK */
1938: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1939: case 0x2000: /* EN QUAD */
1940: case 0x2001: /* EM QUAD */
1941: case 0x2002: /* EN SPACE */
1942: case 0x2003: /* EM SPACE */
1943: case 0x2004: /* THREE-PER-EM SPACE */
1944: case 0x2005: /* FOUR-PER-EM SPACE */
1945: case 0x2006: /* SIX-PER-EM SPACE */
1946: case 0x2007: /* FIGURE SPACE */
1947: case 0x2008: /* PUNCTUATION SPACE */
1948: case 0x2009: /* THIN SPACE */
1949: case 0x200A: /* HAIR SPACE */
1950: case 0x202f: /* NARROW NO-BREAK SPACE */
1951: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1952: case 0x3000: /* IDEOGRAPHIC SPACE */
1953: OK = TRUE;
1954: break;
1955:
1956: default:
1957: OK = FALSE;
1958: break;
1959: }
1960:
1961: if (OK == (d == OP_HSPACE))
1962: {
1963: if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1964: {
1965: active_count--; /* Remove non-match possibility */
1966: next_active_state--;
1967: }
1968: if (++count >= GET2(code, 1))
1.1.1.2 ! misho 1969: { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1.1 misho 1970: else
1971: { ADD_NEW_DATA(-state_offset, count, 0); }
1972: }
1973: }
1974: break;
1975:
1976: /* ========================================================================== */
1977: /* These opcodes are followed by a character that is usually compared
1978: to the current subject character; it is loaded into d. We still get
1979: here even if there is no subject character, because in some cases zero
1980: repetitions are permitted. */
1981:
1982: /*-----------------------------------------------------------------*/
1983: case OP_CHAR:
1984: if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1985: break;
1986:
1987: /*-----------------------------------------------------------------*/
1988: case OP_CHARI:
1989: if (clen == 0) break;
1990:
1.1.1.2 ! misho 1991: #ifdef SUPPORT_UTF
! 1992: if (utf)
1.1 misho 1993: {
1994: if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1995: {
1996: unsigned int othercase;
1.1.1.2 ! misho 1997: if (c < 128)
! 1998: othercase = fcc[c];
! 1999: else
! 2000: /* If we have Unicode property support, we can use it to test the
! 2001: other case of the character. */
1.1 misho 2002: #ifdef SUPPORT_UCP
1.1.1.2 ! misho 2003: othercase = UCD_OTHERCASE(c);
1.1 misho 2004: #else
1.1.1.2 ! misho 2005: othercase = NOTACHAR;
1.1 misho 2006: #endif
2007:
2008: if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2009: }
2010: }
2011: else
1.1.1.2 ! misho 2012: #endif /* SUPPORT_UTF */
! 2013: /* Not UTF mode */
1.1 misho 2014: {
1.1.1.2 ! misho 2015: if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
! 2016: { ADD_NEW(state_offset + 2, 0); }
1.1 misho 2017: }
2018: break;
2019:
2020:
2021: #ifdef SUPPORT_UCP
2022: /*-----------------------------------------------------------------*/
2023: /* This is a tricky one because it can match more than one character.
2024: Find out how many characters to skip, and then set up a negative state
2025: to wait for them to pass before continuing. */
2026:
2027: case OP_EXTUNI:
2028: if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2029: {
1.1.1.2 ! misho 2030: const pcre_uchar *nptr = ptr + clen;
1.1 misho 2031: int ncount = 0;
2032: while (nptr < end_subject)
2033: {
2034: int nclen = 1;
2035: GETCHARLEN(c, nptr, nclen);
2036: if (UCD_CATEGORY(c) != ucp_M) break;
2037: ncount++;
2038: nptr += nclen;
2039: }
2040: ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2041: }
2042: break;
2043: #endif
2044:
2045: /*-----------------------------------------------------------------*/
2046: /* This is tricky like EXTUNI because it too can match more than one
2047: character (when CR is followed by LF). In this case, set up a negative
2048: state to wait for one character to pass before continuing. */
2049:
2050: case OP_ANYNL:
2051: if (clen > 0) switch(c)
2052: {
2053: case 0x000b:
2054: case 0x000c:
2055: case 0x0085:
2056: case 0x2028:
2057: case 0x2029:
2058: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2059:
2060: case 0x000a:
2061: ADD_NEW(state_offset + 1, 0);
2062: break;
2063:
2064: case 0x000d:
2065: if (ptr + 1 < end_subject && ptr[1] == 0x0a)
2066: {
2067: ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2068: }
2069: else
2070: {
2071: ADD_NEW(state_offset + 1, 0);
2072: }
2073: break;
2074: }
2075: break;
2076:
2077: /*-----------------------------------------------------------------*/
2078: case OP_NOT_VSPACE:
2079: if (clen > 0) switch(c)
2080: {
2081: case 0x000a:
2082: case 0x000b:
2083: case 0x000c:
2084: case 0x000d:
2085: case 0x0085:
2086: case 0x2028:
2087: case 0x2029:
2088: break;
2089:
2090: default:
2091: ADD_NEW(state_offset + 1, 0);
2092: break;
2093: }
2094: break;
2095:
2096: /*-----------------------------------------------------------------*/
2097: case OP_VSPACE:
2098: if (clen > 0) switch(c)
2099: {
2100: case 0x000a:
2101: case 0x000b:
2102: case 0x000c:
2103: case 0x000d:
2104: case 0x0085:
2105: case 0x2028:
2106: case 0x2029:
2107: ADD_NEW(state_offset + 1, 0);
2108: break;
2109:
2110: default: break;
2111: }
2112: break;
2113:
2114: /*-----------------------------------------------------------------*/
2115: case OP_NOT_HSPACE:
2116: if (clen > 0) switch(c)
2117: {
2118: case 0x09: /* HT */
2119: case 0x20: /* SPACE */
2120: case 0xa0: /* NBSP */
2121: case 0x1680: /* OGHAM SPACE MARK */
2122: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2123: case 0x2000: /* EN QUAD */
2124: case 0x2001: /* EM QUAD */
2125: case 0x2002: /* EN SPACE */
2126: case 0x2003: /* EM SPACE */
2127: case 0x2004: /* THREE-PER-EM SPACE */
2128: case 0x2005: /* FOUR-PER-EM SPACE */
2129: case 0x2006: /* SIX-PER-EM SPACE */
2130: case 0x2007: /* FIGURE SPACE */
2131: case 0x2008: /* PUNCTUATION SPACE */
2132: case 0x2009: /* THIN SPACE */
2133: case 0x200A: /* HAIR SPACE */
2134: case 0x202f: /* NARROW NO-BREAK SPACE */
2135: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2136: case 0x3000: /* IDEOGRAPHIC SPACE */
2137: break;
2138:
2139: default:
2140: ADD_NEW(state_offset + 1, 0);
2141: break;
2142: }
2143: break;
2144:
2145: /*-----------------------------------------------------------------*/
2146: case OP_HSPACE:
2147: if (clen > 0) switch(c)
2148: {
2149: case 0x09: /* HT */
2150: case 0x20: /* SPACE */
2151: case 0xa0: /* NBSP */
2152: case 0x1680: /* OGHAM SPACE MARK */
2153: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2154: case 0x2000: /* EN QUAD */
2155: case 0x2001: /* EM QUAD */
2156: case 0x2002: /* EN SPACE */
2157: case 0x2003: /* EM SPACE */
2158: case 0x2004: /* THREE-PER-EM SPACE */
2159: case 0x2005: /* FOUR-PER-EM SPACE */
2160: case 0x2006: /* SIX-PER-EM SPACE */
2161: case 0x2007: /* FIGURE SPACE */
2162: case 0x2008: /* PUNCTUATION SPACE */
2163: case 0x2009: /* THIN SPACE */
2164: case 0x200A: /* HAIR SPACE */
2165: case 0x202f: /* NARROW NO-BREAK SPACE */
2166: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2167: case 0x3000: /* IDEOGRAPHIC SPACE */
2168: ADD_NEW(state_offset + 1, 0);
2169: break;
2170: }
2171: break;
2172:
2173: /*-----------------------------------------------------------------*/
2174: /* Match a negated single character casefully. This is only used for
2175: one-byte characters, that is, we know that d < 256. The character we are
2176: checking (c) can be multibyte. */
2177:
2178: case OP_NOT:
2179: if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2180: break;
2181:
2182: /*-----------------------------------------------------------------*/
2183: /* Match a negated single character caselessly. This is only used for
2184: one-byte characters, that is, we know that d < 256. The character we are
2185: checking (c) can be multibyte. */
2186:
2187: case OP_NOTI:
2188: if (clen > 0 && c != d && c != fcc[d])
2189: { ADD_NEW(state_offset + dlen + 1, 0); }
2190: break;
2191:
2192: /*-----------------------------------------------------------------*/
2193: case OP_PLUSI:
2194: case OP_MINPLUSI:
2195: case OP_POSPLUSI:
2196: case OP_NOTPLUSI:
2197: case OP_NOTMINPLUSI:
2198: case OP_NOTPOSPLUSI:
2199: caseless = TRUE;
2200: codevalue -= OP_STARI - OP_STAR;
2201:
2202: /* Fall through */
2203: case OP_PLUS:
2204: case OP_MINPLUS:
2205: case OP_POSPLUS:
2206: case OP_NOTPLUS:
2207: case OP_NOTMINPLUS:
2208: case OP_NOTPOSPLUS:
2209: count = current_state->count; /* Already matched */
2210: if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2211: if (clen > 0)
2212: {
2213: unsigned int otherd = NOTACHAR;
2214: if (caseless)
2215: {
1.1.1.2 ! misho 2216: #ifdef SUPPORT_UTF
! 2217: if (utf && d >= 128)
1.1 misho 2218: {
2219: #ifdef SUPPORT_UCP
2220: otherd = UCD_OTHERCASE(d);
2221: #endif /* SUPPORT_UCP */
2222: }
2223: else
1.1.1.2 ! misho 2224: #endif /* SUPPORT_UTF */
! 2225: otherd = TABLE_GET(d, fcc, d);
1.1 misho 2226: }
2227: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2228: {
2229: if (count > 0 &&
2230: (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2231: {
2232: active_count--; /* Remove non-match possibility */
2233: next_active_state--;
2234: }
2235: count++;
2236: ADD_NEW(state_offset, count);
2237: }
2238: }
2239: break;
2240:
2241: /*-----------------------------------------------------------------*/
2242: case OP_QUERYI:
2243: case OP_MINQUERYI:
2244: case OP_POSQUERYI:
2245: case OP_NOTQUERYI:
2246: case OP_NOTMINQUERYI:
2247: case OP_NOTPOSQUERYI:
2248: caseless = TRUE;
2249: codevalue -= OP_STARI - OP_STAR;
2250: /* Fall through */
2251: case OP_QUERY:
2252: case OP_MINQUERY:
2253: case OP_POSQUERY:
2254: case OP_NOTQUERY:
2255: case OP_NOTMINQUERY:
2256: case OP_NOTPOSQUERY:
2257: ADD_ACTIVE(state_offset + dlen + 1, 0);
2258: if (clen > 0)
2259: {
2260: unsigned int otherd = NOTACHAR;
2261: if (caseless)
2262: {
1.1.1.2 ! misho 2263: #ifdef SUPPORT_UTF
! 2264: if (utf && d >= 128)
1.1 misho 2265: {
2266: #ifdef SUPPORT_UCP
2267: otherd = UCD_OTHERCASE(d);
2268: #endif /* SUPPORT_UCP */
2269: }
2270: else
1.1.1.2 ! misho 2271: #endif /* SUPPORT_UTF */
! 2272: otherd = TABLE_GET(d, fcc, d);
1.1 misho 2273: }
2274: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2275: {
2276: if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2277: {
2278: active_count--; /* Remove non-match possibility */
2279: next_active_state--;
2280: }
2281: ADD_NEW(state_offset + dlen + 1, 0);
2282: }
2283: }
2284: break;
2285:
2286: /*-----------------------------------------------------------------*/
2287: case OP_STARI:
2288: case OP_MINSTARI:
2289: case OP_POSSTARI:
2290: case OP_NOTSTARI:
2291: case OP_NOTMINSTARI:
2292: case OP_NOTPOSSTARI:
2293: caseless = TRUE;
2294: codevalue -= OP_STARI - OP_STAR;
2295: /* Fall through */
2296: case OP_STAR:
2297: case OP_MINSTAR:
2298: case OP_POSSTAR:
2299: case OP_NOTSTAR:
2300: case OP_NOTMINSTAR:
2301: case OP_NOTPOSSTAR:
2302: ADD_ACTIVE(state_offset + dlen + 1, 0);
2303: if (clen > 0)
2304: {
2305: unsigned int otherd = NOTACHAR;
2306: if (caseless)
2307: {
1.1.1.2 ! misho 2308: #ifdef SUPPORT_UTF
! 2309: if (utf && d >= 128)
1.1 misho 2310: {
2311: #ifdef SUPPORT_UCP
2312: otherd = UCD_OTHERCASE(d);
2313: #endif /* SUPPORT_UCP */
2314: }
2315: else
1.1.1.2 ! misho 2316: #endif /* SUPPORT_UTF */
! 2317: otherd = TABLE_GET(d, fcc, d);
1.1 misho 2318: }
2319: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2320: {
2321: if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2322: {
2323: active_count--; /* Remove non-match possibility */
2324: next_active_state--;
2325: }
2326: ADD_NEW(state_offset, 0);
2327: }
2328: }
2329: break;
2330:
2331: /*-----------------------------------------------------------------*/
2332: case OP_EXACTI:
2333: case OP_NOTEXACTI:
2334: caseless = TRUE;
2335: codevalue -= OP_STARI - OP_STAR;
2336: /* Fall through */
2337: case OP_EXACT:
2338: case OP_NOTEXACT:
2339: count = current_state->count; /* Number already matched */
2340: if (clen > 0)
2341: {
2342: unsigned int otherd = NOTACHAR;
2343: if (caseless)
2344: {
1.1.1.2 ! misho 2345: #ifdef SUPPORT_UTF
! 2346: if (utf && d >= 128)
1.1 misho 2347: {
2348: #ifdef SUPPORT_UCP
2349: otherd = UCD_OTHERCASE(d);
2350: #endif /* SUPPORT_UCP */
2351: }
2352: else
1.1.1.2 ! misho 2353: #endif /* SUPPORT_UTF */
! 2354: otherd = TABLE_GET(d, fcc, d);
1.1 misho 2355: }
2356: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2357: {
2358: if (++count >= GET2(code, 1))
1.1.1.2 ! misho 2359: { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
1.1 misho 2360: else
2361: { ADD_NEW(state_offset, count); }
2362: }
2363: }
2364: break;
2365:
2366: /*-----------------------------------------------------------------*/
2367: case OP_UPTOI:
2368: case OP_MINUPTOI:
2369: case OP_POSUPTOI:
2370: case OP_NOTUPTOI:
2371: case OP_NOTMINUPTOI:
2372: case OP_NOTPOSUPTOI:
2373: caseless = TRUE;
2374: codevalue -= OP_STARI - OP_STAR;
2375: /* Fall through */
2376: case OP_UPTO:
2377: case OP_MINUPTO:
2378: case OP_POSUPTO:
2379: case OP_NOTUPTO:
2380: case OP_NOTMINUPTO:
2381: case OP_NOTPOSUPTO:
1.1.1.2 ! misho 2382: ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
1.1 misho 2383: count = current_state->count; /* Number already matched */
2384: if (clen > 0)
2385: {
2386: unsigned int otherd = NOTACHAR;
2387: if (caseless)
2388: {
1.1.1.2 ! misho 2389: #ifdef SUPPORT_UTF
! 2390: if (utf && d >= 128)
1.1 misho 2391: {
2392: #ifdef SUPPORT_UCP
2393: otherd = UCD_OTHERCASE(d);
2394: #endif /* SUPPORT_UCP */
2395: }
2396: else
1.1.1.2 ! misho 2397: #endif /* SUPPORT_UTF */
! 2398: otherd = TABLE_GET(d, fcc, d);
1.1 misho 2399: }
2400: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2401: {
2402: if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2403: {
2404: active_count--; /* Remove non-match possibility */
2405: next_active_state--;
2406: }
2407: if (++count >= GET2(code, 1))
1.1.1.2 ! misho 2408: { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
1.1 misho 2409: else
2410: { ADD_NEW(state_offset, count); }
2411: }
2412: }
2413: break;
2414:
2415:
2416: /* ========================================================================== */
2417: /* These are the class-handling opcodes */
2418:
2419: case OP_CLASS:
2420: case OP_NCLASS:
2421: case OP_XCLASS:
2422: {
2423: BOOL isinclass = FALSE;
2424: int next_state_offset;
1.1.1.2 ! misho 2425: const pcre_uchar *ecode;
1.1 misho 2426:
2427: /* For a simple class, there is always just a 32-byte table, and we
2428: can set isinclass from it. */
2429:
2430: if (codevalue != OP_XCLASS)
2431: {
1.1.1.2 ! misho 2432: ecode = code + 1 + (32 / sizeof(pcre_uchar));
1.1 misho 2433: if (clen > 0)
2434: {
2435: isinclass = (c > 255)? (codevalue == OP_NCLASS) :
1.1.1.2 ! misho 2436: ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
1.1 misho 2437: }
2438: }
2439:
2440: /* An extended class may have a table or a list of single characters,
2441: ranges, or both, and it may be positive or negative. There's a
2442: function that sorts all this out. */
2443:
2444: else
2445: {
2446: ecode = code + GET(code, 1);
1.1.1.2 ! misho 2447: if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
1.1 misho 2448: }
2449:
2450: /* At this point, isinclass is set for all kinds of class, and ecode
2451: points to the byte after the end of the class. If there is a
2452: quantifier, this is where it will be. */
2453:
2454: next_state_offset = (int)(ecode - start_code);
2455:
2456: switch (*ecode)
2457: {
2458: case OP_CRSTAR:
2459: case OP_CRMINSTAR:
2460: ADD_ACTIVE(next_state_offset + 1, 0);
2461: if (isinclass) { ADD_NEW(state_offset, 0); }
2462: break;
2463:
2464: case OP_CRPLUS:
2465: case OP_CRMINPLUS:
2466: count = current_state->count; /* Already matched */
2467: if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2468: if (isinclass) { count++; ADD_NEW(state_offset, count); }
2469: break;
2470:
2471: case OP_CRQUERY:
2472: case OP_CRMINQUERY:
2473: ADD_ACTIVE(next_state_offset + 1, 0);
2474: if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2475: break;
2476:
2477: case OP_CRRANGE:
2478: case OP_CRMINRANGE:
2479: count = current_state->count; /* Already matched */
2480: if (count >= GET2(ecode, 1))
1.1.1.2 ! misho 2481: { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
1.1 misho 2482: if (isinclass)
2483: {
1.1.1.2 ! misho 2484: int max = GET2(ecode, 1 + IMM2_SIZE);
1.1 misho 2485: if (++count >= max && max != 0) /* Max 0 => no limit */
1.1.1.2 ! misho 2486: { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
1.1 misho 2487: else
2488: { ADD_NEW(state_offset, count); }
2489: }
2490: break;
2491:
2492: default:
2493: if (isinclass) { ADD_NEW(next_state_offset, 0); }
2494: break;
2495: }
2496: }
2497: break;
2498:
2499: /* ========================================================================== */
2500: /* These are the opcodes for fancy brackets of various kinds. We have
2501: to use recursion in order to handle them. The "always failing" assertion
2502: (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2503: though the other "backtracking verbs" are not supported. */
2504:
2505: case OP_FAIL:
2506: forced_fail++; /* Count FAILs for multiple states */
2507: break;
2508:
2509: case OP_ASSERT:
2510: case OP_ASSERT_NOT:
2511: case OP_ASSERTBACK:
2512: case OP_ASSERTBACK_NOT:
2513: {
2514: int rc;
2515: int local_offsets[2];
2516: int local_workspace[1000];
1.1.1.2 ! misho 2517: const pcre_uchar *endasscode = code + GET(code, 1);
1.1 misho 2518:
2519: while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2520:
2521: rc = internal_dfa_exec(
2522: md, /* static match data */
2523: code, /* this subexpression's code */
2524: ptr, /* where we currently are */
2525: (int)(ptr - start_subject), /* start offset */
2526: local_offsets, /* offset vector */
2527: sizeof(local_offsets)/sizeof(int), /* size of same */
2528: local_workspace, /* workspace vector */
2529: sizeof(local_workspace)/sizeof(int), /* size of same */
2530: rlevel); /* function recursion level */
2531:
2532: if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2533: if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2534: { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2535: }
2536: break;
2537:
2538: /*-----------------------------------------------------------------*/
2539: case OP_COND:
2540: case OP_SCOND:
2541: {
2542: int local_offsets[1000];
2543: int local_workspace[1000];
2544: int codelink = GET(code, 1);
2545: int condcode;
2546:
2547: /* Because of the way auto-callout works during compile, a callout item
2548: is inserted between OP_COND and an assertion condition. This does not
2549: happen for the other conditions. */
2550:
2551: if (code[LINK_SIZE+1] == OP_CALLOUT)
2552: {
2553: rrc = 0;
1.1.1.2 ! misho 2554: if (PUBL(callout) != NULL)
1.1 misho 2555: {
1.1.1.2 ! misho 2556: PUBL(callout_block) cb;
1.1 misho 2557: cb.version = 1; /* Version 1 of the callout block */
2558: cb.callout_number = code[LINK_SIZE+2];
2559: cb.offset_vector = offsets;
1.1.1.2 ! misho 2560: #ifdef COMPILE_PCRE8
1.1 misho 2561: cb.subject = (PCRE_SPTR)start_subject;
1.1.1.2 ! misho 2562: #else
! 2563: cb.subject = (PCRE_SPTR16)start_subject;
! 2564: #endif
1.1 misho 2565: cb.subject_length = (int)(end_subject - start_subject);
2566: cb.start_match = (int)(current_subject - start_subject);
2567: cb.current_position = (int)(ptr - start_subject);
2568: cb.pattern_position = GET(code, LINK_SIZE + 3);
2569: cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2570: cb.capture_top = 1;
2571: cb.capture_last = -1;
2572: cb.callout_data = md->callout_data;
2573: cb.mark = NULL; /* No (*MARK) support */
1.1.1.2 ! misho 2574: if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
1.1 misho 2575: }
2576: if (rrc > 0) break; /* Fail this thread */
1.1.1.2 ! misho 2577: code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
1.1 misho 2578: }
2579:
2580: condcode = code[LINK_SIZE+1];
2581:
2582: /* Back reference conditions are not supported */
2583:
2584: if (condcode == OP_CREF || condcode == OP_NCREF)
2585: return PCRE_ERROR_DFA_UCOND;
2586:
2587: /* The DEFINE condition is always false */
2588:
2589: if (condcode == OP_DEF)
2590: { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2591:
2592: /* The only supported version of OP_RREF is for the value RREF_ANY,
2593: which means "test if in any recursion". We can't test for specifically
2594: recursed groups. */
2595:
2596: else if (condcode == OP_RREF || condcode == OP_NRREF)
2597: {
1.1.1.2 ! misho 2598: int value = GET2(code, LINK_SIZE + 2);
1.1 misho 2599: if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2600: if (md->recursive != NULL)
1.1.1.2 ! misho 2601: { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
1.1 misho 2602: else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2603: }
2604:
2605: /* Otherwise, the condition is an assertion */
2606:
2607: else
2608: {
2609: int rc;
1.1.1.2 ! misho 2610: const pcre_uchar *asscode = code + LINK_SIZE + 1;
! 2611: const pcre_uchar *endasscode = asscode + GET(asscode, 1);
1.1 misho 2612:
2613: while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2614:
2615: rc = internal_dfa_exec(
2616: md, /* fixed match data */
2617: asscode, /* this subexpression's code */
2618: ptr, /* where we currently are */
2619: (int)(ptr - start_subject), /* start offset */
2620: local_offsets, /* offset vector */
2621: sizeof(local_offsets)/sizeof(int), /* size of same */
2622: local_workspace, /* workspace vector */
2623: sizeof(local_workspace)/sizeof(int), /* size of same */
2624: rlevel); /* function recursion level */
2625:
2626: if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2627: if ((rc >= 0) ==
2628: (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2629: { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2630: else
2631: { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2632: }
2633: }
2634: break;
2635:
2636: /*-----------------------------------------------------------------*/
2637: case OP_RECURSE:
2638: {
2639: dfa_recursion_info *ri;
2640: int local_offsets[1000];
2641: int local_workspace[1000];
1.1.1.2 ! misho 2642: const pcre_uchar *callpat = start_code + GET(code, 1);
1.1 misho 2643: int recno = (callpat == md->start_code)? 0 :
2644: GET2(callpat, 1 + LINK_SIZE);
2645: int rc;
2646:
2647: DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2648:
2649: /* Check for repeating a recursion without advancing the subject
2650: pointer. This should catch convoluted mutual recursions. (Some simple
2651: cases are caught at compile time.) */
2652:
2653: for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2654: if (recno == ri->group_num && ptr == ri->subject_position)
2655: return PCRE_ERROR_RECURSELOOP;
2656:
2657: /* Remember this recursion and where we started it so as to
2658: catch infinite loops. */
2659:
2660: new_recursive.group_num = recno;
2661: new_recursive.subject_position = ptr;
2662: new_recursive.prevrec = md->recursive;
2663: md->recursive = &new_recursive;
2664:
2665: rc = internal_dfa_exec(
2666: md, /* fixed match data */
2667: callpat, /* this subexpression's code */
2668: ptr, /* where we currently are */
2669: (int)(ptr - start_subject), /* start offset */
2670: local_offsets, /* offset vector */
2671: sizeof(local_offsets)/sizeof(int), /* size of same */
2672: local_workspace, /* workspace vector */
2673: sizeof(local_workspace)/sizeof(int), /* size of same */
2674: rlevel); /* function recursion level */
2675:
2676: md->recursive = new_recursive.prevrec; /* Done this recursion */
2677:
2678: DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2679: rc));
2680:
2681: /* Ran out of internal offsets */
2682:
2683: if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2684:
2685: /* For each successful matched substring, set up the next state with a
2686: count of characters to skip before trying it. Note that the count is in
2687: characters, not bytes. */
2688:
2689: if (rc > 0)
2690: {
2691: for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2692: {
2693: int charcount = local_offsets[rc+1] - local_offsets[rc];
1.1.1.2 ! misho 2694: #ifdef SUPPORT_UTF
! 2695: const pcre_uchar *p = start_subject + local_offsets[rc];
! 2696: const pcre_uchar *pp = start_subject + local_offsets[rc+1];
! 2697: while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
! 2698: #endif
1.1 misho 2699: if (charcount > 0)
2700: {
2701: ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2702: }
2703: else
2704: {
2705: ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2706: }
2707: }
2708: }
2709: else if (rc != PCRE_ERROR_NOMATCH) return rc;
2710: }
2711: break;
2712:
2713: /*-----------------------------------------------------------------*/
2714: case OP_BRAPOS:
2715: case OP_SBRAPOS:
2716: case OP_CBRAPOS:
2717: case OP_SCBRAPOS:
2718: case OP_BRAPOSZERO:
2719: {
2720: int charcount, matched_count;
1.1.1.2 ! misho 2721: const pcre_uchar *local_ptr = ptr;
1.1 misho 2722: BOOL allow_zero;
2723:
2724: if (codevalue == OP_BRAPOSZERO)
2725: {
2726: allow_zero = TRUE;
2727: codevalue = *(++code); /* Codevalue will be one of above BRAs */
2728: }
2729: else allow_zero = FALSE;
2730:
2731: /* Loop to match the subpattern as many times as possible as if it were
2732: a complete pattern. */
2733:
2734: for (matched_count = 0;; matched_count++)
2735: {
2736: int local_offsets[2];
2737: int local_workspace[1000];
2738:
2739: int rc = internal_dfa_exec(
2740: md, /* fixed match data */
2741: code, /* this subexpression's code */
2742: local_ptr, /* where we currently are */
2743: (int)(ptr - start_subject), /* start offset */
2744: local_offsets, /* offset vector */
2745: sizeof(local_offsets)/sizeof(int), /* size of same */
2746: local_workspace, /* workspace vector */
2747: sizeof(local_workspace)/sizeof(int), /* size of same */
2748: rlevel); /* function recursion level */
2749:
2750: /* Failed to match */
2751:
2752: if (rc < 0)
2753: {
2754: if (rc != PCRE_ERROR_NOMATCH) return rc;
2755: break;
2756: }
2757:
2758: /* Matched: break the loop if zero characters matched. */
2759:
2760: charcount = local_offsets[1] - local_offsets[0];
2761: if (charcount == 0) break;
2762: local_ptr += charcount; /* Advance temporary position ptr */
2763: }
2764:
2765: /* At this point we have matched the subpattern matched_count
2766: times, and local_ptr is pointing to the character after the end of the
2767: last match. */
2768:
2769: if (matched_count > 0 || allow_zero)
2770: {
1.1.1.2 ! misho 2771: const pcre_uchar *end_subpattern = code;
1.1 misho 2772: int next_state_offset;
2773:
2774: do { end_subpattern += GET(end_subpattern, 1); }
2775: while (*end_subpattern == OP_ALT);
2776: next_state_offset =
2777: (int)(end_subpattern - start_code + LINK_SIZE + 1);
2778:
2779: /* Optimization: if there are no more active states, and there
2780: are no new states yet set up, then skip over the subject string
2781: right here, to save looping. Otherwise, set up the new state to swing
2782: into action when the end of the matched substring is reached. */
2783:
2784: if (i + 1 >= active_count && new_count == 0)
2785: {
2786: ptr = local_ptr;
2787: clen = 0;
2788: ADD_NEW(next_state_offset, 0);
2789: }
2790: else
2791: {
1.1.1.2 ! misho 2792: const pcre_uchar *p = ptr;
! 2793: const pcre_uchar *pp = local_ptr;
1.1 misho 2794: charcount = (int)(pp - p);
1.1.1.2 ! misho 2795: #ifdef SUPPORT_UTF
! 2796: while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
! 2797: #endif
1.1 misho 2798: ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2799: }
2800: }
2801: }
2802: break;
2803:
2804: /*-----------------------------------------------------------------*/
2805: case OP_ONCE:
2806: case OP_ONCE_NC:
2807: {
2808: int local_offsets[2];
2809: int local_workspace[1000];
2810:
2811: int rc = internal_dfa_exec(
2812: md, /* fixed match data */
2813: code, /* this subexpression's code */
2814: ptr, /* where we currently are */
2815: (int)(ptr - start_subject), /* start offset */
2816: local_offsets, /* offset vector */
2817: sizeof(local_offsets)/sizeof(int), /* size of same */
2818: local_workspace, /* workspace vector */
2819: sizeof(local_workspace)/sizeof(int), /* size of same */
2820: rlevel); /* function recursion level */
2821:
2822: if (rc >= 0)
2823: {
1.1.1.2 ! misho 2824: const pcre_uchar *end_subpattern = code;
1.1 misho 2825: int charcount = local_offsets[1] - local_offsets[0];
2826: int next_state_offset, repeat_state_offset;
2827:
2828: do { end_subpattern += GET(end_subpattern, 1); }
2829: while (*end_subpattern == OP_ALT);
2830: next_state_offset =
2831: (int)(end_subpattern - start_code + LINK_SIZE + 1);
2832:
2833: /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2834: arrange for the repeat state also to be added to the relevant list.
2835: Calculate the offset, or set -1 for no repeat. */
2836:
2837: repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2838: *end_subpattern == OP_KETRMIN)?
2839: (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2840:
2841: /* If we have matched an empty string, add the next state at the
2842: current character pointer. This is important so that the duplicate
2843: checking kicks in, which is what breaks infinite loops that match an
2844: empty string. */
2845:
2846: if (charcount == 0)
2847: {
2848: ADD_ACTIVE(next_state_offset, 0);
2849: }
2850:
2851: /* Optimization: if there are no more active states, and there
2852: are no new states yet set up, then skip over the subject string
2853: right here, to save looping. Otherwise, set up the new state to swing
2854: into action when the end of the matched substring is reached. */
2855:
2856: else if (i + 1 >= active_count && new_count == 0)
2857: {
2858: ptr += charcount;
2859: clen = 0;
2860: ADD_NEW(next_state_offset, 0);
2861:
2862: /* If we are adding a repeat state at the new character position,
2863: we must fudge things so that it is the only current state.
2864: Otherwise, it might be a duplicate of one we processed before, and
2865: that would cause it to be skipped. */
2866:
2867: if (repeat_state_offset >= 0)
2868: {
2869: next_active_state = active_states;
2870: active_count = 0;
2871: i = -1;
2872: ADD_ACTIVE(repeat_state_offset, 0);
2873: }
2874: }
2875: else
2876: {
1.1.1.2 ! misho 2877: #ifdef SUPPORT_UTF
! 2878: const pcre_uchar *p = start_subject + local_offsets[0];
! 2879: const pcre_uchar *pp = start_subject + local_offsets[1];
! 2880: while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
! 2881: #endif
1.1 misho 2882: ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2883: if (repeat_state_offset >= 0)
2884: { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2885: }
2886: }
2887: else if (rc != PCRE_ERROR_NOMATCH) return rc;
2888: }
2889: break;
2890:
2891:
2892: /* ========================================================================== */
2893: /* Handle callouts */
2894:
2895: case OP_CALLOUT:
2896: rrc = 0;
1.1.1.2 ! misho 2897: if (PUBL(callout) != NULL)
1.1 misho 2898: {
1.1.1.2 ! misho 2899: PUBL(callout_block) cb;
1.1 misho 2900: cb.version = 1; /* Version 1 of the callout block */
2901: cb.callout_number = code[1];
2902: cb.offset_vector = offsets;
1.1.1.2 ! misho 2903: #ifdef COMPILE_PCRE8
1.1 misho 2904: cb.subject = (PCRE_SPTR)start_subject;
1.1.1.2 ! misho 2905: #else
! 2906: cb.subject = (PCRE_SPTR16)start_subject;
! 2907: #endif
1.1 misho 2908: cb.subject_length = (int)(end_subject - start_subject);
2909: cb.start_match = (int)(current_subject - start_subject);
2910: cb.current_position = (int)(ptr - start_subject);
2911: cb.pattern_position = GET(code, 2);
2912: cb.next_item_length = GET(code, 2 + LINK_SIZE);
2913: cb.capture_top = 1;
2914: cb.capture_last = -1;
2915: cb.callout_data = md->callout_data;
2916: cb.mark = NULL; /* No (*MARK) support */
1.1.1.2 ! misho 2917: if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
1.1 misho 2918: }
2919: if (rrc == 0)
1.1.1.2 ! misho 2920: { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
1.1 misho 2921: break;
2922:
2923:
2924: /* ========================================================================== */
2925: default: /* Unsupported opcode */
2926: return PCRE_ERROR_DFA_UITEM;
2927: }
2928:
2929: NEXT_ACTIVE_STATE: continue;
2930:
2931: } /* End of loop scanning active states */
2932:
2933: /* We have finished the processing at the current subject character. If no
2934: new states have been set for the next character, we have found all the
2935: matches that we are going to find. If we are at the top level and partial
2936: matching has been requested, check for appropriate conditions.
2937:
2938: The "forced_fail" variable counts the number of (*F) encountered for the
2939: character. If it is equal to the original active_count (saved in
2940: workspace[1]) it means that (*F) was found on every active state. In this
2941: case we don't want to give a partial match.
2942:
2943: The "could_continue" variable is true if a state could have continued but
2944: for the fact that the end of the subject was reached. */
2945:
2946: if (new_count <= 0)
2947: {
2948: if (rlevel == 1 && /* Top level, and */
2949: could_continue && /* Some could go on */
2950: forced_fail != workspace[1] && /* Not all forced fail & */
2951: ( /* either... */
2952: (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2953: || /* or... */
2954: ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2955: match_count < 0) /* no matches */
2956: ) && /* And... */
2957: ptr >= end_subject && /* Reached end of subject */
2958: ptr > md->start_used_ptr) /* Inspected non-empty string */
2959: {
2960: if (offsetcount >= 2)
2961: {
2962: offsets[0] = (int)(md->start_used_ptr - start_subject);
2963: offsets[1] = (int)(end_subject - start_subject);
2964: }
2965: match_count = PCRE_ERROR_PARTIAL;
2966: }
2967:
2968: DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2969: "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2970: rlevel*2-2, SP));
2971: break; /* In effect, "return", but see the comment below */
2972: }
2973:
2974: /* One or more states are active for the next character. */
2975:
2976: ptr += clen; /* Advance to next subject character */
2977: } /* Loop to move along the subject string */
2978:
2979: /* Control gets here from "break" a few lines above. We do it this way because
2980: if we use "return" above, we have compiler trouble. Some compilers warn if
2981: there's nothing here because they think the function doesn't return a value. On
2982: the other hand, if we put a dummy statement here, some more clever compilers
2983: complain that it can't be reached. Sigh. */
2984:
2985: return match_count;
2986: }
2987:
2988:
2989:
2990:
2991: /*************************************************
2992: * Execute a Regular Expression - DFA engine *
2993: *************************************************/
2994:
2995: /* This external function applies a compiled re to a subject string using a DFA
2996: engine. Unless the match is anchored, the internal function is called repeatedly,
2997: at successive starting positions, until it returns something other than no-match.
2998:
2999: Arguments:
3000: argument_re points to the compiled expression (must not be NULL)
3001: extra_data points to extra data or is NULL
3002: subject points to the subject string (must not be NULL)
3003: length length of subject string (may contain binary zeros)
3004: start_offset where to start in the subject string (0 <= start_offset <= length)
3005: options option bits
3006: offsets vector of match offsets (may be NULL only if offsetcount is 0)
3007: offsetcount size of same (must not be negative)
3008: workspace workspace vector (must not be NULL)
3009: wscount size of same (must be at least 20)
3010:
3011: Returns: > 0 => number of match offset pairs placed in offsets
3012: = 0 => offsets overflowed; longest matches are present
3013: -1 => failed to match
3014: < -1 => some kind of unexpected problem
3015: */
3016:
1.1.1.2 ! misho 3017: #ifdef COMPILE_PCRE8
1.1 misho 3018: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3019: pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3020: const char *subject, int length, int start_offset, int options, int *offsets,
3021: int offsetcount, int *workspace, int wscount)
1.1.1.2 ! misho 3022: #else
! 3023: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
! 3024: pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
! 3025: PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
! 3026: int offsetcount, int *workspace, int wscount)
! 3027: #endif
1.1 misho 3028: {
1.1.1.2 ! misho 3029: REAL_PCRE *re = (REAL_PCRE *)argument_re;
1.1 misho 3030: dfa_match_data match_block;
3031: dfa_match_data *md = &match_block;
1.1.1.2 ! misho 3032: BOOL utf, anchored, startline, firstline;
! 3033: const pcre_uchar *current_subject, *end_subject;
1.1 misho 3034: const pcre_study_data *study = NULL;
3035:
1.1.1.2 ! misho 3036: const pcre_uchar *req_char_ptr;
! 3037: const pcre_uint8 *start_bits = NULL;
! 3038: BOOL has_first_char = FALSE;
! 3039: BOOL has_req_char = FALSE;
! 3040: pcre_uchar first_char = 0;
! 3041: pcre_uchar first_char2 = 0;
! 3042: pcre_uchar req_char = 0;
! 3043: pcre_uchar req_char2 = 0;
1.1 misho 3044: int newline;
3045:
3046: /* Plausibility checks */
3047:
3048: if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3049: if (re == NULL || subject == NULL || workspace == NULL ||
3050: (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3051: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3052: if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3053: if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3054:
3055: /* Scan the extra_data block first, picking up the pointer to any study data.
3056: This may set two fields in the match block (callout_data and tables), so they
3057: are initialized beforehand. A match limit or recursion limit in the extra data
3058: is rejected with PCRE_ERROR_DFA_UMLIMIT. */
3059:
3060: md->tables = re->tables;
3061: md->callout_data = NULL;
3062:
3063: if (extra_data != NULL)
3064: {
3065: unsigned int flags = extra_data->flags;
3066: if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3067: study = (const pcre_study_data *)extra_data->study_data;
3068: if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3069: if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3070: return PCRE_ERROR_DFA_UMLIMIT;
3071: if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3072: md->callout_data = extra_data->callout_data;
3073: if ((flags & PCRE_EXTRA_TABLES) != 0)
3074: md->tables = extra_data->tables;
3075: }
3076:
3077: /* Check that the first field in the block is the magic number. If it is not,
1.1.1.2 ! misho 3078: return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
! 3079: REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
! 3080: means that the pattern is likely compiled with different endianness. */
1.1 misho 3081:
3082: if (re->magic_number != MAGIC_NUMBER)
1.1.1.2 ! misho 3083: return re->magic_number == REVERSED_MAGIC_NUMBER?
! 3084: PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
! 3085: if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
1.1 misho 3086:
3087: /* Set some local values */
3088:
1.1.1.2 ! misho 3089: current_subject = (const pcre_uchar *)subject + start_offset;
! 3090: end_subject = (const pcre_uchar *)subject + length;
! 3091: req_char_ptr = current_subject - 1; /* So the first "p > req_char_ptr" test below succeeds */
! 3092:
! 3093: #ifdef SUPPORT_UTF
! 3094: /* PCRE_UTF16 has the same value as PCRE_UTF8. */
! 3095: utf = (re->options & PCRE_UTF8) != 0;
1.1 misho 3096: #else
1.1.1.2 ! misho 3097: utf = FALSE;
1.1 misho 3098: #endif
3099:
3100: anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3101: (re->options & PCRE_ANCHORED) != 0;
3102:
3103: /* The remaining fixed data for passing around. */
3104:
1.1.1.2 ! misho 3105: md->start_code = (const pcre_uchar *)argument_re +
1.1 misho 3106: re->name_table_offset + re->name_count * re->name_entry_size;
1.1.1.2 ! misho 3107: md->start_subject = (const pcre_uchar *)subject;
1.1 misho 3108: md->end_subject = end_subject;
3109: md->start_offset = start_offset;
3110: md->moptions = options;
3111: md->poptions = re->options;
3112:
3113: /* If the BSR option is not set at match time, copy what was set
3114: at compile time. */
3115:
3116: if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3117: {
3118: if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3119: md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3120: #ifdef BSR_ANYCRLF
3121: else md->moptions |= PCRE_BSR_ANYCRLF;
3122: #endif
3123: }
3124:
3125: /* Handle different types of newline. The three bits give eight cases. If
3126: nothing is set at run time, whatever was used at compile time applies. */
3127:
3128: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3129: PCRE_NEWLINE_BITS)
3130: {
3131: case 0: newline = NEWLINE; break; /* Compile-time default */
3132: case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3133: case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3134: case PCRE_NEWLINE_CR+
3135: PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3136: case PCRE_NEWLINE_ANY: newline = -1; break;
3137: case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3138: default: return PCRE_ERROR_BADNEWLINE;
3139: }
3140:
3141: if (newline == -2)
3142: {
3143: md->nltype = NLTYPE_ANYCRLF;
3144: }
3145: else if (newline < 0)
3146: {
3147: md->nltype = NLTYPE_ANY;
3148: }
3149: else
3150: {
3151: md->nltype = NLTYPE_FIXED;
3152: if (newline > 255) /* Two characters, packed as (first << 8) | second */
3153: {
3154: md->nllen = 2;
3155: md->nl[0] = (newline >> 8) & 255;
3156: md->nl[1] = newline & 255;
3157: }
3158: else
3159: {
3160: md->nllen = 1;
3161: md->nl[0] = newline;
3162: }
3163: }
3164:
3165: /* Check a UTF string if required. On failure, if the offsets vector has room for
3166: two values, the error offset and the error code are passed back in offsets[0] and
offsets[1]. A start offset for which NOT_FIRSTCHAR() is true is rejected with
PCRE_ERROR_BADUTF8_OFFSET.
NOTE(review): the PCRE_UTF8_ERR5 and PCRE_ERROR_SHORTUTF8/BADUTF8 names below are
also used when this is compiled as pcre16_dfa_exec - confirm that they are the
right constants for 16-bit subjects. */
3167:
1.1.1.2 ! misho 3168: #ifdef SUPPORT_UTF
! 3169: if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
1.1 misho 3170: {
3171: int erroroffset;
1.1.1.2 ! misho 3172: int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
1.1 misho 3173: if (errorcode != 0)
3174: {
3175: if (offsetcount >= 2)
3176: {
3177: offsets[0] = erroroffset;
3178: offsets[1] = errorcode;
3179: }
3180: return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3181: PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3182: }
3183: if (start_offset > 0 && start_offset < length &&
1.1.1.2 ! misho 3184: NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
1.1 misho 3185: return PCRE_ERROR_BADUTF8_OFFSET;
3186: }
3187: #endif
3188:
3189: /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3190: is a feature that makes it possible to save compiled regex and re-use them
3191: in other programs later. */
3192:
1.1.1.2 ! misho 3193: if (md->tables == NULL) md->tables = PRIV(default_tables);
1.1 misho 3194:
1.1.1.2 ! misho 3195: /* The "must be at the start of a line" flags are used in a loop when finding
! 3196: where to start. */
1.1 misho 3197:
3198: startline = (re->flags & PCRE_STARTLINE) != 0;
3199: firstline = (re->options & PCRE_FIRSTLINE) != 0;
3200:
3201: /* Set up the first character to match, if available. The first char value is
3202: not used for an anchored match, and because anchoring may be forced at run time
3203: we have to test for it here. The first char may be unset for an unanchored
3204: pattern, of course. If there's no first char and the pattern was studied, there
3205: may be a bitmap of possible first characters. */
3206:
3207: if (!anchored)
3208: {
3209: if ((re->flags & PCRE_FIRSTSET) != 0)
3210: {
1.1.1.2 ! misho 3211: has_first_char = TRUE;
! 3212: first_char = first_char2 = (pcre_uchar)(re->first_char);
! 3213: if ((re->flags & PCRE_FCH_CASELESS) != 0)
! 3214: {
! 3215: first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
! 3216: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
! 3217: if (utf && first_char > 127)
! 3218: first_char2 = UCD_OTHERCASE(first_char);
! 3219: #endif
! 3220: }
1.1 misho 3221: }
3222: else
3223: {
3224: if (!startline && study != NULL &&
3225: (study->flags & PCRE_STUDY_MAPPED) != 0)
3226: start_bits = study->start_bits;
3227: }
3228: }
3229:
3230: /* For anchored or unanchored matches, there may be a "last known required
3231: character" set. */
3232:
3233: if ((re->flags & PCRE_REQCHSET) != 0)
3234: {
1.1.1.2 ! misho 3235: has_req_char = TRUE;
! 3236: req_char = req_char2 = (pcre_uchar)(re->req_char);
! 3237: if ((re->flags & PCRE_RCH_CASELESS) != 0)
! 3238: {
! 3239: req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
! 3240: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
! 3241: if (utf && req_char > 127)
! 3242: req_char2 = UCD_OTHERCASE(req_char);
! 3243: #endif
! 3244: }
1.1 misho 3245: }
3246:
3247: /* Call the main matching function, looping for a non-anchored regex after a
3248: failed match. If not restarting, perform certain optimizations at the start of
3249: a match. */
3250:
3251: for (;;)
3252: {
3253: int rc;
3254:
3255: if ((options & PCRE_DFA_RESTART) == 0)
3256: {
1.1.1.2 ! misho 3257: const pcre_uchar *save_end_subject = end_subject;
1.1 misho 3258:
3259: /* If firstline is TRUE, the start of the match is constrained to the first
3260: line of a multiline string. Implement this by temporarily adjusting
3261: end_subject so that we stop scanning at a newline. If the match fails at
3262: the newline, later code breaks this loop. */
3263:
3264: if (firstline)
3265: {
1.1.1.2 ! misho 3266: PCRE_PUCHAR t = current_subject;
! 3267: #ifdef SUPPORT_UTF
! 3268: if (utf)
1.1 misho 3269: {
3270: while (t < md->end_subject && !IS_NEWLINE(t))
3271: {
3272: t++;
1.1.1.2 ! misho 3273: ACROSSCHAR(t < end_subject, *t, t++);
1.1 misho 3274: }
3275: }
3276: else
3277: #endif
3278: while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3279: end_subject = t;
3280: }
3281:
3282: /* There are some optimizations that avoid running the match if a known
3283: starting point is not found. However, there is an option that disables
3284: these, for testing and for ensuring that all callouts do actually occur.
3285: The option can be set in the regex by (*NO_START_OPT) or passed in
3286: match-time options. */
3287:
3288: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3289: {
1.1.1.2 ! misho 3290: /* Advance to a known first char. */
1.1 misho 3291:
1.1.1.2 ! misho 3292: if (has_first_char)
1.1 misho 3293: {
1.1.1.2 ! misho 3294: if (first_char != first_char2)
1.1 misho 3295: while (current_subject < end_subject &&
1.1.1.2 ! misho 3296: *current_subject != first_char && *current_subject != first_char2)
1.1 misho 3297: current_subject++;
3298: else
3299: while (current_subject < end_subject &&
1.1.1.2 ! misho 3300: *current_subject != first_char)
1.1 misho 3301: current_subject++;
3302: }
3303:
3304: /* Or to just after a linebreak for a multiline match if possible */
3305:
3306: else if (startline)
3307: {
3308: if (current_subject > md->start_subject + start_offset)
3309: {
1.1.1.2 ! misho 3310: #ifdef SUPPORT_UTF
! 3311: if (utf)
1.1 misho 3312: {
3313: while (current_subject < end_subject &&
3314: !WAS_NEWLINE(current_subject))
3315: {
3316: current_subject++;
1.1.1.2 ! misho 3317: ACROSSCHAR(current_subject < end_subject, *current_subject,
! 3318: current_subject++);
1.1 misho 3319: }
3320: }
3321: else
3322: #endif
3323: while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3324: current_subject++;
3325:
3326: /* If we have just passed a CR and the newline option is ANY or
3327: ANYCRLF, and we are now at a LF, advance the match position by one
3328: more character. */
3329:
3330: if (current_subject[-1] == CHAR_CR &&
3331: (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3332: current_subject < end_subject &&
3333: *current_subject == CHAR_NL)
3334: current_subject++;
3335: }
3336: }
3337:
3338: /* Or to a non-unique first char after study */
3339:
3340: else if (start_bits != NULL)
3341: {
3342: while (current_subject < end_subject)
3343: {
3344: register unsigned int c = *current_subject;
1.1.1.2 ! misho 3345: #ifndef COMPILE_PCRE8
! 3346: if (c > 255) c = 255;
! 3347: #endif
1.1 misho 3348: if ((start_bits[c/8] & (1 << (c&7))) == 0)
3349: {
3350: current_subject++;
1.1.1.2 ! misho 3351: #if defined SUPPORT_UTF && defined COMPILE_PCRE8
! 3352: /* In non 8-bit mode, the iteration will stop for
! 3353: characters > 255 at the beginning or not stop at all. */
! 3354: if (utf)
! 3355: ACROSSCHAR(current_subject < end_subject, *current_subject,
! 3356: current_subject++);
1.1 misho 3357: #endif
3358: }
3359: else break;
3360: }
3361: }
3362: }
3363:
3364: /* Restore fudged end_subject */
3365:
3366: end_subject = save_end_subject;
3367:
3368: /* The following two optimizations are disabled for partial matching or if
3369: disabling is explicitly requested (and of course, by the test above, this
3370: code is not obeyed when restarting after a partial match). */
3371:
3372: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3373: (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3374: {
3375: /* If the pattern was studied, a minimum subject length may be set. This
3376: is a lower bound; no actual string of that length may actually match the
3377: pattern. Although the value is, strictly, in characters, we treat it as
3378: bytes to avoid spending too much time in this optimization. */
3379:
3380: if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3381: (pcre_uint32)(end_subject - current_subject) < study->minlength)
3382: return PCRE_ERROR_NOMATCH;
3383:
1.1.1.2 ! misho 3384: /* If req_char is set, we know that that character must appear in the
! 3385: subject for the match to succeed. If the first character is set, req_char
1.1 misho 3386: must be later in the subject; otherwise the test starts at the match
3387: point. This optimization can save a huge amount of work in patterns with
3388: nested unlimited repeats that aren't going to match. Writing separate
3389: code for cased/caseless versions makes it go faster, as does using an
3390: autoincrement and backing off on a match.
3391:
3392: HOWEVER: when the subject string is very, very long, searching to its end
3393: can take a long time, and give bad performance on quite ordinary
3394: patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3395: string... so we don't do this when the string is sufficiently long. */
3396:
1.1.1.2 ! misho 3397: if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
1.1 misho 3398: {
1.1.1.2 ! misho 3399: register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
1.1 misho 3400:
3401: /* We don't need to repeat the search if we haven't yet reached the
3402: place we found it at last time. */
3403:
1.1.1.2 ! misho 3404: if (p > req_char_ptr)
1.1 misho 3405: {
1.1.1.2 ! misho 3406: if (req_char != req_char2)
1.1 misho 3407: {
3408: while (p < end_subject)
3409: {
3410: register int pp = *p++;
1.1.1.2 ! misho 3411: if (pp == req_char || pp == req_char2) { p--; break; }
1.1 misho 3412: }
3413: }
3414: else
3415: {
3416: while (p < end_subject)
3417: {
1.1.1.2 ! misho 3418: if (*p++ == req_char) { p--; break; }
1.1 misho 3419: }
3420: }
3421:
3422: /* If we can't find the required character, break the matching loop,
3423: which will cause a return of PCRE_ERROR_NOMATCH. */
3424:
3425: if (p >= end_subject) break;
3426:
3427: /* If we have found the required character, save the point where we
3428: found it, so that we don't search again next time round the loop if
3429: the start hasn't passed this character yet. */
3430:
1.1.1.2 ! misho 3431: req_char_ptr = p;
1.1 misho 3432: }
3433: }
3434: }
3435: } /* End of optimizations that are done when not restarting */
3436:
3437: /* OK, now we can do the business */
3438:
3439: md->start_used_ptr = current_subject;
3440: md->recursive = NULL;
3441:
3442: rc = internal_dfa_exec(
3443: md, /* fixed match data */
3444: md->start_code, /* this subexpression's code */
3445: current_subject, /* where we currently are */
3446: start_offset, /* start offset in subject */
3447: offsets, /* offset vector */
3448: offsetcount, /* size of same */
3449: workspace, /* workspace vector */
3450: wscount, /* size of same */
3451: 0); /* function recurse level */
3452:
3453: /* Anything other than "no match" means we are done, always; otherwise, carry
3454: on only if not anchored. */
3455:
3456: if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3457:
3458: /* Advance to the next subject character unless we are at the end of a line
3459: and firstline is set. */
3460:
3461: if (firstline && IS_NEWLINE(current_subject)) break;
3462: current_subject++;
1.1.1.2 ! misho 3463: #ifdef SUPPORT_UTF
! 3464: if (utf)
1.1 misho 3465: {
1.1.1.2 ! misho 3466: ACROSSCHAR(current_subject < end_subject, *current_subject,
! 3467: current_subject++);
1.1 misho 3468: }
1.1.1.2 ! misho 3469: #endif
1.1 misho 3470: if (current_subject > end_subject) break;
3471:
3472: /* If we have just passed a CR and we are now at a LF, and the pattern does
3473: not contain any explicit matches for \r or \n, and the newline option is CRLF
3474: or ANY or ANYCRLF, advance the match position by one more character. */
3475:
3476: if (current_subject[-1] == CHAR_CR &&
3477: current_subject < end_subject &&
3478: *current_subject == CHAR_NL &&
3479: (re->flags & PCRE_HASCRORLF) == 0 &&
3480: (md->nltype == NLTYPE_ANY ||
3481: md->nltype == NLTYPE_ANYCRLF ||
3482: md->nllen == 2))
3483: current_subject++;
3484:
3485: } /* "Bumpalong" loop */
3486:
3487: return PCRE_ERROR_NOMATCH;
3488: }
3489:
3490: /* End of pcre_dfa_exec.c */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>