Annotation of embedaddon/pcre/pcre_dfa_exec.c, revision 1.1.1.1
1.1 misho 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /* PCRE is a library of functions to support regular expressions whose syntax
6: and semantics are as close as possible to those of the Perl 5 language (but see
7: below for why this module is different).
8:
9: Written by Philip Hazel
10: Copyright (c) 1997-2011 University of Cambridge
11:
12: -----------------------------------------------------------------------------
13: Redistribution and use in source and binary forms, with or without
14: modification, are permitted provided that the following conditions are met:
15:
16: * Redistributions of source code must retain the above copyright notice,
17: this list of conditions and the following disclaimer.
18:
19: * Redistributions in binary form must reproduce the above copyright
20: notice, this list of conditions and the following disclaimer in the
21: documentation and/or other materials provided with the distribution.
22:
23: * Neither the name of the University of Cambridge nor the names of its
24: contributors may be used to endorse or promote products derived from
25: this software without specific prior written permission.
26:
27: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37: POSSIBILITY OF SUCH DAMAGE.
38: -----------------------------------------------------------------------------
39: */
40:
41:
42: /* This module contains the external function pcre_dfa_exec(), which is an
43: alternative matching function that uses a sort of DFA algorithm (not a true
44: FSM). This is NOT Perl-compatible, but it has advantages in certain
45: applications. */
46:
47:
48: /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49: the performance of his patterns greatly. I could not use it as it stood, as it
50: was not thread safe, and made assumptions about pattern sizes. Also, it caused
51: test 7 to loop, and test 9 to crash with a segfault.
52:
53: The issue is the check for duplicate states, which is done by a simple linear
54: search up the state list. (Grep for "duplicate" below to find the code.) For
55: many patterns, there will never be many states active at one time, so a simple
56: linear search is fine. In patterns that have many active states, it might be a
57: bottleneck. The suggested code used an indexing scheme to remember which states
58: had previously been used for each character, and avoided the linear search when
59: it knew there was no chance of a duplicate. This was implemented when adding
60: states to the state lists.
61:
62: I wrote some thread-safe, not-limited code to try something similar at the time
63: of checking for duplicates (instead of when adding states), using index vectors
64: on the stack. It did give a 13% improvement with one specially constructed
65: pattern for certain subject strings, but on other strings and on many of the
66: simpler patterns in the test suite it did worse. The major problem, I think,
67: was the extra time to initialize the index. This had to be done for each call
68: of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69: only once - I suspect this was the cause of the problems with the tests.)
70:
71: Overall, I concluded that the gains in some cases did not outweigh the losses
72: in others, so I abandoned this code. */
73:
74:
75:
76: #ifdef HAVE_CONFIG_H
77: #include "config.h"
78: #endif
79: /* The three macros below are defined before pcre_internal.h is included; presumably that header's generic macros (such as the newline tests used later in this file) are written in terms of these names - confirm in pcre_internal.h. */
80: #define NLBLOCK md /* Block containing newline information: md, the dfa_match_data pointer in internal_dfa_exec */
81: #define PSSTART start_subject /* Field containing processed string start */
82: #define PSEND end_subject /* Field containing processed string end */
83:
84: #include "pcre_internal.h"
85:
86:
87: /* For use to indent debugging output: SP is passed as the string argument of the "%.*s" conversions in the debug print calls in this file, with rlevel*2-2 as the precision (the maximum number of characters printed). NOTE(review): in this listing the literal holds a single space, which caps the indent at one character - presumably whitespace was collapsed when the listing was produced; confirm against the real source file. */
88:
89: #define SP " "
90:
91:
92: /*************************************************
93: * Code parameters and static tables *
94: *************************************************/
95:
96: /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97: into others, under special conditions. A gap of 20 between the blocks should be
98: enough. The resulting opcodes don't have to be less than 256 because they are
99: never stored, so we push them well clear of the normal opcodes. */
100: /* In internal_dfa_exec, when codevalue >= OP_TYPESTAR and the item type that follows the opcode is one of those named beside each offset, that offset is added to codevalue before the main opcode switch. */
101: #define OP_PROP_EXTRA 300 /* added when the type is OP_PROP or OP_NOTPROP */
102: #define OP_EXTUNI_EXTRA 320 /* added when the type is OP_EXTUNI */
103: #define OP_ANYNL_EXTRA 340 /* added when the type is OP_ANYNL */
104: #define OP_HSPACE_EXTRA 360 /* added when the type is OP_HSPACE or OP_NOT_HSPACE */
105: #define OP_VSPACE_EXTRA 380 /* added when the type is OP_VSPACE or OP_NOT_VSPACE */
106:
107:
108: /* This table identifies those opcodes that are followed immediately by a
109: character that is to be tested in some way. This makes it possible to
110: centralize the loading of these characters. In the case of Type * etc, the
111: "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112: small value. Non-zero values in the table are the offsets from the opcode where
113: the character is to be found. ***NOTE*** If the start of this table is
114: modified, the three tables that follow (poptable, toptable1, toptable2) must also be modified. */
115: /* The table is indexed by opcode value. A zero entry means no inline character; for a non-zero entry internal_dfa_exec loads the character from code + coptable[opcode] (decoding it as UTF-8 when utf8 is set). The entries annotated upto, minupto, exact and upto+ use offset 3; all other non-zero entries use offset 1. sizeof(coptable) must equal OP_TABLE_LENGTH: a dummy pair of case labels in internal_dfa_exec becomes a duplicate (a compile-time error) if it does not. */
116: static const uschar coptable[] = {
117: 0, /* End */
118: 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119: 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120: 0, 0, 0, /* Any, AllAny, Anybyte */
121: 0, 0, /* \P, \p */
122: 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123: 0, /* \X */
124: 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
125: 1, /* Char */
126: 1, /* Chari */
127: 1, /* not */
128: 1, /* noti */
129: /* Positive single-char repeats */
130: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
131: 3, 3, 3, /* upto, minupto, exact */
132: 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
133: 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
134: 3, 3, 3, /* upto I, minupto I, exact I */
135: 1, 1, 1, 3, /* *+I, ++I, ?+I, upto+I */
136: /* Negative single-char repeats - only for chars < 256 */
137: 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
138: 3, 3, 3, /* NOT upto, minupto, exact */
139: 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
140: 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
141: 3, 3, 3, /* NOT upto I, minupto I, exact I */
142: 1, 1, 1, 3, /* NOT *+I, ++I, ?+I, upto+I */
143: /* Positive type repeats */
144: 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
145: 3, 3, 3, /* Type upto, minupto, exact */
146: 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
147: /* Character class & ref repeats */
148: 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
149: 0, 0, /* CRRANGE, CRMINRANGE */
150: 0, /* CLASS */
151: 0, /* NCLASS */
152: 0, /* XCLASS - variable length */
153: 0, /* REF */
154: 0, /* REFI */
155: 0, /* RECURSE */
156: 0, /* CALLOUT */
157: 0, /* Alt */
158: 0, /* Ket */
159: 0, /* KetRmax */
160: 0, /* KetRmin */
161: 0, /* KetRpos */
162: 0, /* Reverse */
163: 0, /* Assert */
164: 0, /* Assert not */
165: 0, /* Assert behind */
166: 0, /* Assert behind not */
167: 0, 0, /* ONCE, ONCE_NC */
168: 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
169: 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
170: 0, 0, /* CREF, NCREF */
171: 0, 0, /* RREF, NRREF */
172: 0, /* DEF */
173: 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
174: 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
175: 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
176: 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
177: 0, 0 /* CLOSE, SKIPZERO */
178: };
179:
180: /* This table identifies those opcodes that inspect a character. It is used to
181: remember the fact that a character could have been inspected when the end of
182: the subject is reached. ***NOTE*** If the start of this table is modified, the
183: two tables that follow (toptable1, toptable2) must also be modified. */
184: /* The table is indexed by opcode value; an entry is 1 if the opcode inspects a character and 0 otherwise. In internal_dfa_exec, when the end of the subject has been reached (clen == 0) and poptable[codevalue] is non-zero, could_continue is set to TRUE. sizeof(poptable) must equal OP_TABLE_LENGTH: a dummy pair of case labels in internal_dfa_exec becomes a duplicate (a compile-time error) if it does not. */
185: static const uschar poptable[] = {
186: 0, /* End */
187: 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
188: 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
189: 1, 1, 1, /* Any, AllAny, Anybyte */
190: 1, 1, /* \P, \p */
191: 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
192: 1, /* \X */
193: 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
194: 1, /* Char */
195: 1, /* Chari */
196: 1, /* not */
197: 1, /* noti */
198: /* Positive single-char repeats */
199: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
200: 1, 1, 1, /* upto, minupto, exact */
201: 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
202: 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
203: 1, 1, 1, /* upto I, minupto I, exact I */
204: 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
205: /* Negative single-char repeats - only for chars < 256 */
206: 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
207: 1, 1, 1, /* NOT upto, minupto, exact */
208: 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
209: 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
210: 1, 1, 1, /* NOT upto I, minupto I, exact I */
211: 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
212: /* Positive type repeats */
213: 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
214: 1, 1, 1, /* Type upto, minupto, exact */
215: 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
216: /* Character class & ref repeats */
217: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
218: 1, 1, /* CRRANGE, CRMINRANGE */
219: 1, /* CLASS */
220: 1, /* NCLASS */
221: 1, /* XCLASS - variable length */
222: 0, /* REF */
223: 0, /* REFI */
224: 0, /* RECURSE */
225: 0, /* CALLOUT */
226: 0, /* Alt */
227: 0, /* Ket */
228: 0, /* KetRmax */
229: 0, /* KetRmin */
230: 0, /* KetRpos */
231: 0, /* Reverse */
232: 0, /* Assert */
233: 0, /* Assert not */
234: 0, /* Assert behind */
235: 0, /* Assert behind not */
236: 0, 0, /* ONCE, ONCE_NC */
237: 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
238: 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
239: 0, 0, /* CREF, NCREF */
240: 0, 0, /* RREF, NRREF */
241: 0, /* DEF */
242: 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
243: 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
244: 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
245: 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
246: 0, 0 /* CLOSE, SKIPZERO */
247: };
248:
249: /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
250: and \w */
251: /* Both tables are indexed by opcode value. The test applied to a character c is ((ctypes[c] & toptable1[op]) ^ toptable2[op]) != 0. toptable1 holds the ctype bit to examine for each opcode (0 where no ctype bit is involved). */
252: static const uschar toptable1[] = {
253: 0, 0, 0, 0, 0, 0, /* the six opcodes before \D, in the order listed for coptable */
254: ctype_digit, ctype_digit, /* \D, \d */
255: ctype_space, ctype_space, /* \S, \s */
256: ctype_word, ctype_word, /* \W, \w */
257: 0, 0 /* OP_ANY, OP_ALLANY */
258: };
259: /* XOR mask for the test ((ctypes[c] & toptable1[op]) ^ toptable2[op]) != 0, indexed by opcode value. For the negated types the mask equals the bit in toptable1, so the result is non-zero only when that bit is clear in ctypes[c]; for the positive types it is 0; for OP_ANY and OP_ALLANY, where toptable1 is 0, the mask 1 makes the result always non-zero. */
260: static const uschar toptable2[] = {
261: 0, 0, 0, 0, 0, 0, /* the six opcodes before \D, in the order listed for coptable */
262: ctype_digit, 0, /* \D, \d */
263: ctype_space, 0, /* \S, \s */
264: ctype_word, 0, /* \W, \w */
265: 1, 1 /* OP_ANY, OP_ALLANY */
266: };
267:
268:
269: /* Structure for holding data about a particular state, which is in effect the
270: current data for an active path through the match tree. It must consist
271: entirely of ints because the working vector we are passed, and which we put
272: these structures in, is a vector of ints. */
273:
274: typedef struct stateblock {
275: int offset; /* Offset to opcode, relative to md->start_code; stored negated while a lookbehind branch is waiting for characters to be skipped */
276: int count; /* Count for repeats; two states are treated as duplicates only when both offset and count match */
277: int data; /* Some use extra data, e.g. the number of characters still to be skipped for a state with a negated offset */
278: } stateblock;
279: /* Number of ints occupied by one stateblock. internal_dfa_exec uses it to turn the workspace size (given in ints) into the number of stateblocks that fit in each of its two state vectors. The expression has type size_t because it is built from sizeof. */
280: #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
281:
282:
283: #ifdef PCRE_DEBUG
284: /*************************************************
285: * Print character string *
286: *************************************************/
287:
288: /* Character string printing function for debugging; compiled only when
289: PCRE_DEBUG is defined. Bytes for which isprint() is true are written as they are; every other byte is written as \x followed by two lower-case hex digits. If length is zero or negative, nothing is written.
290: Arguments:
291: p points to string; exactly length bytes are read, so no terminating zero is needed
292: length number of bytes
293: f where to print
294:
295: Returns: nothing
296: */
297:
298: static void
299: pchars(unsigned char *p, int length, FILE *f)
300: {
301: int c; /* Current byte, held as an int for isprint() and fprintf() */
302: while (length-- > 0)
303: {
304: if (isprint(c = *(p++))) /* c is never negative because p points to unsigned char */
305: fprintf(f, "%c", c);
306: else
307: fprintf(f, "\\x%02x", c); /* Non-printing byte: escaped hex form */
308: }
309: }
310: #endif /* PCRE_DEBUG */
311:
312:
313:
314: /*************************************************
315: * Execute a Regular Expression - DFA engine *
316: *************************************************/
317:
318: /* This internal function applies a compiled pattern to a subject string,
319: starting at a given point, using a DFA engine. This function is called from the
320: external one, possibly multiple times if the pattern is not anchored. The
321: function calls itself recursively for some kinds of subpattern.
322:
323: Arguments:
324: md the match_data block with fixed information
325: this_start_code the opening bracket of this subexpression's code
326: current_subject where we currently are in the subject string
327: start_offset start offset in the subject string
328: offsets vector to contain the matching string offsets
329: offsetcount size of same
330: workspace vector of workspace
331: wscount size of same
332: rlevel function call recursion level
333:
334: Returns: > 0 => number of match offset pairs placed in offsets
335: = 0 => offsets overflowed; longest matches are present
336: -1 => failed to match
337: < -1 => some kind of unexpected problem
338:
339: The following macros are used for adding states to the two state vectors (one
340: for the current character, one for the following character). */
341: /* ADD_ACTIVE(x,y): append a state with offset x and count y to the list of states for the current character; the data field of the new entry is not written. The macro relies on the enclosing function's active_count, wscount, next_active_state and rlevel variables. If the list already holds wscount states, it makes the enclosing function return PCRE_ERROR_DFA_WSSIZE. It expands to an if/else statement without a trailing semicolon, so the caller supplies one. x and y are named again in the DPRINTF call, so avoid arguments with side effects. */
342: #define ADD_ACTIVE(x,y) \
343: if (active_count++ < wscount) \
344: { \
345: next_active_state->offset = (x); \
346: next_active_state->count = (y); \
347: next_active_state++; \
348: DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
349: } \
350: else return PCRE_ERROR_DFA_WSSIZE
351: /* ADD_ACTIVE_DATA(x,y,z): append a state with offset x, count y and data z to the list of states for the current character. The macro relies on the enclosing function's active_count, wscount, next_active_state and rlevel variables. If the list already holds wscount states, it makes the enclosing function return PCRE_ERROR_DFA_WSSIZE. It expands to an if/else statement without a trailing semicolon, so the caller supplies one. x, y and z are named again in the DPRINTF call, so avoid arguments with side effects. */
352: #define ADD_ACTIVE_DATA(x,y,z) \
353: if (active_count++ < wscount) \
354: { \
355: next_active_state->offset = (x); \
356: next_active_state->count = (y); \
357: next_active_state->data = (z); \
358: next_active_state++; \
359: DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
360: } \
361: else return PCRE_ERROR_DFA_WSSIZE
362: /* ADD_NEW(x,y): append a state with offset x and count y to the list of states for the following character; the data field of the new entry is not written. The macro relies on the enclosing function's new_count, wscount, next_new_state and rlevel variables. If the list already holds wscount states, it makes the enclosing function return PCRE_ERROR_DFA_WSSIZE. It expands to an if/else statement without a trailing semicolon, so the caller supplies one. x and y are named again in the DPRINTF call, so avoid arguments with side effects. */
363: #define ADD_NEW(x,y) \
364: if (new_count++ < wscount) \
365: { \
366: next_new_state->offset = (x); \
367: next_new_state->count = (y); \
368: next_new_state++; \
369: DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
370: } \
371: else return PCRE_ERROR_DFA_WSSIZE
372: /* ADD_NEW_DATA(x,y,z): append a state with offset x, count y and data z to the list of states for the following character. The macro relies on the enclosing function's new_count, wscount, next_new_state and rlevel variables. If the list already holds wscount states, it makes the enclosing function return PCRE_ERROR_DFA_WSSIZE. It expands to an if/else statement without a trailing semicolon, so the caller supplies one. x, y and z are named again in the DPRINTF call, so avoid arguments with side effects. */
373: #define ADD_NEW_DATA(x,y,z) \
374: if (new_count++ < wscount) \
375: { \
376: next_new_state->offset = (x); \
377: next_new_state->count = (y); \
378: next_new_state->data = (z); \
379: next_new_state++; \
380: DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
381: } \
382: else return PCRE_ERROR_DFA_WSSIZE
383:
384: /* And now, here is the code */
385:
386: static int
387: internal_dfa_exec(
388: dfa_match_data *md,
389: const uschar *this_start_code,
390: const uschar *current_subject,
391: int start_offset,
392: int *offsets,
393: int offsetcount,
394: int *workspace,
395: int wscount,
396: int rlevel)
397: {
398: stateblock *active_states, *new_states, *temp_states;
399: stateblock *next_active_state, *next_new_state;
400:
401: const uschar *ctypes, *lcc, *fcc;
402: const uschar *ptr;
403: const uschar *end_code, *first_op;
404:
405: dfa_recursion_info new_recursive;
406:
407: int active_count, new_count, match_count;
408:
409: /* Some fields in the md block are frequently referenced, so we load them into
410: independent variables in the hope that this will perform better. */
411:
412: const uschar *start_subject = md->start_subject;
413: const uschar *end_subject = md->end_subject;
414: const uschar *start_code = md->start_code;
415:
416: #ifdef SUPPORT_UTF8
417: BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
418: #else
419: BOOL utf8 = FALSE;
420: #endif
421:
422: rlevel++;
423: offsetcount &= (-2);
424:
425: wscount -= 2;
426: wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
427: (2 * INTS_PER_STATEBLOCK);
428:
429: DPRINTF(("\n%.*s---------------------\n"
430: "%.*sCall to internal_dfa_exec f=%d\n",
431: rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
432:
433: ctypes = md->tables + ctypes_offset;
434: lcc = md->tables + lcc_offset;
435: fcc = md->tables + fcc_offset;
436:
437: match_count = PCRE_ERROR_NOMATCH; /* A negative number */
438:
439: active_states = (stateblock *)(workspace + 2);
440: next_new_state = new_states = active_states + wscount;
441: new_count = 0;
442:
443: first_op = this_start_code + 1 + LINK_SIZE +
444: ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
445: *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)? 2:0);
446:
447: /* The first thing in any (sub) pattern is a bracket of some sort. Push all
448: the alternative states onto the list, and find out where the end is. This
449: makes it possible to use this function recursively, when we want to stop at a
450: matching internal ket rather than at the end.
451:
452: If the first opcode in the first alternative is OP_REVERSE, we are dealing with
453: a backward assertion. In that case, we have to find out the maximum amount to
454: move back, and set up each alternative appropriately. */
455:
456: if (*first_op == OP_REVERSE)
457: {
458: int max_back = 0;
459: int gone_back;
460:
461: end_code = this_start_code;
462: do
463: {
464: int back = GET(end_code, 2+LINK_SIZE);
465: if (back > max_back) max_back = back;
466: end_code += GET(end_code, 1);
467: }
468: while (*end_code == OP_ALT);
469:
470: /* If we can't go back the amount required for the longest lookbehind
471: pattern, go back as far as we can; some alternatives may still be viable. */
472:
473: #ifdef SUPPORT_UTF8
474: /* In character mode we have to step back character by character */
475:
476: if (utf8)
477: {
478: for (gone_back = 0; gone_back < max_back; gone_back++)
479: {
480: if (current_subject <= start_subject) break;
481: current_subject--;
482: while (current_subject > start_subject &&
483: (*current_subject & 0xc0) == 0x80)
484: current_subject--;
485: }
486: }
487: else
488: #endif
489:
490: /* In byte-mode we can do this quickly. */
491:
492: {
493: gone_back = (current_subject - max_back < start_subject)?
494: (int)(current_subject - start_subject) : max_back;
495: current_subject -= gone_back;
496: }
497:
498: /* Save the earliest consulted character */
499:
500: if (current_subject < md->start_used_ptr)
501: md->start_used_ptr = current_subject;
502:
503: /* Now we can process the individual branches. */
504:
505: end_code = this_start_code;
506: do
507: {
508: int back = GET(end_code, 2+LINK_SIZE);
509: if (back <= gone_back)
510: {
511: int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
512: ADD_NEW_DATA(-bstate, 0, gone_back - back);
513: }
514: end_code += GET(end_code, 1);
515: }
516: while (*end_code == OP_ALT);
517: }
518:
519: /* This is the code for a "normal" subpattern (not a backward assertion). The
520: start of a whole pattern is always one of these. If we are at the top level,
521: we may be asked to restart matching from the same point that we reached for a
522: previous partial match. We still have to scan through the top-level branches to
523: find the end state. */
524:
525: else
526: {
527: end_code = this_start_code;
528:
529: /* Restarting */
530:
531: if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
532: {
533: do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
534: new_count = workspace[1];
535: if (!workspace[0])
536: memcpy(new_states, active_states, new_count * sizeof(stateblock));
537: }
538:
539: /* Not restarting */
540:
541: else
542: {
543: int length = 1 + LINK_SIZE +
544: ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
545: *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)?
546: 2:0);
547: do
548: {
549: ADD_NEW((int)(end_code - start_code + length), 0);
550: end_code += GET(end_code, 1);
551: length = 1 + LINK_SIZE;
552: }
553: while (*end_code == OP_ALT);
554: }
555: }
556:
557: workspace[0] = 0; /* Bit indicating which vector is current */
558:
559: DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
560:
561: /* Loop for scanning the subject */
562:
563: ptr = current_subject;
564: for (;;)
565: {
566: int i, j;
567: int clen, dlen;
568: unsigned int c, d;
569: int forced_fail = 0;
570: BOOL could_continue = FALSE;
571:
572: /* Make the new state list into the active state list and empty the
573: new state list. */
574:
575: temp_states = active_states;
576: active_states = new_states;
577: new_states = temp_states;
578: active_count = new_count;
579: new_count = 0;
580:
581: workspace[0] ^= 1; /* Remember for the restarting feature */
582: workspace[1] = active_count;
583:
584: #ifdef PCRE_DEBUG
585: printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
586: pchars((uschar *)ptr, strlen((char *)ptr), stdout);
587: printf("\"\n");
588:
589: printf("%.*sActive states: ", rlevel*2-2, SP);
590: for (i = 0; i < active_count; i++)
591: printf("%d/%d ", active_states[i].offset, active_states[i].count);
592: printf("\n");
593: #endif
594:
595: /* Set the pointers for adding new states */
596:
597: next_active_state = active_states + active_count;
598: next_new_state = new_states;
599:
600: /* Load the current character from the subject outside the loop, as many
601: different states may want to look at it, and we assume that at least one
602: will. */
603:
604: if (ptr < end_subject)
605: {
606: clen = 1; /* Number of bytes in the character */
607: #ifdef SUPPORT_UTF8
608: if (utf8) { GETCHARLEN(c, ptr, clen); } else
609: #endif /* SUPPORT_UTF8 */
610: c = *ptr;
611: }
612: else
613: {
614: clen = 0; /* This indicates the end of the subject */
615: c = NOTACHAR; /* This value should never actually be used */
616: }
617:
618: /* Scan up the active states and act on each one. The result of an action
619: may be to add more states to the currently active list (e.g. on hitting a
620: parenthesis) or it may be to put states on the new list, for considering
621: when we move the character pointer on. */
622:
623: for (i = 0; i < active_count; i++)
624: {
625: stateblock *current_state = active_states + i;
626: BOOL caseless = FALSE;
627: const uschar *code;
628: int state_offset = current_state->offset;
629: int count, codevalue, rrc;
630:
631: #ifdef PCRE_DEBUG
632: printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
633: if (clen == 0) printf("EOL\n");
634: else if (c > 32 && c < 127) printf("'%c'\n", c);
635: else printf("0x%02x\n", c);
636: #endif
637:
638: /* A negative offset is a special case meaning "hold off going to this
639: (negated) state until the number of characters in the data field have
640: been skipped". */
641:
642: if (state_offset < 0)
643: {
644: if (current_state->data > 0)
645: {
646: DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
647: ADD_NEW_DATA(state_offset, current_state->count,
648: current_state->data - 1);
649: continue;
650: }
651: else
652: {
653: current_state->offset = state_offset = -state_offset;
654: }
655: }
656:
657: /* Check for a duplicate state with the same count, and skip if found.
658: See the note at the head of this module about the possibility of improving
659: performance here. */
660:
661: for (j = 0; j < i; j++)
662: {
663: if (active_states[j].offset == state_offset &&
664: active_states[j].count == current_state->count)
665: {
666: DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
667: goto NEXT_ACTIVE_STATE;
668: }
669: }
670:
671: /* The state offset is the offset to the opcode */
672:
673: code = start_code + state_offset;
674: codevalue = *code;
675:
676: /* If this opcode inspects a character, but we are at the end of the
677: subject, remember the fact for use when testing for a partial match. */
678:
679: if (clen == 0 && poptable[codevalue] != 0)
680: could_continue = TRUE;
681:
682: /* If this opcode is followed by an inline character, load it. It is
683: tempting to test for the presence of a subject character here, but that
684: is wrong, because sometimes zero repetitions of the subject are
685: permitted.
686:
687: We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
688: argument that is not a data character - but is always one byte long. We
689: have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
690: this case. To keep the other cases fast, convert these ones to new opcodes.
691: */
692:
693: if (coptable[codevalue] > 0)
694: {
695: dlen = 1;
696: #ifdef SUPPORT_UTF8
697: if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
698: #endif /* SUPPORT_UTF8 */
699: d = code[coptable[codevalue]];
700: if (codevalue >= OP_TYPESTAR)
701: {
702: switch(d)
703: {
704: case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
705: case OP_NOTPROP:
706: case OP_PROP: codevalue += OP_PROP_EXTRA; break;
707: case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
708: case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
709: case OP_NOT_HSPACE:
710: case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
711: case OP_NOT_VSPACE:
712: case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
713: default: break;
714: }
715: }
716: }
717: else
718: {
719: dlen = 0; /* Not strictly necessary, but compilers moan */
720: d = NOTACHAR; /* if these variables are not set. */
721: }
722:
723:
724: /* Now process the individual opcodes */
725:
726: switch (codevalue)
727: {
728: /* ========================================================================== */
729: /* These cases are never obeyed. This is a fudge that causes a compile-
730: time error if the vectors coptable or poptable, which are indexed by
731: opcode, are not the correct length. It seems to be the only way to do
732: such a check at compile time, as the sizeof() operator does not work
733: in the C preprocessor. */
734:
735: case OP_TABLE_LENGTH:
736: case OP_TABLE_LENGTH +
737: ((sizeof(coptable) == OP_TABLE_LENGTH) &&
738: (sizeof(poptable) == OP_TABLE_LENGTH)):
739: break;
740:
741: /* ========================================================================== */
742: /* Reached a closing bracket. If not at the end of the pattern, carry
743: on with the next opcode. For repeating opcodes, also add the repeat
744: state. Note that KETRPOS will always be encountered at the end of the
745: subpattern, because the possessive subpattern repeats are always handled
746: using recursive calls. Thus, it never adds any new states.
747:
748: At the end of the (sub)pattern, unless we have an empty string and
749: PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
750: start of the subject, save the match data, shifting up all previous
751: matches so we always have the longest first. */
752:
753: case OP_KET:
754: case OP_KETRMIN:
755: case OP_KETRMAX:
756: case OP_KETRPOS:
757: if (code != end_code)
758: {
759: ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
760: if (codevalue != OP_KET)
761: {
762: ADD_ACTIVE(state_offset - GET(code, 1), 0);
763: }
764: }
765: else
766: {
767: if (ptr > current_subject ||
768: ((md->moptions & PCRE_NOTEMPTY) == 0 &&
769: ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
770: current_subject > start_subject + md->start_offset)))
771: {
772: if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
773: else if (match_count > 0 && ++match_count * 2 > offsetcount)
774: match_count = 0;
775: count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
776: if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
777: if (offsetcount >= 2)
778: {
779: offsets[0] = (int)(current_subject - start_subject);
780: offsets[1] = (int)(ptr - start_subject);
781: DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
782: offsets[1] - offsets[0], current_subject));
783: }
784: if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
785: {
786: DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
787: "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
788: match_count, rlevel*2-2, SP));
789: return match_count;
790: }
791: }
792: }
793: break;
794:
795: /* ========================================================================== */
796: /* These opcodes add to the current list of states without looking
797: at the current character. */
798:
799: /*-----------------------------------------------------------------*/
800: case OP_ALT:
801: do { code += GET(code, 1); } while (*code == OP_ALT);
802: ADD_ACTIVE((int)(code - start_code), 0);
803: break;
804:
805: /*-----------------------------------------------------------------*/
806: case OP_BRA:
807: case OP_SBRA:
808: do
809: {
810: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
811: code += GET(code, 1);
812: }
813: while (*code == OP_ALT);
814: break;
815:
816: /*-----------------------------------------------------------------*/
817: case OP_CBRA:
818: case OP_SCBRA:
819: ADD_ACTIVE((int)(code - start_code + 3 + LINK_SIZE), 0);
820: code += GET(code, 1);
821: while (*code == OP_ALT)
822: {
823: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
824: code += GET(code, 1);
825: }
826: break;
827:
828: /*-----------------------------------------------------------------*/
829: case OP_BRAZERO:
830: case OP_BRAMINZERO:
831: ADD_ACTIVE(state_offset + 1, 0);
832: code += 1 + GET(code, 2);
833: while (*code == OP_ALT) code += GET(code, 1);
834: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
835: break;
836:
837: /*-----------------------------------------------------------------*/
838: case OP_SKIPZERO:
839: code += 1 + GET(code, 2);
840: while (*code == OP_ALT) code += GET(code, 1);
841: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
842: break;
843:
844: /*-----------------------------------------------------------------*/
845: case OP_CIRC:
846: if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
847: { ADD_ACTIVE(state_offset + 1, 0); }
848: break;
849:
850: /*-----------------------------------------------------------------*/
851: case OP_CIRCM:
852: if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
853: (ptr != end_subject && WAS_NEWLINE(ptr)))
854: { ADD_ACTIVE(state_offset + 1, 0); }
855: break;
856:
857: /*-----------------------------------------------------------------*/
858: case OP_EOD:
859: if (ptr >= end_subject)
860: {
861: if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
862: could_continue = TRUE;
863: else { ADD_ACTIVE(state_offset + 1, 0); }
864: }
865: break;
866:
867: /*-----------------------------------------------------------------*/
868: case OP_SOD:
869: if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
870: break;
871:
872: /*-----------------------------------------------------------------*/
873: case OP_SOM:
874: if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
875: break;
876:
877:
878: /* ========================================================================== */
879: /* These opcodes inspect the next subject character, and sometimes
880: the previous one as well, but do not have an argument. The variable
881: clen contains the length of the current character and is zero if we are
882: at the end of the subject. */
883:
884: /*-----------------------------------------------------------------*/
885: case OP_ANY:
886: if (clen > 0 && !IS_NEWLINE(ptr))
887: { ADD_NEW(state_offset + 1, 0); }
888: break;
889:
890: /*-----------------------------------------------------------------*/
891: case OP_ALLANY:
892: if (clen > 0)
893: { ADD_NEW(state_offset + 1, 0); }
894: break;
895:
896: /*-----------------------------------------------------------------*/
897: case OP_EODN:
898: if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
899: could_continue = TRUE;
900: else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
901: { ADD_ACTIVE(state_offset + 1, 0); }
902: break;
903:
904: /*-----------------------------------------------------------------*/
905: case OP_DOLL:
906: if ((md->moptions & PCRE_NOTEOL) == 0)
907: {
908: if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
909: could_continue = TRUE;
910: else if (clen == 0 ||
911: ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
912: (ptr == end_subject - md->nllen)
913: ))
914: { ADD_ACTIVE(state_offset + 1, 0); }
915: }
916: break;
917:
918: /*-----------------------------------------------------------------*/
919: case OP_DOLLM:
920: if ((md->moptions & PCRE_NOTEOL) == 0)
921: {
922: if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
923: could_continue = TRUE;
924: else if (clen == 0 ||
925: ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
926: { ADD_ACTIVE(state_offset + 1, 0); }
927: }
928: else if (IS_NEWLINE(ptr))
929: { ADD_ACTIVE(state_offset + 1, 0); }
930: break;
931:
932: /*-----------------------------------------------------------------*/
933:
934: case OP_DIGIT:
935: case OP_WHITESPACE:
936: case OP_WORDCHAR:
937: if (clen > 0 && c < 256 &&
938: ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
939: { ADD_NEW(state_offset + 1, 0); }
940: break;
941:
942: /*-----------------------------------------------------------------*/
943: case OP_NOT_DIGIT:
944: case OP_NOT_WHITESPACE:
945: case OP_NOT_WORDCHAR:
946: if (clen > 0 && (c >= 256 ||
947: ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
948: { ADD_NEW(state_offset + 1, 0); }
949: break;
950:
951: /*-----------------------------------------------------------------*/
952: case OP_WORD_BOUNDARY:
953: case OP_NOT_WORD_BOUNDARY:
954: {
955: int left_word, right_word;
956:
957: if (ptr > start_subject)
958: {
959: const uschar *temp = ptr - 1;
960: if (temp < md->start_used_ptr) md->start_used_ptr = temp;
961: #ifdef SUPPORT_UTF8
962: if (utf8) BACKCHAR(temp);
963: #endif
964: GETCHARTEST(d, temp);
965: #ifdef SUPPORT_UCP
966: if ((md->poptions & PCRE_UCP) != 0)
967: {
968: if (d == '_') left_word = TRUE; else
969: {
970: int cat = UCD_CATEGORY(d);
971: left_word = (cat == ucp_L || cat == ucp_N);
972: }
973: }
974: else
975: #endif
976: left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
977: }
978: else left_word = FALSE;
979:
980: if (clen > 0)
981: {
982: #ifdef SUPPORT_UCP
983: if ((md->poptions & PCRE_UCP) != 0)
984: {
985: if (c == '_') right_word = TRUE; else
986: {
987: int cat = UCD_CATEGORY(c);
988: right_word = (cat == ucp_L || cat == ucp_N);
989: }
990: }
991: else
992: #endif
993: right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
994: }
995: else right_word = FALSE;
996:
997: if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
998: { ADD_ACTIVE(state_offset + 1, 0); }
999: }
1000: break;
1001:
1002:
1003: /*-----------------------------------------------------------------*/
1004: /* Check the next character by Unicode property. We will get here only
1005: if the support is in the binary; otherwise a compile-time error occurs.
1006: */
1007:
1008: #ifdef SUPPORT_UCP
1009: case OP_PROP:
1010: case OP_NOTPROP:
1011: if (clen > 0)
1012: {
1013: BOOL OK;
1014: const ucd_record * prop = GET_UCD(c);
1015: switch(code[1])
1016: {
1017: case PT_ANY:
1018: OK = TRUE;
1019: break;
1020:
1021: case PT_LAMP:
1022: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1023: prop->chartype == ucp_Lt;
1024: break;
1025:
1026: case PT_GC:
1027: OK = _pcre_ucp_gentype[prop->chartype] == code[2];
1028: break;
1029:
1030: case PT_PC:
1031: OK = prop->chartype == code[2];
1032: break;
1033:
1034: case PT_SC:
1035: OK = prop->script == code[2];
1036: break;
1037:
1038: /* These are specials for combination cases. */
1039:
1040: case PT_ALNUM:
1041: OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1042: _pcre_ucp_gentype[prop->chartype] == ucp_N;
1043: break;
1044:
1045: case PT_SPACE: /* Perl space */
1046: OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1047: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1048: break;
1049:
1050: case PT_PXSPACE: /* POSIX space */
1051: OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1052: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1053: c == CHAR_FF || c == CHAR_CR;
1054: break;
1055:
1056: case PT_WORD:
1057: OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1058: _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1059: c == CHAR_UNDERSCORE;
1060: break;
1061:
1062: /* Should never occur, but keep compilers from grumbling. */
1063:
1064: default:
1065: OK = codevalue != OP_PROP;
1066: break;
1067: }
1068:
1069: if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1070: }
1071: break;
1072: #endif
1073:
1074:
1075:
1076: /* ========================================================================== */
1077: /* These opcodes likewise inspect the subject character, but have an
1078: argument that is not a data character. It is one of these opcodes:
1079: OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1080: OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1081:
1082: case OP_TYPEPLUS:
1083: case OP_TYPEMINPLUS:
1084: case OP_TYPEPOSPLUS:
1085: count = current_state->count; /* Already matched */
1086: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1087: if (clen > 0)
1088: {
1089: if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1090: (c < 256 &&
1091: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1092: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1093: {
1094: if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1095: {
1096: active_count--; /* Remove non-match possibility */
1097: next_active_state--;
1098: }
1099: count++;
1100: ADD_NEW(state_offset, count);
1101: }
1102: }
1103: break;
1104:
1105: /*-----------------------------------------------------------------*/
1106: case OP_TYPEQUERY:
1107: case OP_TYPEMINQUERY:
1108: case OP_TYPEPOSQUERY:
1109: ADD_ACTIVE(state_offset + 2, 0);
1110: if (clen > 0)
1111: {
1112: if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1113: (c < 256 &&
1114: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1115: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1116: {
1117: if (codevalue == OP_TYPEPOSQUERY)
1118: {
1119: active_count--; /* Remove non-match possibility */
1120: next_active_state--;
1121: }
1122: ADD_NEW(state_offset + 2, 0);
1123: }
1124: }
1125: break;
1126:
1127: /*-----------------------------------------------------------------*/
1128: case OP_TYPESTAR:
1129: case OP_TYPEMINSTAR:
1130: case OP_TYPEPOSSTAR:
1131: ADD_ACTIVE(state_offset + 2, 0);
1132: if (clen > 0)
1133: {
1134: if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1135: (c < 256 &&
1136: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1137: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1138: {
1139: if (codevalue == OP_TYPEPOSSTAR)
1140: {
1141: active_count--; /* Remove non-match possibility */
1142: next_active_state--;
1143: }
1144: ADD_NEW(state_offset, 0);
1145: }
1146: }
1147: break;
1148:
1149: /*-----------------------------------------------------------------*/
1150: case OP_TYPEEXACT:
1151: count = current_state->count; /* Number already matched */
1152: if (clen > 0)
1153: {
1154: if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1155: (c < 256 &&
1156: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1157: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1158: {
1159: if (++count >= GET2(code, 1))
1160: { ADD_NEW(state_offset + 4, 0); }
1161: else
1162: { ADD_NEW(state_offset, count); }
1163: }
1164: }
1165: break;
1166:
1167: /*-----------------------------------------------------------------*/
1168: case OP_TYPEUPTO:
1169: case OP_TYPEMINUPTO:
1170: case OP_TYPEPOSUPTO:
1171: ADD_ACTIVE(state_offset + 4, 0);
1172: count = current_state->count; /* Number already matched */
1173: if (clen > 0)
1174: {
1175: if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1176: (c < 256 &&
1177: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1178: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1179: {
1180: if (codevalue == OP_TYPEPOSUPTO)
1181: {
1182: active_count--; /* Remove non-match possibility */
1183: next_active_state--;
1184: }
1185: if (++count >= GET2(code, 1))
1186: { ADD_NEW(state_offset + 4, 0); }
1187: else
1188: { ADD_NEW(state_offset, count); }
1189: }
1190: }
1191: break;
1192:
1193: /* ========================================================================== */
1194: /* These are virtual opcodes that are used when something like
1195: OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_VSPACE, or
1196: OP_HSPACE as its argument. It keeps the code above fast for the other
1197: cases. The argument is in the d variable. */
1198:
1199: #ifdef SUPPORT_UCP
1200: case OP_PROP_EXTRA + OP_TYPEPLUS:
1201: case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1202: case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1203: count = current_state->count; /* Already matched */
1204: if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1205: if (clen > 0)
1206: {
1207: BOOL OK;
1208: const ucd_record * prop = GET_UCD(c);
1209: switch(code[2])
1210: {
1211: case PT_ANY:
1212: OK = TRUE;
1213: break;
1214:
1215: case PT_LAMP:
1216: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1217: prop->chartype == ucp_Lt;
1218: break;
1219:
1220: case PT_GC:
1221: OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1222: break;
1223:
1224: case PT_PC:
1225: OK = prop->chartype == code[3];
1226: break;
1227:
1228: case PT_SC:
1229: OK = prop->script == code[3];
1230: break;
1231:
1232: /* These are specials for combination cases. */
1233:
1234: case PT_ALNUM:
1235: OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1236: _pcre_ucp_gentype[prop->chartype] == ucp_N;
1237: break;
1238:
1239: case PT_SPACE: /* Perl space */
1240: OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1241: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1242: break;
1243:
1244: case PT_PXSPACE: /* POSIX space */
1245: OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1246: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1247: c == CHAR_FF || c == CHAR_CR;
1248: break;
1249:
1250: case PT_WORD:
1251: OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1252: _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1253: c == CHAR_UNDERSCORE;
1254: break;
1255:
1256: /* Should never occur, but keep compilers from grumbling. */
1257:
1258: default:
1259: OK = codevalue != OP_PROP;
1260: break;
1261: }
1262:
1263: if (OK == (d == OP_PROP))
1264: {
1265: if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1266: {
1267: active_count--; /* Remove non-match possibility */
1268: next_active_state--;
1269: }
1270: count++;
1271: ADD_NEW(state_offset, count);
1272: }
1273: }
1274: break;
1275:
1276: /*-----------------------------------------------------------------*/
1277: case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1278: case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1279: case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1280: count = current_state->count; /* Already matched */
1281: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1282: if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1283: {
1284: const uschar *nptr = ptr + clen;
1285: int ncount = 0;
1286: if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1287: {
1288: active_count--; /* Remove non-match possibility */
1289: next_active_state--;
1290: }
1291: while (nptr < end_subject)
1292: {
1293: int nd;
1294: int ndlen = 1;
1295: GETCHARLEN(nd, nptr, ndlen);
1296: if (UCD_CATEGORY(nd) != ucp_M) break;
1297: ncount++;
1298: nptr += ndlen;
1299: }
1300: count++;
1301: ADD_NEW_DATA(-state_offset, count, ncount);
1302: }
1303: break;
1304: #endif
1305:
1306: /*-----------------------------------------------------------------*/
1307: case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1308: case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1309: case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1310: count = current_state->count; /* Already matched */
1311: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1312: if (clen > 0)
1313: {
1314: int ncount = 0;
1315: switch (c)
1316: {
1317: case 0x000b:
1318: case 0x000c:
1319: case 0x0085:
1320: case 0x2028:
1321: case 0x2029:
1322: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1323: goto ANYNL01;
1324:
1325: case 0x000d:
1326: if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1327: /* Fall through */
1328:
1329: ANYNL01:
1330: case 0x000a:
1331: if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1332: {
1333: active_count--; /* Remove non-match possibility */
1334: next_active_state--;
1335: }
1336: count++;
1337: ADD_NEW_DATA(-state_offset, count, ncount);
1338: break;
1339:
1340: default:
1341: break;
1342: }
1343: }
1344: break;
1345:
1346: /*-----------------------------------------------------------------*/
1347: case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1348: case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1349: case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1350: count = current_state->count; /* Already matched */
1351: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1352: if (clen > 0)
1353: {
1354: BOOL OK;
1355: switch (c)
1356: {
1357: case 0x000a:
1358: case 0x000b:
1359: case 0x000c:
1360: case 0x000d:
1361: case 0x0085:
1362: case 0x2028:
1363: case 0x2029:
1364: OK = TRUE;
1365: break;
1366:
1367: default:
1368: OK = FALSE;
1369: break;
1370: }
1371:
1372: if (OK == (d == OP_VSPACE))
1373: {
1374: if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1375: {
1376: active_count--; /* Remove non-match possibility */
1377: next_active_state--;
1378: }
1379: count++;
1380: ADD_NEW_DATA(-state_offset, count, 0);
1381: }
1382: }
1383: break;
1384:
1385: /*-----------------------------------------------------------------*/
1386: case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1387: case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1388: case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1389: count = current_state->count; /* Already matched */
1390: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1391: if (clen > 0)
1392: {
1393: BOOL OK;
1394: switch (c)
1395: {
1396: case 0x09: /* HT */
1397: case 0x20: /* SPACE */
1398: case 0xa0: /* NBSP */
1399: case 0x1680: /* OGHAM SPACE MARK */
1400: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1401: case 0x2000: /* EN QUAD */
1402: case 0x2001: /* EM QUAD */
1403: case 0x2002: /* EN SPACE */
1404: case 0x2003: /* EM SPACE */
1405: case 0x2004: /* THREE-PER-EM SPACE */
1406: case 0x2005: /* FOUR-PER-EM SPACE */
1407: case 0x2006: /* SIX-PER-EM SPACE */
1408: case 0x2007: /* FIGURE SPACE */
1409: case 0x2008: /* PUNCTUATION SPACE */
1410: case 0x2009: /* THIN SPACE */
1411: case 0x200A: /* HAIR SPACE */
1412: case 0x202f: /* NARROW NO-BREAK SPACE */
1413: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1414: case 0x3000: /* IDEOGRAPHIC SPACE */
1415: OK = TRUE;
1416: break;
1417:
1418: default:
1419: OK = FALSE;
1420: break;
1421: }
1422:
1423: if (OK == (d == OP_HSPACE))
1424: {
1425: if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1426: {
1427: active_count--; /* Remove non-match possibility */
1428: next_active_state--;
1429: }
1430: count++;
1431: ADD_NEW_DATA(-state_offset, count, 0);
1432: }
1433: }
1434: break;
1435:
1436: /*-----------------------------------------------------------------*/
1437: #ifdef SUPPORT_UCP
1438: case OP_PROP_EXTRA + OP_TYPEQUERY:
1439: case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1440: case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1441: count = 4;
1442: goto QS1;
1443:
1444: case OP_PROP_EXTRA + OP_TYPESTAR:
1445: case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1446: case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1447: count = 0;
1448:
1449: QS1:
1450:
1451: ADD_ACTIVE(state_offset + 4, 0);
1452: if (clen > 0)
1453: {
1454: BOOL OK;
1455: const ucd_record * prop = GET_UCD(c);
1456: switch(code[2])
1457: {
1458: case PT_ANY:
1459: OK = TRUE;
1460: break;
1461:
1462: case PT_LAMP:
1463: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1464: prop->chartype == ucp_Lt;
1465: break;
1466:
1467: case PT_GC:
1468: OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1469: break;
1470:
1471: case PT_PC:
1472: OK = prop->chartype == code[3];
1473: break;
1474:
1475: case PT_SC:
1476: OK = prop->script == code[3];
1477: break;
1478:
1479: /* These are specials for combination cases. */
1480:
1481: case PT_ALNUM:
1482: OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1483: _pcre_ucp_gentype[prop->chartype] == ucp_N;
1484: break;
1485:
1486: case PT_SPACE: /* Perl space */
1487: OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1488: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1489: break;
1490:
1491: case PT_PXSPACE: /* POSIX space */
1492: OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1493: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1494: c == CHAR_FF || c == CHAR_CR;
1495: break;
1496:
1497: case PT_WORD:
1498: OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1499: _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1500: c == CHAR_UNDERSCORE;
1501: break;
1502:
1503: /* Should never occur, but keep compilers from grumbling. */
1504:
1505: default:
1506: OK = codevalue != OP_PROP;
1507: break;
1508: }
1509:
1510: if (OK == (d == OP_PROP))
1511: {
1512: if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1513: codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1514: {
1515: active_count--; /* Remove non-match possibility */
1516: next_active_state--;
1517: }
1518: ADD_NEW(state_offset + count, 0);
1519: }
1520: }
1521: break;
1522:
1523: /*-----------------------------------------------------------------*/
1524: case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1525: case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1526: case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1527: count = 2;
1528: goto QS2;
1529:
1530: case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1531: case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1532: case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1533: count = 0;
1534:
1535: QS2:
1536:
1537: ADD_ACTIVE(state_offset + 2, 0);
1538: if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1539: {
1540: const uschar *nptr = ptr + clen;
1541: int ncount = 0;
1542: if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1543: codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1544: {
1545: active_count--; /* Remove non-match possibility */
1546: next_active_state--;
1547: }
1548: while (nptr < end_subject)
1549: {
1550: int nd;
1551: int ndlen = 1;
1552: GETCHARLEN(nd, nptr, ndlen);
1553: if (UCD_CATEGORY(nd) != ucp_M) break;
1554: ncount++;
1555: nptr += ndlen;
1556: }
1557: ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1558: }
1559: break;
1560: #endif
1561:
1562: /*-----------------------------------------------------------------*/
1563: case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1564: case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1565: case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1566: count = 2;
1567: goto QS3;
1568:
1569: case OP_ANYNL_EXTRA + OP_TYPESTAR:
1570: case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1571: case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1572: count = 0;
1573:
1574: QS3:
1575: ADD_ACTIVE(state_offset + 2, 0);
1576: if (clen > 0)
1577: {
1578: int ncount = 0;
1579: switch (c)
1580: {
1581: case 0x000b:
1582: case 0x000c:
1583: case 0x0085:
1584: case 0x2028:
1585: case 0x2029:
1586: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1587: goto ANYNL02;
1588:
1589: case 0x000d:
1590: if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1591: /* Fall through */
1592:
1593: ANYNL02:
1594: case 0x000a:
1595: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1596: codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1597: {
1598: active_count--; /* Remove non-match possibility */
1599: next_active_state--;
1600: }
1601: ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1602: break;
1603:
1604: default:
1605: break;
1606: }
1607: }
1608: break;
1609:
1610: /*-----------------------------------------------------------------*/
1611: case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1612: case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1613: case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1614: count = 2;
1615: goto QS4;
1616:
1617: case OP_VSPACE_EXTRA + OP_TYPESTAR:
1618: case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1619: case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1620: count = 0;
1621:
1622: QS4:
1623: ADD_ACTIVE(state_offset + 2, 0);
1624: if (clen > 0)
1625: {
1626: BOOL OK;
1627: switch (c)
1628: {
1629: case 0x000a:
1630: case 0x000b:
1631: case 0x000c:
1632: case 0x000d:
1633: case 0x0085:
1634: case 0x2028:
1635: case 0x2029:
1636: OK = TRUE;
1637: break;
1638:
1639: default:
1640: OK = FALSE;
1641: break;
1642: }
1643: if (OK == (d == OP_VSPACE))
1644: {
1645: if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1646: codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1647: {
1648: active_count--; /* Remove non-match possibility */
1649: next_active_state--;
1650: }
1651: ADD_NEW_DATA(-(state_offset + count), 0, 0);
1652: }
1653: }
1654: break;
1655:
1656: /*-----------------------------------------------------------------*/
1657: case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1658: case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1659: case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1660: count = 2;
1661: goto QS5;
1662:
1663: case OP_HSPACE_EXTRA + OP_TYPESTAR:
1664: case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1665: case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1666: count = 0;
1667:
1668: QS5:
1669: ADD_ACTIVE(state_offset + 2, 0);
1670: if (clen > 0)
1671: {
1672: BOOL OK;
1673: switch (c)
1674: {
1675: case 0x09: /* HT */
1676: case 0x20: /* SPACE */
1677: case 0xa0: /* NBSP */
1678: case 0x1680: /* OGHAM SPACE MARK */
1679: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1680: case 0x2000: /* EN QUAD */
1681: case 0x2001: /* EM QUAD */
1682: case 0x2002: /* EN SPACE */
1683: case 0x2003: /* EM SPACE */
1684: case 0x2004: /* THREE-PER-EM SPACE */
1685: case 0x2005: /* FOUR-PER-EM SPACE */
1686: case 0x2006: /* SIX-PER-EM SPACE */
1687: case 0x2007: /* FIGURE SPACE */
1688: case 0x2008: /* PUNCTUATION SPACE */
1689: case 0x2009: /* THIN SPACE */
1690: case 0x200A: /* HAIR SPACE */
1691: case 0x202f: /* NARROW NO-BREAK SPACE */
1692: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1693: case 0x3000: /* IDEOGRAPHIC SPACE */
1694: OK = TRUE;
1695: break;
1696:
1697: default:
1698: OK = FALSE;
1699: break;
1700: }
1701:
1702: if (OK == (d == OP_HSPACE))
1703: {
1704: if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1705: codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1706: {
1707: active_count--; /* Remove non-match possibility */
1708: next_active_state--;
1709: }
1710: ADD_NEW_DATA(-(state_offset + count), 0, 0);
1711: }
1712: }
1713: break;
1714:
1715: /*-----------------------------------------------------------------*/
1716: #ifdef SUPPORT_UCP
1717: case OP_PROP_EXTRA + OP_TYPEEXACT:
1718: case OP_PROP_EXTRA + OP_TYPEUPTO:
1719: case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1720: case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1721: if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1722: { ADD_ACTIVE(state_offset + 6, 0); }
1723: count = current_state->count; /* Number already matched */
1724: if (clen > 0)
1725: {
1726: BOOL OK;
1727: const ucd_record * prop = GET_UCD(c);
1728: switch(code[4])
1729: {
1730: case PT_ANY:
1731: OK = TRUE;
1732: break;
1733:
1734: case PT_LAMP:
1735: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1736: prop->chartype == ucp_Lt;
1737: break;
1738:
1739: case PT_GC:
1740: OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1741: break;
1742:
1743: case PT_PC:
1744: OK = prop->chartype == code[5];
1745: break;
1746:
1747: case PT_SC:
1748: OK = prop->script == code[5];
1749: break;
1750:
1751: /* These are specials for combination cases. */
1752:
1753: case PT_ALNUM:
1754: OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1755: _pcre_ucp_gentype[prop->chartype] == ucp_N;
1756: break;
1757:
1758: case PT_SPACE: /* Perl space */
1759: OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1760: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1761: break;
1762:
1763: case PT_PXSPACE: /* POSIX space */
1764: OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1765: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1766: c == CHAR_FF || c == CHAR_CR;
1767: break;
1768:
1769: case PT_WORD:
1770: OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1771: _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1772: c == CHAR_UNDERSCORE;
1773: break;
1774:
1775: /* Should never occur, but keep compilers from grumbling. */
1776:
1777: default:
1778: OK = codevalue != OP_PROP;
1779: break;
1780: }
1781:
1782: if (OK == (d == OP_PROP))
1783: {
1784: if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1785: {
1786: active_count--; /* Remove non-match possibility */
1787: next_active_state--;
1788: }
1789: if (++count >= GET2(code, 1))
1790: { ADD_NEW(state_offset + 6, 0); }
1791: else
1792: { ADD_NEW(state_offset, count); }
1793: }
1794: }
1795: break;
1796:
1797: /*-----------------------------------------------------------------*/
1798: case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1799: case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1800: case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1801: case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1802: if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1803: { ADD_ACTIVE(state_offset + 4, 0); }
1804: count = current_state->count; /* Number already matched */
1805: if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1806: {
1807: const uschar *nptr = ptr + clen;
1808: int ncount = 0;
1809: if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1810: {
1811: active_count--; /* Remove non-match possibility */
1812: next_active_state--;
1813: }
1814: while (nptr < end_subject)
1815: {
1816: int nd;
1817: int ndlen = 1;
1818: GETCHARLEN(nd, nptr, ndlen);
1819: if (UCD_CATEGORY(nd) != ucp_M) break;
1820: ncount++;
1821: nptr += ndlen;
1822: }
1823: if (++count >= GET2(code, 1))
1824: { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1825: else
1826: { ADD_NEW_DATA(-state_offset, count, ncount); }
1827: }
1828: break;
1829: #endif
1830:
1831: /*-----------------------------------------------------------------*/
1832: case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1833: case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1834: case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1835: case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1836: if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1837: { ADD_ACTIVE(state_offset + 4, 0); }
1838: count = current_state->count; /* Number already matched */
1839: if (clen > 0)
1840: {
1841: int ncount = 0;
1842: switch (c)
1843: {
1844: case 0x000b:
1845: case 0x000c:
1846: case 0x0085:
1847: case 0x2028:
1848: case 0x2029:
1849: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1850: goto ANYNL03;
1851:
1852: case 0x000d:
1853: if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1854: /* Fall through */
1855:
1856: ANYNL03:
1857: case 0x000a:
1858: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1859: {
1860: active_count--; /* Remove non-match possibility */
1861: next_active_state--;
1862: }
1863: if (++count >= GET2(code, 1))
1864: { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1865: else
1866: { ADD_NEW_DATA(-state_offset, count, ncount); }
1867: break;
1868:
1869: default:
1870: break;
1871: }
1872: }
1873: break;
1874:
1875: /*-----------------------------------------------------------------*/
1876: case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1877: case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1878: case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1879: case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1880: if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1881: { ADD_ACTIVE(state_offset + 4, 0); }
1882: count = current_state->count; /* Number already matched */
1883: if (clen > 0)
1884: {
1885: BOOL OK;
1886: switch (c)
1887: {
1888: case 0x000a:
1889: case 0x000b:
1890: case 0x000c:
1891: case 0x000d:
1892: case 0x0085:
1893: case 0x2028:
1894: case 0x2029:
1895: OK = TRUE;
1896: break;
1897:
1898: default:
1899: OK = FALSE;
1900: }
1901:
1902: if (OK == (d == OP_VSPACE))
1903: {
1904: if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1905: {
1906: active_count--; /* Remove non-match possibility */
1907: next_active_state--;
1908: }
1909: if (++count >= GET2(code, 1))
1910: { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1911: else
1912: { ADD_NEW_DATA(-state_offset, count, 0); }
1913: }
1914: }
1915: break;
1916:
1917: /*-----------------------------------------------------------------*/
1918: case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1919: case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1920: case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1921: case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1922: if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1923: { ADD_ACTIVE(state_offset + 4, 0); }
1924: count = current_state->count; /* Number already matched */
1925: if (clen > 0)
1926: {
1927: BOOL OK;
1928: switch (c)
1929: {
1930: case 0x09: /* HT */
1931: case 0x20: /* SPACE */
1932: case 0xa0: /* NBSP */
1933: case 0x1680: /* OGHAM SPACE MARK */
1934: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1935: case 0x2000: /* EN QUAD */
1936: case 0x2001: /* EM QUAD */
1937: case 0x2002: /* EN SPACE */
1938: case 0x2003: /* EM SPACE */
1939: case 0x2004: /* THREE-PER-EM SPACE */
1940: case 0x2005: /* FOUR-PER-EM SPACE */
1941: case 0x2006: /* SIX-PER-EM SPACE */
1942: case 0x2007: /* FIGURE SPACE */
1943: case 0x2008: /* PUNCTUATION SPACE */
1944: case 0x2009: /* THIN SPACE */
1945: case 0x200A: /* HAIR SPACE */
1946: case 0x202f: /* NARROW NO-BREAK SPACE */
1947: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1948: case 0x3000: /* IDEOGRAPHIC SPACE */
1949: OK = TRUE;
1950: break;
1951:
1952: default:
1953: OK = FALSE;
1954: break;
1955: }
1956:
1957: if (OK == (d == OP_HSPACE))
1958: {
1959: if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1960: {
1961: active_count--; /* Remove non-match possibility */
1962: next_active_state--;
1963: }
1964: if (++count >= GET2(code, 1))
1965: { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1966: else
1967: { ADD_NEW_DATA(-state_offset, count, 0); }
1968: }
1969: }
1970: break;
1971:
1972: /* ========================================================================== */
1973: /* These opcodes are followed by a character that is usually compared
1974: to the current subject character; it is loaded into d. We still get
1975: here even if there is no subject character, because in some cases zero
1976: repetitions are permitted. */
1977:
1978: /*-----------------------------------------------------------------*/
1979: case OP_CHAR:
1980: if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1981: break;
1982:
1983: /*-----------------------------------------------------------------*/
1984: case OP_CHARI:
1985: if (clen == 0) break;
1986:
1987: #ifdef SUPPORT_UTF8
1988: if (utf8)
1989: {
1990: if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1991: {
1992: unsigned int othercase;
1993: if (c < 128) othercase = fcc[c]; else
1994:
1995: /* If we have Unicode property support, we can use it to test the
1996: other case of the character. */
1997:
1998: #ifdef SUPPORT_UCP
1999: othercase = UCD_OTHERCASE(c);
2000: #else
2001: othercase = NOTACHAR;
2002: #endif
2003:
2004: if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2005: }
2006: }
2007: else
2008: #endif /* SUPPORT_UTF8 */
2009:
2010: /* Non-UTF-8 mode */
2011: {
2012: if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
2013: }
2014: break;
2015:
2016:
2017: #ifdef SUPPORT_UCP
2018: /*-----------------------------------------------------------------*/
2019: /* This is a tricky one because it can match more than one character.
2020: Find out how many characters to skip, and then set up a negative state
2021: to wait for them to pass before continuing. */
2022:
2023: case OP_EXTUNI:
2024: if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2025: {
2026: const uschar *nptr = ptr + clen;
2027: int ncount = 0;
2028: while (nptr < end_subject)
2029: {
2030: int nclen = 1;
2031: GETCHARLEN(c, nptr, nclen);
2032: if (UCD_CATEGORY(c) != ucp_M) break;
2033: ncount++;
2034: nptr += nclen;
2035: }
2036: ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2037: }
2038: break;
2039: #endif
2040:
2041: /*-----------------------------------------------------------------*/
2042: /* This is a tricky one like EXTUNI because it too can match more than one
2043: character (when CR is followed by LF). In this case, set up a negative
2044: state to wait for one character to pass before continuing. */
2045:
2046: case OP_ANYNL:
2047: if (clen > 0) switch(c)
2048: {
2049: case 0x000b:
2050: case 0x000c:
2051: case 0x0085:
2052: case 0x2028:
2053: case 0x2029:
2054: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2055:
2056: case 0x000a:
2057: ADD_NEW(state_offset + 1, 0);
2058: break;
2059:
2060: case 0x000d:
2061: if (ptr + 1 < end_subject && ptr[1] == 0x0a)
2062: {
2063: ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2064: }
2065: else
2066: {
2067: ADD_NEW(state_offset + 1, 0);
2068: }
2069: break;
2070: }
2071: break;
2072:
2073: /*-----------------------------------------------------------------*/
2074: case OP_NOT_VSPACE:
2075: if (clen > 0) switch(c)
2076: {
2077: case 0x000a:
2078: case 0x000b:
2079: case 0x000c:
2080: case 0x000d:
2081: case 0x0085:
2082: case 0x2028:
2083: case 0x2029:
2084: break;
2085:
2086: default:
2087: ADD_NEW(state_offset + 1, 0);
2088: break;
2089: }
2090: break;
2091:
2092: /*-----------------------------------------------------------------*/
2093: case OP_VSPACE:
2094: if (clen > 0) switch(c)
2095: {
2096: case 0x000a:
2097: case 0x000b:
2098: case 0x000c:
2099: case 0x000d:
2100: case 0x0085:
2101: case 0x2028:
2102: case 0x2029:
2103: ADD_NEW(state_offset + 1, 0);
2104: break;
2105:
2106: default: break;
2107: }
2108: break;
2109:
2110: /*-----------------------------------------------------------------*/
2111: case OP_NOT_HSPACE:
2112: if (clen > 0) switch(c)
2113: {
2114: case 0x09: /* HT */
2115: case 0x20: /* SPACE */
2116: case 0xa0: /* NBSP */
2117: case 0x1680: /* OGHAM SPACE MARK */
2118: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2119: case 0x2000: /* EN QUAD */
2120: case 0x2001: /* EM QUAD */
2121: case 0x2002: /* EN SPACE */
2122: case 0x2003: /* EM SPACE */
2123: case 0x2004: /* THREE-PER-EM SPACE */
2124: case 0x2005: /* FOUR-PER-EM SPACE */
2125: case 0x2006: /* SIX-PER-EM SPACE */
2126: case 0x2007: /* FIGURE SPACE */
2127: case 0x2008: /* PUNCTUATION SPACE */
2128: case 0x2009: /* THIN SPACE */
2129: case 0x200A: /* HAIR SPACE */
2130: case 0x202f: /* NARROW NO-BREAK SPACE */
2131: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2132: case 0x3000: /* IDEOGRAPHIC SPACE */
2133: break;
2134:
2135: default:
2136: ADD_NEW(state_offset + 1, 0);
2137: break;
2138: }
2139: break;
2140:
2141: /*-----------------------------------------------------------------*/
2142: case OP_HSPACE:
2143: if (clen > 0) switch(c)
2144: {
2145: case 0x09: /* HT */
2146: case 0x20: /* SPACE */
2147: case 0xa0: /* NBSP */
2148: case 0x1680: /* OGHAM SPACE MARK */
2149: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2150: case 0x2000: /* EN QUAD */
2151: case 0x2001: /* EM QUAD */
2152: case 0x2002: /* EN SPACE */
2153: case 0x2003: /* EM SPACE */
2154: case 0x2004: /* THREE-PER-EM SPACE */
2155: case 0x2005: /* FOUR-PER-EM SPACE */
2156: case 0x2006: /* SIX-PER-EM SPACE */
2157: case 0x2007: /* FIGURE SPACE */
2158: case 0x2008: /* PUNCTUATION SPACE */
2159: case 0x2009: /* THIN SPACE */
2160: case 0x200A: /* HAIR SPACE */
2161: case 0x202f: /* NARROW NO-BREAK SPACE */
2162: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2163: case 0x3000: /* IDEOGRAPHIC SPACE */
2164: ADD_NEW(state_offset + 1, 0);
2165: break;
2166: }
2167: break;
2168:
2169: /*-----------------------------------------------------------------*/
2170: /* Match a negated single character casefully. This is only used for
2171: one-byte characters, that is, we know that d < 256. The character we are
2172: checking (c) can be multibyte. */
2173:
2174: case OP_NOT:
2175: if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2176: break;
2177:
2178: /*-----------------------------------------------------------------*/
2179: /* Match a negated single character caselessly. This is only used for
2180: one-byte characters, that is, we know that d < 256. The character we are
2181: checking (c) can be multibyte. */
2182:
2183: case OP_NOTI:
2184: if (clen > 0 && c != d && c != fcc[d])
2185: { ADD_NEW(state_offset + dlen + 1, 0); }
2186: break;
2187:
2188: /*-----------------------------------------------------------------*/
2189: case OP_PLUSI:
2190: case OP_MINPLUSI:
2191: case OP_POSPLUSI:
2192: case OP_NOTPLUSI:
2193: case OP_NOTMINPLUSI:
2194: case OP_NOTPOSPLUSI:
2195: caseless = TRUE;
2196: codevalue -= OP_STARI - OP_STAR;
2197:
2198: /* Fall through */
2199: case OP_PLUS:
2200: case OP_MINPLUS:
2201: case OP_POSPLUS:
2202: case OP_NOTPLUS:
2203: case OP_NOTMINPLUS:
2204: case OP_NOTPOSPLUS:
2205: count = current_state->count; /* Already matched */
2206: if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2207: if (clen > 0)
2208: {
2209: unsigned int otherd = NOTACHAR;
2210: if (caseless)
2211: {
2212: #ifdef SUPPORT_UTF8
2213: if (utf8 && d >= 128)
2214: {
2215: #ifdef SUPPORT_UCP
2216: otherd = UCD_OTHERCASE(d);
2217: #endif /* SUPPORT_UCP */
2218: }
2219: else
2220: #endif /* SUPPORT_UTF8 */
2221: otherd = fcc[d];
2222: }
2223: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2224: {
2225: if (count > 0 &&
2226: (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2227: {
2228: active_count--; /* Remove non-match possibility */
2229: next_active_state--;
2230: }
2231: count++;
2232: ADD_NEW(state_offset, count);
2233: }
2234: }
2235: break;
2236:
2237: /*-----------------------------------------------------------------*/
2238: case OP_QUERYI:
2239: case OP_MINQUERYI:
2240: case OP_POSQUERYI:
2241: case OP_NOTQUERYI:
2242: case OP_NOTMINQUERYI:
2243: case OP_NOTPOSQUERYI:
2244: caseless = TRUE;
2245: codevalue -= OP_STARI - OP_STAR;
2246: /* Fall through */
2247: case OP_QUERY:
2248: case OP_MINQUERY:
2249: case OP_POSQUERY:
2250: case OP_NOTQUERY:
2251: case OP_NOTMINQUERY:
2252: case OP_NOTPOSQUERY:
2253: ADD_ACTIVE(state_offset + dlen + 1, 0);
2254: if (clen > 0)
2255: {
2256: unsigned int otherd = NOTACHAR;
2257: if (caseless)
2258: {
2259: #ifdef SUPPORT_UTF8
2260: if (utf8 && d >= 128)
2261: {
2262: #ifdef SUPPORT_UCP
2263: otherd = UCD_OTHERCASE(d);
2264: #endif /* SUPPORT_UCP */
2265: }
2266: else
2267: #endif /* SUPPORT_UTF8 */
2268: otherd = fcc[d];
2269: }
2270: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2271: {
2272: if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2273: {
2274: active_count--; /* Remove non-match possibility */
2275: next_active_state--;
2276: }
2277: ADD_NEW(state_offset + dlen + 1, 0);
2278: }
2279: }
2280: break;
2281:
2282: /*-----------------------------------------------------------------*/
2283: case OP_STARI:
2284: case OP_MINSTARI:
2285: case OP_POSSTARI:
2286: case OP_NOTSTARI:
2287: case OP_NOTMINSTARI:
2288: case OP_NOTPOSSTARI:
2289: caseless = TRUE;
2290: codevalue -= OP_STARI - OP_STAR;
2291: /* Fall through */
2292: case OP_STAR:
2293: case OP_MINSTAR:
2294: case OP_POSSTAR:
2295: case OP_NOTSTAR:
2296: case OP_NOTMINSTAR:
2297: case OP_NOTPOSSTAR:
2298: ADD_ACTIVE(state_offset + dlen + 1, 0);
2299: if (clen > 0)
2300: {
2301: unsigned int otherd = NOTACHAR;
2302: if (caseless)
2303: {
2304: #ifdef SUPPORT_UTF8
2305: if (utf8 && d >= 128)
2306: {
2307: #ifdef SUPPORT_UCP
2308: otherd = UCD_OTHERCASE(d);
2309: #endif /* SUPPORT_UCP */
2310: }
2311: else
2312: #endif /* SUPPORT_UTF8 */
2313: otherd = fcc[d];
2314: }
2315: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2316: {
2317: if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2318: {
2319: active_count--; /* Remove non-match possibility */
2320: next_active_state--;
2321: }
2322: ADD_NEW(state_offset, 0);
2323: }
2324: }
2325: break;
2326:
2327: /*-----------------------------------------------------------------*/
2328: case OP_EXACTI:
2329: case OP_NOTEXACTI:
2330: caseless = TRUE;
2331: codevalue -= OP_STARI - OP_STAR;
2332: /* Fall through */
2333: case OP_EXACT:
2334: case OP_NOTEXACT:
2335: count = current_state->count; /* Number already matched */
2336: if (clen > 0)
2337: {
2338: unsigned int otherd = NOTACHAR;
2339: if (caseless)
2340: {
2341: #ifdef SUPPORT_UTF8
2342: if (utf8 && d >= 128)
2343: {
2344: #ifdef SUPPORT_UCP
2345: otherd = UCD_OTHERCASE(d);
2346: #endif /* SUPPORT_UCP */
2347: }
2348: else
2349: #endif /* SUPPORT_UTF8 */
2350: otherd = fcc[d];
2351: }
2352: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2353: {
2354: if (++count >= GET2(code, 1))
2355: { ADD_NEW(state_offset + dlen + 3, 0); }
2356: else
2357: { ADD_NEW(state_offset, count); }
2358: }
2359: }
2360: break;
2361:
2362: /*-----------------------------------------------------------------*/
2363: case OP_UPTOI:
2364: case OP_MINUPTOI:
2365: case OP_POSUPTOI:
2366: case OP_NOTUPTOI:
2367: case OP_NOTMINUPTOI:
2368: case OP_NOTPOSUPTOI:
2369: caseless = TRUE;
2370: codevalue -= OP_STARI - OP_STAR;
2371: /* Fall through */
2372: case OP_UPTO:
2373: case OP_MINUPTO:
2374: case OP_POSUPTO:
2375: case OP_NOTUPTO:
2376: case OP_NOTMINUPTO:
2377: case OP_NOTPOSUPTO:
2378: ADD_ACTIVE(state_offset + dlen + 3, 0);
2379: count = current_state->count; /* Number already matched */
2380: if (clen > 0)
2381: {
2382: unsigned int otherd = NOTACHAR;
2383: if (caseless)
2384: {
2385: #ifdef SUPPORT_UTF8
2386: if (utf8 && d >= 128)
2387: {
2388: #ifdef SUPPORT_UCP
2389: otherd = UCD_OTHERCASE(d);
2390: #endif /* SUPPORT_UCP */
2391: }
2392: else
2393: #endif /* SUPPORT_UTF8 */
2394: otherd = fcc[d];
2395: }
2396: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2397: {
2398: if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2399: {
2400: active_count--; /* Remove non-match possibility */
2401: next_active_state--;
2402: }
2403: if (++count >= GET2(code, 1))
2404: { ADD_NEW(state_offset + dlen + 3, 0); }
2405: else
2406: { ADD_NEW(state_offset, count); }
2407: }
2408: }
2409: break;
2410:
2411:
2412: /* ========================================================================== */
2413: /* These are the class-handling opcodes */
2414:
2415: case OP_CLASS:
2416: case OP_NCLASS:
2417: case OP_XCLASS:
2418: {
2419: BOOL isinclass = FALSE;
2420: int next_state_offset;
2421: const uschar *ecode;
2422:
2423: /* For a simple class, there is always just a 32-byte table, and we
2424: can set isinclass from it. */
2425:
2426: if (codevalue != OP_XCLASS)
2427: {
2428: ecode = code + 33;
2429: if (clen > 0)
2430: {
2431: isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2432: ((code[1 + c/8] & (1 << (c&7))) != 0);
2433: }
2434: }
2435:
2436: /* An extended class may have a table or a list of single characters,
2437: ranges, or both, and it may be positive or negative. There's a
2438: function that sorts all this out. */
2439:
2440: else
2441: {
2442: ecode = code + GET(code, 1);
2443: if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2444: }
2445:
2446: /* At this point, isinclass is set for all kinds of class, and ecode
2447: points to the byte after the end of the class. If there is a
2448: quantifier, this is where it will be. */
2449:
2450: next_state_offset = (int)(ecode - start_code);
2451:
2452: switch (*ecode)
2453: {
2454: case OP_CRSTAR:
2455: case OP_CRMINSTAR:
2456: ADD_ACTIVE(next_state_offset + 1, 0);
2457: if (isinclass) { ADD_NEW(state_offset, 0); }
2458: break;
2459:
2460: case OP_CRPLUS:
2461: case OP_CRMINPLUS:
2462: count = current_state->count; /* Already matched */
2463: if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2464: if (isinclass) { count++; ADD_NEW(state_offset, count); }
2465: break;
2466:
2467: case OP_CRQUERY:
2468: case OP_CRMINQUERY:
2469: ADD_ACTIVE(next_state_offset + 1, 0);
2470: if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2471: break;
2472:
2473: case OP_CRRANGE:
2474: case OP_CRMINRANGE:
2475: count = current_state->count; /* Already matched */
2476: if (count >= GET2(ecode, 1))
2477: { ADD_ACTIVE(next_state_offset + 5, 0); }
2478: if (isinclass)
2479: {
2480: int max = GET2(ecode, 3);
2481: if (++count >= max && max != 0) /* Max 0 => no limit */
2482: { ADD_NEW(next_state_offset + 5, 0); }
2483: else
2484: { ADD_NEW(state_offset, count); }
2485: }
2486: break;
2487:
2488: default:
2489: if (isinclass) { ADD_NEW(next_state_offset, 0); }
2490: break;
2491: }
2492: }
2493: break;
2494:
2495: /* ========================================================================== */
2496: /* These are the opcodes for fancy brackets of various kinds. We have
2497: to use recursion in order to handle them. The "always failing" assertion
2498: (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2499: though the other "backtracking verbs" are not supported. */
2500:
2501: case OP_FAIL:
2502: forced_fail++; /* Count FAILs for multiple states */
2503: break;
2504:
2505: case OP_ASSERT:
2506: case OP_ASSERT_NOT:
2507: case OP_ASSERTBACK:
2508: case OP_ASSERTBACK_NOT:
2509: {
2510: int rc;
2511: int local_offsets[2];
2512: int local_workspace[1000];
2513: const uschar *endasscode = code + GET(code, 1);
2514:
2515: while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2516:
2517: rc = internal_dfa_exec(
2518: md, /* static match data */
2519: code, /* this subexpression's code */
2520: ptr, /* where we currently are */
2521: (int)(ptr - start_subject), /* start offset */
2522: local_offsets, /* offset vector */
2523: sizeof(local_offsets)/sizeof(int), /* size of same */
2524: local_workspace, /* workspace vector */
2525: sizeof(local_workspace)/sizeof(int), /* size of same */
2526: rlevel); /* function recursion level */
2527:
2528: if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2529: if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2530: { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2531: }
2532: break;
2533:
2534: /*-----------------------------------------------------------------*/
2535: case OP_COND:
2536: case OP_SCOND:
2537: {
2538: int local_offsets[1000];
2539: int local_workspace[1000];
2540: int codelink = GET(code, 1);
2541: int condcode;
2542:
2543: /* Because of the way auto-callout works during compile, a callout item
2544: is inserted between OP_COND and an assertion condition. This does not
2545: happen for the other conditions. */
2546:
2547: if (code[LINK_SIZE+1] == OP_CALLOUT)
2548: {
2549: rrc = 0;
2550: if (pcre_callout != NULL)
2551: {
2552: pcre_callout_block cb;
2553: cb.version = 1; /* Version 1 of the callout block */
2554: cb.callout_number = code[LINK_SIZE+2];
2555: cb.offset_vector = offsets;
2556: cb.subject = (PCRE_SPTR)start_subject;
2557: cb.subject_length = (int)(end_subject - start_subject);
2558: cb.start_match = (int)(current_subject - start_subject);
2559: cb.current_position = (int)(ptr - start_subject);
2560: cb.pattern_position = GET(code, LINK_SIZE + 3);
2561: cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2562: cb.capture_top = 1;
2563: cb.capture_last = -1;
2564: cb.callout_data = md->callout_data;
2565: cb.mark = NULL; /* No (*MARK) support */
2566: if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2567: }
2568: if (rrc > 0) break; /* Fail this thread */
2569: code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
2570: }
2571:
2572: condcode = code[LINK_SIZE+1];
2573:
2574: /* Back reference conditions are not supported */
2575:
2576: if (condcode == OP_CREF || condcode == OP_NCREF)
2577: return PCRE_ERROR_DFA_UCOND;
2578:
2579: /* The DEFINE condition is always false */
2580:
2581: if (condcode == OP_DEF)
2582: { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2583:
2584: /* The only supported version of OP_RREF is for the value RREF_ANY,
2585: which means "test if in any recursion". We can't test for specifically
2586: recursed groups. */
2587:
2588: else if (condcode == OP_RREF || condcode == OP_NRREF)
2589: {
2590: int value = GET2(code, LINK_SIZE+2);
2591: if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2592: if (md->recursive != NULL)
2593: { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2594: else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2595: }
2596:
2597: /* Otherwise, the condition is an assertion */
2598:
2599: else
2600: {
2601: int rc;
2602: const uschar *asscode = code + LINK_SIZE + 1;
2603: const uschar *endasscode = asscode + GET(asscode, 1);
2604:
2605: while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2606:
2607: rc = internal_dfa_exec(
2608: md, /* fixed match data */
2609: asscode, /* this subexpression's code */
2610: ptr, /* where we currently are */
2611: (int)(ptr - start_subject), /* start offset */
2612: local_offsets, /* offset vector */
2613: sizeof(local_offsets)/sizeof(int), /* size of same */
2614: local_workspace, /* workspace vector */
2615: sizeof(local_workspace)/sizeof(int), /* size of same */
2616: rlevel); /* function recursion level */
2617:
2618: if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2619: if ((rc >= 0) ==
2620: (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2621: { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2622: else
2623: { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2624: }
2625: }
2626: break;
2627:
2628: /*-----------------------------------------------------------------*/
2629: case OP_RECURSE:
2630: {
2631: dfa_recursion_info *ri;
2632: int local_offsets[1000];
2633: int local_workspace[1000];
2634: const uschar *callpat = start_code + GET(code, 1);
2635: int recno = (callpat == md->start_code)? 0 :
2636: GET2(callpat, 1 + LINK_SIZE);
2637: int rc;
2638:
2639: DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2640:
2641: /* Check for repeating a recursion without advancing the subject
2642: pointer. This should catch convoluted mutual recursions. (Some simple
2643: cases are caught at compile time.) */
2644:
2645: for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2646: if (recno == ri->group_num && ptr == ri->subject_position)
2647: return PCRE_ERROR_RECURSELOOP;
2648:
2649: /* Remember this recursion and where we started it so as to
2650: catch infinite loops. */
2651:
2652: new_recursive.group_num = recno;
2653: new_recursive.subject_position = ptr;
2654: new_recursive.prevrec = md->recursive;
2655: md->recursive = &new_recursive;
2656:
2657: rc = internal_dfa_exec(
2658: md, /* fixed match data */
2659: callpat, /* this subexpression's code */
2660: ptr, /* where we currently are */
2661: (int)(ptr - start_subject), /* start offset */
2662: local_offsets, /* offset vector */
2663: sizeof(local_offsets)/sizeof(int), /* size of same */
2664: local_workspace, /* workspace vector */
2665: sizeof(local_workspace)/sizeof(int), /* size of same */
2666: rlevel); /* function recursion level */
2667:
2668: md->recursive = new_recursive.prevrec; /* Done this recursion */
2669:
2670: DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2671: rc));
2672:
2673: /* Ran out of internal offsets */
2674:
2675: if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2676:
2677: /* For each successful matched substring, set up the next state with a
2678: count of characters to skip before trying it. Note that the count is in
2679: characters, not bytes. */
2680:
2681: if (rc > 0)
2682: {
2683: for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2684: {
2685: const uschar *p = start_subject + local_offsets[rc];
2686: const uschar *pp = start_subject + local_offsets[rc+1];
2687: int charcount = local_offsets[rc+1] - local_offsets[rc];
2688: while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2689: if (charcount > 0)
2690: {
2691: ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2692: }
2693: else
2694: {
2695: ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2696: }
2697: }
2698: }
2699: else if (rc != PCRE_ERROR_NOMATCH) return rc;
2700: }
2701: break;
2702:
2703: /*-----------------------------------------------------------------*/
2704: case OP_BRAPOS:
2705: case OP_SBRAPOS:
2706: case OP_CBRAPOS:
2707: case OP_SCBRAPOS:
2708: case OP_BRAPOSZERO:
2709: {
2710: int charcount, matched_count;
2711: const uschar *local_ptr = ptr;
2712: BOOL allow_zero;
2713:
2714: if (codevalue == OP_BRAPOSZERO)
2715: {
2716: allow_zero = TRUE;
2717: codevalue = *(++code); /* Codevalue will be one of above BRAs */
2718: }
2719: else allow_zero = FALSE;
2720:
2721: /* Loop to match the subpattern as many times as possible as if it were
2722: a complete pattern. */
2723:
2724: for (matched_count = 0;; matched_count++)
2725: {
2726: int local_offsets[2];
2727: int local_workspace[1000];
2728:
2729: int rc = internal_dfa_exec(
2730: md, /* fixed match data */
2731: code, /* this subexpression's code */
2732: local_ptr, /* where we currently are */
2733: (int)(ptr - start_subject), /* start offset */
2734: local_offsets, /* offset vector */
2735: sizeof(local_offsets)/sizeof(int), /* size of same */
2736: local_workspace, /* workspace vector */
2737: sizeof(local_workspace)/sizeof(int), /* size of same */
2738: rlevel); /* function recursion level */
2739:
2740: /* Failed to match */
2741:
2742: if (rc < 0)
2743: {
2744: if (rc != PCRE_ERROR_NOMATCH) return rc;
2745: break;
2746: }
2747:
2748: /* Matched: break the loop if zero characters matched. */
2749:
2750: charcount = local_offsets[1] - local_offsets[0];
2751: if (charcount == 0) break;
2752: local_ptr += charcount; /* Advance temporary position ptr */
2753: }
2754:
2755: /* At this point we have matched the subpattern matched_count
2756: times, and local_ptr is pointing to the character after the end of the
2757: last match. */
2758:
2759: if (matched_count > 0 || allow_zero)
2760: {
2761: const uschar *end_subpattern = code;
2762: int next_state_offset;
2763:
2764: do { end_subpattern += GET(end_subpattern, 1); }
2765: while (*end_subpattern == OP_ALT);
2766: next_state_offset =
2767: (int)(end_subpattern - start_code + LINK_SIZE + 1);
2768:
2769: /* Optimization: if there are no more active states, and there
2770: are no new states yet set up, then skip over the subject string
2771: right here, to save looping. Otherwise, set up the new state to swing
2772: into action when the end of the matched substring is reached. */
2773:
2774: if (i + 1 >= active_count && new_count == 0)
2775: {
2776: ptr = local_ptr;
2777: clen = 0;
2778: ADD_NEW(next_state_offset, 0);
2779: }
2780: else
2781: {
2782: const uschar *p = ptr;
2783: const uschar *pp = local_ptr;
2784: charcount = (int)(pp - p);
2785: while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2786: ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2787: }
2788: }
2789: }
2790: break;
2791:
2792: /*-----------------------------------------------------------------*/
2793: case OP_ONCE:
2794: case OP_ONCE_NC:
2795: {
2796: int local_offsets[2];
2797: int local_workspace[1000];
2798:
2799: int rc = internal_dfa_exec(
2800: md, /* fixed match data */
2801: code, /* this subexpression's code */
2802: ptr, /* where we currently are */
2803: (int)(ptr - start_subject), /* start offset */
2804: local_offsets, /* offset vector */
2805: sizeof(local_offsets)/sizeof(int), /* size of same */
2806: local_workspace, /* workspace vector */
2807: sizeof(local_workspace)/sizeof(int), /* size of same */
2808: rlevel); /* function recursion level */
2809:
2810: if (rc >= 0)
2811: {
2812: const uschar *end_subpattern = code;
2813: int charcount = local_offsets[1] - local_offsets[0];
2814: int next_state_offset, repeat_state_offset;
2815:
2816: do { end_subpattern += GET(end_subpattern, 1); }
2817: while (*end_subpattern == OP_ALT);
2818: next_state_offset =
2819: (int)(end_subpattern - start_code + LINK_SIZE + 1);
2820:
2821: /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2822: arrange for the repeat state also to be added to the relevant list.
2823: Calculate the offset, or set -1 for no repeat. */
2824:
2825: repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2826: *end_subpattern == OP_KETRMIN)?
2827: (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2828:
2829: /* If we have matched an empty string, add the next state at the
2830: current character pointer. This is important so that the duplicate
2831: checking kicks in, which is what breaks infinite loops that match an
2832: empty string. */
2833:
2834: if (charcount == 0)
2835: {
2836: ADD_ACTIVE(next_state_offset, 0);
2837: }
2838:
2839: /* Optimization: if there are no more active states, and there
2840: are no new states yet set up, then skip over the subject string
2841: right here, to save looping. Otherwise, set up the new state to swing
2842: into action when the end of the matched substring is reached. */
2843:
2844: else if (i + 1 >= active_count && new_count == 0)
2845: {
2846: ptr += charcount;
2847: clen = 0;
2848: ADD_NEW(next_state_offset, 0);
2849:
2850: /* If we are adding a repeat state at the new character position,
2851: we must fudge things so that it is the only current state.
2852: Otherwise, it might be a duplicate of one we processed before, and
2853: that would cause it to be skipped. */
2854:
2855: if (repeat_state_offset >= 0)
2856: {
2857: next_active_state = active_states;
2858: active_count = 0;
2859: i = -1;
2860: ADD_ACTIVE(repeat_state_offset, 0);
2861: }
2862: }
2863: else
2864: {
2865: const uschar *p = start_subject + local_offsets[0];
2866: const uschar *pp = start_subject + local_offsets[1];
2867: while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2868: ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2869: if (repeat_state_offset >= 0)
2870: { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2871: }
2872: }
2873: else if (rc != PCRE_ERROR_NOMATCH) return rc;
2874: }
2875: break;
2876:
2877:
2878: /* ========================================================================== */
2879: /* Handle callouts */
2880:
2881: case OP_CALLOUT:
2882: rrc = 0;
2883: if (pcre_callout != NULL)
2884: {
2885: pcre_callout_block cb;
2886: cb.version = 1; /* Version 1 of the callout block */
2887: cb.callout_number = code[1];
2888: cb.offset_vector = offsets;
2889: cb.subject = (PCRE_SPTR)start_subject;
2890: cb.subject_length = (int)(end_subject - start_subject);
2891: cb.start_match = (int)(current_subject - start_subject);
2892: cb.current_position = (int)(ptr - start_subject);
2893: cb.pattern_position = GET(code, 2);
2894: cb.next_item_length = GET(code, 2 + LINK_SIZE);
2895: cb.capture_top = 1;
2896: cb.capture_last = -1;
2897: cb.callout_data = md->callout_data;
2898: cb.mark = NULL; /* No (*MARK) support */
2899: if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2900: }
2901: if (rrc == 0)
2902: { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2903: break;
2904:
2905:
2906: /* ========================================================================== */
2907: default: /* Unsupported opcode */
2908: return PCRE_ERROR_DFA_UITEM;
2909: }
2910:
2911: NEXT_ACTIVE_STATE: continue;
2912:
2913: } /* End of loop scanning active states */
2914:
2915: /* We have finished the processing at the current subject character. If no
2916: new states have been set for the next character, we have found all the
2917: matches that we are going to find. If we are at the top level and partial
2918: matching has been requested, check for appropriate conditions.
2919:
2920: The "forced_fail" variable counts the number of (*F) encountered for the
2921: character. If it is equal to the original active_count (saved in
2922: workspace[1]) it means that (*F) was found on every active state. In this
2923: case we don't want to give a partial match.
2924:
2925: The "could_continue" variable is true if a state could have continued but
2926: for the fact that the end of the subject was reached. */
2927:
2928: if (new_count <= 0)
2929: {
2930: if (rlevel == 1 && /* Top level, and */
2931: could_continue && /* Some could go on */
2932: forced_fail != workspace[1] && /* Not all forced fail & */
2933: ( /* either... */
2934: (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2935: || /* or... */
2936: ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2937: match_count < 0) /* no matches */
2938: ) && /* And... */
2939: ptr >= end_subject && /* Reached end of subject */
2940: ptr > md->start_used_ptr) /* Inspected non-empty string */
2941: {
2942: if (offsetcount >= 2)
2943: {
2944: offsets[0] = (int)(md->start_used_ptr - start_subject);
2945: offsets[1] = (int)(end_subject - start_subject);
2946: }
2947: match_count = PCRE_ERROR_PARTIAL;
2948: }
2949:
2950: DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2951: "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2952: rlevel*2-2, SP));
2953: break; /* In effect, "return", but see the comment below */
2954: }
2955:
2956: /* One or more states are active for the next character. */
2957:
2958: ptr += clen; /* Advance to next subject character */
2959: } /* Loop to move along the subject string */
2960:
2961: /* Control gets here from "break" a few lines above. We do it this way because
2962: if we use "return" above, we have compiler trouble. Some compilers warn if
2963: there's nothing here because they think the function doesn't return a value. On
2964: the other hand, if we put a dummy statement here, some more clever compilers
2965: complain that it can't be reached. Sigh. */
2966:
2967: return match_count;
2968: }
2969:
2970:
2971:
2972:
2973: /*************************************************
2974: * Execute a Regular Expression - DFA engine *
2975: *************************************************/
2976:
2977: /* This external function applies a compiled re to a subject string using a DFA
2978: engine. This function calls the internal function multiple times if the pattern
2979: is not anchored.
2980:
2981: Arguments:
2982: argument_re points to the compiled expression
2983: extra_data points to extra data or is NULL
2984: subject points to the subject string
2985: length length of subject string (may contain binary zeros)
2986: start_offset where to start in the subject string
2987: options option bits
2988: offsets vector of match offsets
2989: offsetcount size of same
2990: workspace workspace vector
2991: wscount size of same
2992:
2993: Returns: > 0 => number of match offset pairs placed in offsets
2994: = 0 => offsets overflowed; longest matches are present
2995: -1 => failed to match
2996: < -1 => some kind of unexpected problem
2997: */
2998:
2999: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3000: pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3001: const char *subject, int length, int start_offset, int options, int *offsets,
3002: int offsetcount, int *workspace, int wscount)
3003: { /* Validate the arguments, set up the match block, then call internal_dfa_exec() at successive start positions until it returns something other than PCRE_ERROR_NOMATCH or the match is anchored. */
3004: real_pcre *re = (real_pcre *)argument_re; /* re-pointed at the result of _pcre_try_flipped() if the magic number does not match */
3005: dfa_match_data match_block;
3006: dfa_match_data *md = &match_block;
3007: BOOL utf8, anchored, startline, firstline;
3008: const uschar *current_subject, *end_subject, *lcc;
3009:
3010: pcre_study_data internal_study;
3011: const pcre_study_data *study = NULL;
3012: real_pcre internal_re;
3013:
3014: const uschar *req_byte_ptr;
3015: const uschar *start_bits = NULL;
3016: BOOL first_byte_caseless = FALSE;
3017: BOOL req_byte_caseless = FALSE;
3018: int first_byte = -1; /* -1 => no known first byte */
3019: int req_byte = -1; /* -1 => no known required byte */
3020: int req_byte2 = -1; /* req_byte as mapped through the fcc table */
3021: int newline; /* fixed newline value, or -1 for ANY, -2 for ANYCRLF */
3022:
3023: /* Plausibility checks */
3024:
3025: if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3026: if (re == NULL || subject == NULL || workspace == NULL ||
3027: (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3028: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3029: if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3030: if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3031:
3032: /* We need to find the pointer to any study data before we test for byte
3033: flipping, so we scan the extra_data block first. This may set two fields in the
3034: match block, so we must initialize them beforehand. However, the other fields
3035: in the match block must not be set until after the byte flipping. */
3036:
3037: md->tables = re->tables;
3038: md->callout_data = NULL;
3039:
3040: if (extra_data != NULL)
3041: {
3042: unsigned int flags = extra_data->flags;
3043: if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3044: study = (const pcre_study_data *)extra_data->study_data;
3045: if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT; /* match limits are rejected here */
3046: if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3047: return PCRE_ERROR_DFA_UMLIMIT;
3048: if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3049: md->callout_data = extra_data->callout_data;
3050: if ((flags & PCRE_EXTRA_TABLES) != 0)
3051: md->tables = extra_data->tables;
3052: }
3053:
3054: /* Check that the first field in the block is the magic number. If it is not,
3055: test for a regex that was compiled on a host of opposite endianness. If this is
3056: the case, flipped values are put in internal_re and internal_study if there was
3057: study data too. */
3058:
3059: if (re->magic_number != MAGIC_NUMBER)
3060: {
3061: re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3062: if (re == NULL) return PCRE_ERROR_BADMAGIC;
3063: if (study != NULL) study = &internal_study;
3064: }
3065:
3066: /* Set some local values */
3067:
3068: current_subject = (const unsigned char *)subject + start_offset;
3069: end_subject = (const unsigned char *)subject + length;
3070: req_byte_ptr = current_subject - 1; /* one before the first start position, so the p > req_byte_ptr test below succeeds first time; only compared and assigned in this function, never dereferenced */
3071:
3072: #ifdef SUPPORT_UTF8
3073: utf8 = (re->options & PCRE_UTF8) != 0;
3074: #else
3075: utf8 = FALSE;
3076: #endif
3077:
3078: anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3079: (re->options & PCRE_ANCHORED) != 0;
3080:
3081: /* The remaining fixed data for passing around. */
3082:
3083: md->start_code = (const uschar *)argument_re +
3084: re->name_table_offset + re->name_count * re->name_entry_size; /* base is the caller's block (argument_re); the offsets are read through re */
3085: md->start_subject = (const unsigned char *)subject;
3086: md->end_subject = end_subject;
3087: md->start_offset = start_offset;
3088: md->moptions = options;
3089: md->poptions = re->options;
3090:
3091: /* If the BSR option is not set at match time, copy what was set
3092: at compile time. */
3093:
3094: if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3095: {
3096: if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3097: md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3098: #ifdef BSR_ANYCRLF
3099: else md->moptions |= PCRE_BSR_ANYCRLF;
3100: #endif
3101: }
3102:
3103: /* Handle different types of newline. The three bits give eight cases. If
3104: nothing is set at run time, whatever was used at compile time applies. */
3105:
3106: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3107: PCRE_NEWLINE_BITS)
3108: {
3109: case 0: newline = NEWLINE; break; /* Compile-time default */
3110: case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3111: case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3112: case PCRE_NEWLINE_CR+
3113: PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3114: case PCRE_NEWLINE_ANY: newline = -1; break;
3115: case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3116: default: return PCRE_ERROR_BADNEWLINE;
3117: }
3118:
3119: if (newline == -2)
3120: {
3121: md->nltype = NLTYPE_ANYCRLF;
3122: }
3123: else if (newline < 0)
3124: {
3125: md->nltype = NLTYPE_ANY;
3126: }
3127: else
3128: {
3129: md->nltype = NLTYPE_FIXED;
3130: if (newline > 255)
3131: {
3132: md->nllen = 2;
3133: md->nl[0] = (newline >> 8) & 255;
3134: md->nl[1] = newline & 255;
3135: }
3136: else
3137: {
3138: md->nllen = 1;
3139: md->nl[0] = newline;
3140: }
3141: }
3142:
3143: /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3144: back the character offset. */
3145:
3146: #ifdef SUPPORT_UTF8
3147: if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3148: {
3149: int erroroffset;
3150: int errorcode = _pcre_valid_utf8((uschar *)subject, length, &erroroffset);
3151: if (errorcode != 0)
3152: {
3153: if (offsetcount >= 2)
3154: {
3155: offsets[0] = erroroffset;
3156: offsets[1] = errorcode;
3157: }
3158: return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3159: PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3160: }
3161: if (start_offset > 0 && start_offset < length &&
3162: (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
3163: return PCRE_ERROR_BADUTF8_OFFSET;
3164: }
3165: #endif
3166:
3167: /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3168: is a feature that makes it possible to save compiled regex and re-use them
3169: in other programs later. */
3170:
3171: if (md->tables == NULL) md->tables = _pcre_default_tables;
3172:
3173: /* The lower casing table and the "must be at the start of a line" flag are
3174: used in a loop when finding where to start. */
3175:
3176: lcc = md->tables + lcc_offset;
3177: startline = (re->flags & PCRE_STARTLINE) != 0;
3178: firstline = (re->options & PCRE_FIRSTLINE) != 0;
3179:
3180: /* Set up the first character to match, if available. The first_byte value is
3181: never set for an anchored regular expression, but the anchoring may be forced
3182: at run time, so we have to test for anchoring. The first char may be unset for
3183: an unanchored pattern, of course. If there's no first char and the pattern was
3184: studied, there may be a bitmap of possible first characters. */
3185:
3186: if (!anchored)
3187: {
3188: if ((re->flags & PCRE_FIRSTSET) != 0)
3189: {
3190: first_byte = re->first_byte & 255;
3191: if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3192: first_byte = lcc[first_byte];
3193: }
3194: else
3195: {
3196: if (!startline && study != NULL &&
3197: (study->flags & PCRE_STUDY_MAPPED) != 0)
3198: start_bits = study->start_bits;
3199: }
3200: }
3201:
3202: /* For anchored or unanchored matches, there may be a "last known required
3203: character" set. */
3204:
3205: if ((re->flags & PCRE_REQCHSET) != 0)
3206: {
3207: req_byte = re->req_byte & 255;
3208: req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3209: req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
3210: }
3211:
3212: /* Call the main matching function, looping for a non-anchored regex after a
3213: failed match. If not restarting, perform certain optimizations at the start of
3214: a match. */
3215:
3216: for (;;)
3217: {
3218: int rc;
3219:
3220: if ((options & PCRE_DFA_RESTART) == 0)
3221: {
3222: const uschar *save_end_subject = end_subject;
3223:
3224: /* If firstline is TRUE, the start of the match is constrained to the first
3225: line of a multiline string. Implement this by temporarily adjusting
3226: end_subject so that we stop scanning at a newline. If the match fails at
3227: the newline, later code breaks this loop. */
3228:
3229: if (firstline)
3230: {
3231: USPTR t = current_subject;
3232: #ifdef SUPPORT_UTF8
3233: if (utf8)
3234: {
3235: while (t < md->end_subject && !IS_NEWLINE(t))
3236: {
3237: t++;
3238: while (t < end_subject && (*t & 0xc0) == 0x80) t++;
3239: }
3240: }
3241: else
3242: #endif
3243: while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3244: end_subject = t;
3245: }
3246:
3247: /* There are some optimizations that avoid running the match if a known
3248: starting point is not found. However, there is an option that disables
3249: these, for testing and for ensuring that all callouts do actually occur.
3250: The option can be set in the regex by (*NO_START_OPT) or passed in
3251: match-time options. */
3252:
3253: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3254: {
3255: /* Advance to a known first byte. */
3256:
3257: if (first_byte >= 0)
3258: {
3259: if (first_byte_caseless)
3260: while (current_subject < end_subject &&
3261: lcc[*current_subject] != first_byte)
3262: current_subject++;
3263: else
3264: while (current_subject < end_subject &&
3265: *current_subject != first_byte)
3266: current_subject++;
3267: }
3268:
3269: /* Or to just after a linebreak for a multiline match if possible */
3270:
3271: else if (startline)
3272: {
3273: if (current_subject > md->start_subject + start_offset) /* skipped on the first attempt; also guarantees current_subject[-1] below lies inside the subject */
3274: {
3275: #ifdef SUPPORT_UTF8
3276: if (utf8)
3277: {
3278: while (current_subject < end_subject &&
3279: !WAS_NEWLINE(current_subject))
3280: {
3281: current_subject++;
3282: while(current_subject < end_subject &&
3283: (*current_subject & 0xc0) == 0x80)
3284: current_subject++;
3285: }
3286: }
3287: else
3288: #endif
3289: while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3290: current_subject++;
3291:
3292: /* If we have just passed a CR and the newline option is ANY or
3293: ANYCRLF, and we are now at a LF, advance the match position by one
3294: more character. */
3295:
3296: if (current_subject[-1] == CHAR_CR &&
3297: (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3298: current_subject < end_subject &&
3299: *current_subject == CHAR_NL)
3300: current_subject++;
3301: }
3302: }
3303:
3304: /* Or to a non-unique first char after study */
3305:
3306: else if (start_bits != NULL)
3307: {
3308: while (current_subject < end_subject)
3309: {
3310: register unsigned int c = *current_subject;
3311: if ((start_bits[c/8] & (1 << (c&7))) == 0) /* bit c of the bitmap is clear: this byte cannot start a match */
3312: {
3313: current_subject++;
3314: #ifdef SUPPORT_UTF8
3315: if (utf8)
3316: while(current_subject < end_subject &&
3317: (*current_subject & 0xc0) == 0x80) current_subject++;
3318: #endif
3319: }
3320: else break;
3321: }
3322: }
3323: }
3324:
3325: /* Restore fudged end_subject */
3326:
3327: end_subject = save_end_subject;
3328:
3329: /* The following two optimizations are disabled for partial matching or if
3330: disabling is explicitly requested (and of course, by the test above, this
3331: code is not obeyed when restarting after a partial match). */
3332:
3333: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3334: (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3335: {
3336: /* If the pattern was studied, a minimum subject length may be set. This
3337: is a lower bound; no actual string of that length may actually match the
3338: pattern. Although the value is, strictly, in characters, we treat it as
3339: bytes to avoid spending too much time in this optimization. */
3340:
3341: if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3342: (pcre_uint32)(end_subject - current_subject) < study->minlength)
3343: return PCRE_ERROR_NOMATCH;
3344:
3345: /* If req_byte is set, we know that that character must appear in the
3346: subject for the match to succeed. If the first character is set, req_byte
3347: must be later in the subject; otherwise the test starts at the match
3348: point. This optimization can save a huge amount of work in patterns with
3349: nested unlimited repeats that aren't going to match. Writing separate
3350: code for cased/caseless versions makes it go faster, as does using an
3351: autoincrement and backing off on a match.
3352:
3353: HOWEVER: when the subject string is very, very long, searching to its end
3354: can take a long time, and give bad performance on quite ordinary
3355: patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3356: string... so we don't do this when the string is sufficiently long. */
3357:
3358: if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
3359: {
3360: register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
3361:
3362: /* We don't need to repeat the search if we haven't yet reached the
3363: place we found it at last time. */
3364:
3365: if (p > req_byte_ptr)
3366: {
3367: if (req_byte_caseless)
3368: {
3369: while (p < end_subject)
3370: {
3371: register int pp = *p++;
3372: if (pp == req_byte || pp == req_byte2) { p--; break; }
3373: }
3374: }
3375: else
3376: {
3377: while (p < end_subject)
3378: {
3379: if (*p++ == req_byte) { p--; break; }
3380: }
3381: }
3382:
3383: /* If we can't find the required character, break the matching loop,
3384: which will cause a return of PCRE_ERROR_NOMATCH. */
3385:
3386: if (p >= end_subject) break;
3387:
3388: /* If we have found the required character, save the point where we
3389: found it, so that we don't search again next time round the loop if
3390: the start hasn't passed this character yet. */
3391:
3392: req_byte_ptr = p;
3393: }
3394: }
3395: }
3396: } /* End of optimizations that are done when not restarting */
3397:
3398: /* OK, now we can do the business */
3399:
3400: md->start_used_ptr = current_subject;
3401: md->recursive = NULL;
3402:
3403: rc = internal_dfa_exec(
3404: md, /* fixed match data */
3405: md->start_code, /* this subexpression's code */
3406: current_subject, /* where we currently are */
3407: start_offset, /* start offset in subject */
3408: offsets, /* offset vector */
3409: offsetcount, /* size of same */
3410: workspace, /* workspace vector */
3411: wscount, /* size of same */
3412: 0); /* function recurse level */
3413:
3414: /* Anything other than "no match" means we are done, always; otherwise, carry
3415: on only if not anchored. */
3416:
3417: if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3418:
3419: /* Advance to the next subject character unless we are at the end of a line
3420: and firstline is set. */
3421:
3422: if (firstline && IS_NEWLINE(current_subject)) break;
3423: current_subject++;
3424: if (utf8)
3425: {
3426: while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
3427: current_subject++;
3428: }
3429: if (current_subject > end_subject) break; /* > rather than >=: a start position at end_subject itself is still tried */
3430:
3431: /* If we have just passed a CR and we are now at a LF, and the pattern does
3432: not contain any explicit matches for \r or \n, and the newline option is CRLF
3433: or ANY or ANYCRLF, advance the match position by one more character. */
3434:
3435: if (current_subject[-1] == CHAR_CR &&
3436: current_subject < end_subject &&
3437: *current_subject == CHAR_NL &&
3438: (re->flags & PCRE_HASCRORLF) == 0 &&
3439: (md->nltype == NLTYPE_ANY ||
3440: md->nltype == NLTYPE_ANYCRLF ||
3441: md->nllen == 2))
3442: current_subject++;
3443:
3444: } /* "Bumpalong" loop */
3445:
3446: return PCRE_ERROR_NOMATCH;
3447: }
3448:
3449: /* End of pcre_dfa_exec.c */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>