Annotation of embedaddon/pcre/pcre_dfa_exec.c, revision 1.1.1.3
1.1 misho 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /* PCRE is a library of functions to support regular expressions whose syntax
6: and semantics are as close as possible to those of the Perl 5 language (but see
7: below for why this module is different).
8:
9: Written by Philip Hazel
1.1.1.2 misho 10: Copyright (c) 1997-2012 University of Cambridge
1.1 misho 11:
12: -----------------------------------------------------------------------------
13: Redistribution and use in source and binary forms, with or without
14: modification, are permitted provided that the following conditions are met:
15:
16: * Redistributions of source code must retain the above copyright notice,
17: this list of conditions and the following disclaimer.
18:
19: * Redistributions in binary form must reproduce the above copyright
20: notice, this list of conditions and the following disclaimer in the
21: documentation and/or other materials provided with the distribution.
22:
23: * Neither the name of the University of Cambridge nor the names of its
24: contributors may be used to endorse or promote products derived from
25: this software without specific prior written permission.
26:
27: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37: POSSIBILITY OF SUCH DAMAGE.
38: -----------------------------------------------------------------------------
39: */
40:
41: /* This module contains the external function pcre_dfa_exec(), which is an
42: alternative matching function that uses a sort of DFA algorithm (not a true
1.1.1.3 ! misho 43: FSM). This is NOT Perl-compatible, but it has advantages in certain
1.1 misho 44: applications. */
45:
46:
47: /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
48: the performance of his patterns greatly. I could not use it as it stood, as it
49: was not thread safe, and made assumptions about pattern sizes. Also, it caused
50: test 7 to loop, and test 9 to crash with a segfault.
51:
52: The issue is the check for duplicate states, which is done by a simple linear
53: search up the state list. (Grep for "duplicate" below to find the code.) For
54: many patterns, there will never be many states active at one time, so a simple
55: linear search is fine. In patterns that have many active states, it might be a
56: bottleneck. The suggested code used an indexing scheme to remember which states
57: had previously been used for each character, and avoided the linear search when
58: it knew there was no chance of a duplicate. This was implemented when adding
59: states to the state lists.
60:
61: I wrote some thread-safe, not-limited code to try something similar at the time
62: of checking for duplicates (instead of when adding states), using index vectors
63: on the stack. It did give a 13% improvement with one specially constructed
64: pattern for certain subject strings, but on other strings and on many of the
65: simpler patterns in the test suite it did worse. The major problem, I think,
66: was the extra time to initialize the index. This had to be done for each call
67: of internal_dfa_exec(). (The supplied patch used a static vector, initialized
68: only once - I suspect this was the cause of the problems with the tests.)
69:
70: Overall, I concluded that the gains in some cases did not outweigh the losses
71: in others, so I abandoned this code. */
72:
73:
74:
75: #ifdef HAVE_CONFIG_H
76: #include "config.h"
77: #endif
78:
79: #define NLBLOCK md /* Block containing newline information */
80: #define PSSTART start_subject /* Field containing processed string start */
81: #define PSEND end_subject /* Field containing processed string end */
82:
83: #include "pcre_internal.h"
84:
85:
86: /* For use to indent debugging output */
87:
88: #define SP " "
89:
90:
91: /*************************************************
92: * Code parameters and static tables *
93: *************************************************/
94:
95: /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
96: into others, under special conditions. A gap of 20 between the blocks should be
97: enough. The resulting opcodes don't have to be less than 256 because they are
98: never stored, so we push them well clear of the normal opcodes. */
99:
100: #define OP_PROP_EXTRA 300
101: #define OP_EXTUNI_EXTRA 320
102: #define OP_ANYNL_EXTRA 340
103: #define OP_HSPACE_EXTRA 360
104: #define OP_VSPACE_EXTRA 380
105:
106:
107: /* This table identifies those opcodes that are followed immediately by a
108: character that is to be tested in some way. This makes it possible to
109: centralize the loading of these characters. In the case of Type * etc, the
110: "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111: small value. Non-zero values in the table are the offsets from the opcode where
112: the character is to be found. ***NOTE*** If the start of this table is
113: modified, the three tables that follow must also be modified.

The table is indexed by opcode value: internal_dfa_exec() reads
coptable[codevalue] and, when the entry is non-zero, loads the item found that
many elements after the opcode. An entry of 1 means the item directly follows
the opcode; an entry of 1+IMM2_SIZE means IMM2_SIZE elements come first
(presumably the repeat count of the upto/minupto/exact forms - to be confirmed
against the pattern compiler). A zero entry means there is nothing to load.
A dummy case label in internal_dfa_exec() is intended to make compilation fail
unless sizeof(coptable) equals OP_TABLE_LENGTH. */
114:
1.1.1.2 misho 115: static const pcre_uint8 coptable[] = {
1.1 misho 116: 0, /* End */
117: 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
118: 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
119: 0, 0, 0, /* Any, AllAny, Anybyte */
120: 0, 0, /* \P, \p */
121: 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
122: 0, /* \X */
123: 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
124: 1, /* Char */
125: 1, /* Chari */
126: 1, /* not */
127: 1, /* noti */
128: /* Positive single-char repeats */
129: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
1.1.1.2 misho 130: 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
131: 1+IMM2_SIZE, /* exact */
132: 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
1.1 misho 133: 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
1.1.1.2 misho 134: 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
135: 1+IMM2_SIZE, /* exact I */
136: 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
1.1 misho 137: /* Negative single-char repeats - only for chars < 256 */
138: 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
1.1.1.2 misho 139: 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
140: 1+IMM2_SIZE, /* NOT exact */
141: 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
1.1 misho 142: 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
1.1.1.2 misho 143: 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
144: 1+IMM2_SIZE, /* NOT exact I */
145: 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
1.1 misho 146: /* Positive type repeats */
147: 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
1.1.1.2 misho 148: 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
149: 1+IMM2_SIZE, /* Type exact */
150: 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
1.1 misho 151: /* Character class & ref repeats */
152: 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
153: 0, 0, /* CRRANGE, CRMINRANGE */
154: 0, /* CLASS */
155: 0, /* NCLASS */
156: 0, /* XCLASS - variable length */
157: 0, /* REF */
158: 0, /* REFI */
159: 0, /* RECURSE */
160: 0, /* CALLOUT */
161: 0, /* Alt */
162: 0, /* Ket */
163: 0, /* KetRmax */
164: 0, /* KetRmin */
165: 0, /* KetRpos */
166: 0, /* Reverse */
167: 0, /* Assert */
168: 0, /* Assert not */
169: 0, /* Assert behind */
170: 0, /* Assert behind not */
171: 0, 0, /* ONCE, ONCE_NC */
172: 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
173: 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
174: 0, 0, /* CREF, NCREF */
175: 0, 0, /* RREF, NRREF */
176: 0, /* DEF */
177: 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
178: 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
179: 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
180: 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
181: 0, 0 /* CLOSE, SKIPZERO */
182: };
183:
184: /* This table identifies those opcodes that inspect a character. It is used to
185: remember the fact that a character could have been inspected when the end of
186: the subject is reached. ***NOTE*** If the start of this table is modified, the
187: two tables that follow must also be modified.

The table is indexed by opcode value. In internal_dfa_exec(), when the subject
is exhausted (clen == 0) and the entry for the current opcode is non-zero, the
could_continue flag is set. A dummy case label in internal_dfa_exec() is
intended to make compilation fail unless sizeof(poptable) equals
OP_TABLE_LENGTH. */
188:
1.1.1.2 misho 189: static const pcre_uint8 poptable[] = {
1.1 misho 190: 0, /* End */
191: 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
192: 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
193: 1, 1, 1, /* Any, AllAny, Anybyte */
194: 1, 1, /* \P, \p */
195: 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
196: 1, /* \X */
197: 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
198: 1, /* Char */
199: 1, /* Chari */
200: 1, /* not */
201: 1, /* noti */
202: /* Positive single-char repeats */
203: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
204: 1, 1, 1, /* upto, minupto, exact */
205: 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
206: 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
207: 1, 1, 1, /* upto I, minupto I, exact I */
208: 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
209: /* Negative single-char repeats - only for chars < 256 */
210: 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
211: 1, 1, 1, /* NOT upto, minupto, exact */
212: 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
213: 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
214: 1, 1, 1, /* NOT upto I, minupto I, exact I */
215: 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
216: /* Positive type repeats */
217: 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
218: 1, 1, 1, /* Type upto, minupto, exact */
219: 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
220: /* Character class & ref repeats */
221: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
222: 1, 1, /* CRRANGE, CRMINRANGE */
223: 1, /* CLASS */
224: 1, /* NCLASS */
225: 1, /* XCLASS - variable length */
226: 0, /* REF */
227: 0, /* REFI */
228: 0, /* RECURSE */
229: 0, /* CALLOUT */
230: 0, /* Alt */
231: 0, /* Ket */
232: 0, /* KetRmax */
233: 0, /* KetRmin */
234: 0, /* KetRpos */
235: 0, /* Reverse */
236: 0, /* Assert */
237: 0, /* Assert not */
238: 0, /* Assert behind */
239: 0, /* Assert behind not */
240: 0, 0, /* ONCE, ONCE_NC */
241: 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
242: 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
243: 0, 0, /* CREF, NCREF */
244: 0, 0, /* RREF, NRREF */
245: 0, /* DEF */
246: 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
247: 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
248: 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
249: 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
250: 0, 0 /* CLOSE, SKIPZERO */
251: };
252:
253: /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
254: and \w */
255:
1.1.1.2 misho 256: static const pcre_uint8 toptable1[] = {
1.1 misho 257: 0, 0, 0, 0, 0, 0,
258: ctype_digit, ctype_digit,
259: ctype_space, ctype_space,
260: ctype_word, ctype_word,
261: 0, 0 /* OP_ANY, OP_ALLANY */
262: };
263:
1.1.1.2 misho 264: static const pcre_uint8 toptable2[] = {
1.1 misho 265: 0, 0, 0, 0, 0, 0,
266: ctype_digit, 0,
267: ctype_space, 0,
268: ctype_word, 0,
269: 1, 1 /* OP_ANY, OP_ALLANY */
270: };
271:
272:
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode, relative to the start of
                                     the compiled code; a negative value marks
                                     a state that is being held off */
  int count;                      /* Count for repeats */
  int data;                       /* Extra data used by some states; for a
                                     held-off state it is the number of
                                     characters still to be skipped */
} stateblock;

/* Number of ints occupied by one stateblock, used when carving the int
workspace vector into state lists. The replacement list is fully parenthesized
so that the macro behaves as a single operand wherever it is expanded. */

#define INTS_PER_STATEBLOCK ((int)(sizeof(stateblock)/sizeof(int)))
1.1 misho 285:
286:
#ifdef PCRE_DEBUG
/*************************************************
*        Print character string                  *
*************************************************/

/* Character string printing function for debugging. Each element is written
either as itself, when it is a printable character, or as a \x escape in
hexadecimal.

isprint() is only defined for arguments representable as an unsigned char (or
EOF), so values outside 0..255 are sent straight to the escape branch instead
of being passed to it. NOTE(review): this matters only if pcre_uchar can hold
values above 255; its definition is not visible here.

Arguments:
  p           points to string
  length      number of elements to print
  f           where to print

Returns:      nothing
*/

static void
pchars(const pcre_uchar *p, int length, FILE *f)
{
int c;
while (length-- > 0)
  {
  c = *(p++);
  if (c >= 0 && c <= 0xff && isprint(c))
    fprintf(f, "%c", c);
  else
    fprintf(f, "\\x%02x", c);
  }
}
#endif
315:
316:
317:
318: /*************************************************
319: * Execute a Regular Expression - DFA engine *
320: *************************************************/
321:
322: /* This internal function applies a compiled pattern to a subject string,
323: starting at a given point, using a DFA engine. This function is called from the
324: external one, possibly multiple times if the pattern is not anchored. The
325: function calls itself recursively for some kinds of subpattern.
326:
327: Arguments:
328: md the match_data block with fixed information
329: this_start_code the opening bracket of this subexpression's code
330: current_subject where we currently are in the subject string
331: start_offset start offset in the subject string
332: offsets vector to contain the matching string offsets
333: offsetcount size of same
334: workspace vector of workspace
335: wscount size of same
336: rlevel function call recursion level
337:
338: Returns: > 0 => number of match offset pairs placed in offsets
339: = 0 => offsets overflowed; longest matches are present
340: -1 => failed to match
341: < -1 => some kind of unexpected problem
342:
343: The following macros are used for adding states to the two state vectors (one
344: for the current character, one for the following character). */
345:
/* Append a state (offset x, count y) to the list being scanned for the current
character. The data field of the slot is left untouched. If the list already
holds wscount states, the enclosing function returns PCRE_ERROR_DFA_WSSIZE. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
355:
/* As ADD_ACTIVE, but also stores z in the data field of the new state. If the
list already holds wscount states, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->data = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
366:
/* Append a state (offset x, count y) to the list that will be scanned for the
following character. The data field of the slot is left untouched. If the list
already holds wscount states, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
376:
/* As ADD_NEW, but also stores z in the data field of the new state; the debug
trace additionally reports the source line of the expansion. If the list
already holds wscount states, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->data = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
      (x), (y), (z), __LINE__)); \
    } \
  while (0)
388:
389: /* And now, here is the code */
390:
391: static int
392: internal_dfa_exec(
393: dfa_match_data *md,
1.1.1.2 misho 394: const pcre_uchar *this_start_code,
395: const pcre_uchar *current_subject,
1.1 misho 396: int start_offset,
397: int *offsets,
398: int offsetcount,
399: int *workspace,
400: int wscount,
401: int rlevel)
402: {
403: stateblock *active_states, *new_states, *temp_states;
404: stateblock *next_active_state, *next_new_state;
405:
1.1.1.2 misho 406: const pcre_uint8 *ctypes, *lcc, *fcc;
407: const pcre_uchar *ptr;
408: const pcre_uchar *end_code, *first_op;
1.1 misho 409:
410: dfa_recursion_info new_recursive;
411:
412: int active_count, new_count, match_count;
413:
414: /* Some fields in the md block are frequently referenced, so we load them into
415: independent variables in the hope that this will perform better. */
416:
1.1.1.2 misho 417: const pcre_uchar *start_subject = md->start_subject;
418: const pcre_uchar *end_subject = md->end_subject;
419: const pcre_uchar *start_code = md->start_code;
1.1 misho 420:
1.1.1.2 misho 421: #ifdef SUPPORT_UTF
422: BOOL utf = (md->poptions & PCRE_UTF8) != 0;
1.1 misho 423: #else
1.1.1.2 misho 424: BOOL utf = FALSE;
1.1 misho 425: #endif
426:
1.1.1.3 ! misho 427: BOOL reset_could_continue = FALSE;
! 428:
1.1 misho 429: rlevel++;
430: offsetcount &= (-2);
431:
432: wscount -= 2;
433: wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
434: (2 * INTS_PER_STATEBLOCK);
435:
436: DPRINTF(("\n%.*s---------------------\n"
437: "%.*sCall to internal_dfa_exec f=%d\n",
438: rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
439:
440: ctypes = md->tables + ctypes_offset;
441: lcc = md->tables + lcc_offset;
442: fcc = md->tables + fcc_offset;
443:
444: match_count = PCRE_ERROR_NOMATCH; /* A negative number */
445:
446: active_states = (stateblock *)(workspace + 2);
447: next_new_state = new_states = active_states + wscount;
448: new_count = 0;
449:
450: first_op = this_start_code + 1 + LINK_SIZE +
451: ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
1.1.1.2 misho 452: *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
453: ? IMM2_SIZE:0);
1.1 misho 454:
455: /* The first thing in any (sub) pattern is a bracket of some sort. Push all
456: the alternative states onto the list, and find out where the end is. This
457: makes it possible to use this function recursively, when we want to stop at a
458: matching internal ket rather than at the end.
459:
460: If the first opcode in the first alternative is OP_REVERSE, we are dealing with
461: a backward assertion. In that case, we have to find out the maximum amount to
462: move back, and set up each alternative appropriately. */
463:
464: if (*first_op == OP_REVERSE)
465: {
466: int max_back = 0;
467: int gone_back;
468:
469: end_code = this_start_code;
470: do
471: {
472: int back = GET(end_code, 2+LINK_SIZE);
473: if (back > max_back) max_back = back;
474: end_code += GET(end_code, 1);
475: }
476: while (*end_code == OP_ALT);
477:
478: /* If we can't go back the amount required for the longest lookbehind
479: pattern, go back as far as we can; some alternatives may still be viable. */
480:
1.1.1.2 misho 481: #ifdef SUPPORT_UTF
1.1 misho 482: /* In character mode we have to step back character by character */
483:
1.1.1.2 misho 484: if (utf)
1.1 misho 485: {
486: for (gone_back = 0; gone_back < max_back; gone_back++)
487: {
488: if (current_subject <= start_subject) break;
489: current_subject--;
1.1.1.2 misho 490: ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
1.1 misho 491: }
492: }
493: else
494: #endif
495:
496: /* In byte-mode we can do this quickly. */
497:
498: {
499: gone_back = (current_subject - max_back < start_subject)?
500: (int)(current_subject - start_subject) : max_back;
501: current_subject -= gone_back;
502: }
503:
504: /* Save the earliest consulted character */
505:
506: if (current_subject < md->start_used_ptr)
507: md->start_used_ptr = current_subject;
508:
509: /* Now we can process the individual branches. */
510:
511: end_code = this_start_code;
512: do
513: {
514: int back = GET(end_code, 2+LINK_SIZE);
515: if (back <= gone_back)
516: {
517: int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
518: ADD_NEW_DATA(-bstate, 0, gone_back - back);
519: }
520: end_code += GET(end_code, 1);
521: }
522: while (*end_code == OP_ALT);
523: }
524:
525: /* This is the code for a "normal" subpattern (not a backward assertion). The
526: start of a whole pattern is always one of these. If we are at the top level,
527: we may be asked to restart matching from the same point that we reached for a
528: previous partial match. We still have to scan through the top-level branches to
529: find the end state. */
530:
531: else
532: {
533: end_code = this_start_code;
534:
535: /* Restarting */
536:
537: if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
538: {
539: do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
540: new_count = workspace[1];
541: if (!workspace[0])
542: memcpy(new_states, active_states, new_count * sizeof(stateblock));
543: }
544:
545: /* Not restarting */
546:
547: else
548: {
549: int length = 1 + LINK_SIZE +
550: ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
1.1.1.2 misho 551: *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
552: ? IMM2_SIZE:0);
1.1 misho 553: do
554: {
555: ADD_NEW((int)(end_code - start_code + length), 0);
556: end_code += GET(end_code, 1);
557: length = 1 + LINK_SIZE;
558: }
559: while (*end_code == OP_ALT);
560: }
561: }
562:
563: workspace[0] = 0; /* Bit indicating which vector is current */
564:
1.1.1.2 misho 565: DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
1.1 misho 566:
567: /* Loop for scanning the subject */
568:
569: ptr = current_subject;
570: for (;;)
571: {
572: int i, j;
573: int clen, dlen;
574: unsigned int c, d;
575: int forced_fail = 0;
1.1.1.3 ! misho 576: BOOL partial_newline = FALSE;
! 577: BOOL could_continue = reset_could_continue;
! 578: reset_could_continue = FALSE;
1.1 misho 579:
580: /* Make the new state list into the active state list and empty the
581: new state list. */
582:
583: temp_states = active_states;
584: active_states = new_states;
585: new_states = temp_states;
586: active_count = new_count;
587: new_count = 0;
588:
589: workspace[0] ^= 1; /* Remember for the restarting feature */
590: workspace[1] = active_count;
591:
592: #ifdef PCRE_DEBUG
593: printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
1.1.1.2 misho 594: pchars(ptr, STRLEN_UC(ptr), stdout);
1.1 misho 595: printf("\"\n");
596:
597: printf("%.*sActive states: ", rlevel*2-2, SP);
598: for (i = 0; i < active_count; i++)
599: printf("%d/%d ", active_states[i].offset, active_states[i].count);
600: printf("\n");
601: #endif
602:
603: /* Set the pointers for adding new states */
604:
605: next_active_state = active_states + active_count;
606: next_new_state = new_states;
607:
608: /* Load the current character from the subject outside the loop, as many
609: different states may want to look at it, and we assume that at least one
610: will. */
611:
612: if (ptr < end_subject)
613: {
1.1.1.3 ! misho 614: clen = 1; /* Number of data items in the character */
1.1.1.2 misho 615: #ifdef SUPPORT_UTF
616: if (utf) { GETCHARLEN(c, ptr, clen); } else
617: #endif /* SUPPORT_UTF */
1.1 misho 618: c = *ptr;
619: }
620: else
621: {
622: clen = 0; /* This indicates the end of the subject */
623: c = NOTACHAR; /* This value should never actually be used */
624: }
625:
626: /* Scan up the active states and act on each one. The result of an action
627: may be to add more states to the currently active list (e.g. on hitting a
628: parenthesis) or it may be to put states on the new list, for considering
629: when we move the character pointer on. */
630:
631: for (i = 0; i < active_count; i++)
632: {
633: stateblock *current_state = active_states + i;
634: BOOL caseless = FALSE;
1.1.1.2 misho 635: const pcre_uchar *code;
1.1 misho 636: int state_offset = current_state->offset;
637: int count, codevalue, rrc;
638:
639: #ifdef PCRE_DEBUG
640: printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
641: if (clen == 0) printf("EOL\n");
642: else if (c > 32 && c < 127) printf("'%c'\n", c);
643: else printf("0x%02x\n", c);
644: #endif
645:
646: /* A negative offset is a special case meaning "hold off going to this
647: (negated) state until the number of characters in the data field have
1.1.1.3 ! misho 648: been skipped". If the could_continue flag was passed over from a previous
! 649: state, arrange for it to be passed on. */
1.1 misho 650:
651: if (state_offset < 0)
652: {
653: if (current_state->data > 0)
654: {
655: DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
656: ADD_NEW_DATA(state_offset, current_state->count,
657: current_state->data - 1);
1.1.1.3 ! misho 658: if (could_continue) reset_could_continue = TRUE;
1.1 misho 659: continue;
660: }
661: else
662: {
663: current_state->offset = state_offset = -state_offset;
664: }
665: }
666:
667: /* Check for a duplicate state with the same count, and skip if found.
668: See the note at the head of this module about the possibility of improving
669: performance here. */
670:
671: for (j = 0; j < i; j++)
672: {
673: if (active_states[j].offset == state_offset &&
674: active_states[j].count == current_state->count)
675: {
676: DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
677: goto NEXT_ACTIVE_STATE;
678: }
679: }
680:
681: /* The state offset is the offset to the opcode */
682:
683: code = start_code + state_offset;
684: codevalue = *code;
685:
686: /* If this opcode inspects a character, but we are at the end of the
687: subject, remember the fact for use when testing for a partial match. */
688:
689: if (clen == 0 && poptable[codevalue] != 0)
690: could_continue = TRUE;
691:
692: /* If this opcode is followed by an inline character, load it. It is
693: tempting to test for the presence of a subject character here, but that
694: is wrong, because sometimes zero repetitions of the subject are
695: permitted.
696:
697: We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
1.1.1.3 ! misho 698: argument that is not a data character - but is always one byte long because
! 699: the values are small. We have to take special action to deal with \P, \p,
! 700: \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
! 701: these ones to new opcodes. */
1.1 misho 702:
703: if (coptable[codevalue] > 0)
704: {
705: dlen = 1;
1.1.1.2 misho 706: #ifdef SUPPORT_UTF
707: if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
708: #endif /* SUPPORT_UTF */
1.1 misho 709: d = code[coptable[codevalue]];
710: if (codevalue >= OP_TYPESTAR)
711: {
712: switch(d)
713: {
714: case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
715: case OP_NOTPROP:
716: case OP_PROP: codevalue += OP_PROP_EXTRA; break;
717: case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
718: case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
719: case OP_NOT_HSPACE:
720: case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
721: case OP_NOT_VSPACE:
722: case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
723: default: break;
724: }
725: }
726: }
727: else
728: {
729: dlen = 0; /* Not strictly necessary, but compilers moan */
730: d = NOTACHAR; /* if these variables are not set. */
731: }
732:
733:
734: /* Now process the individual opcodes */
735:
736: switch (codevalue)
737: {
738: /* ========================================================================== */
739: /* These cases are never obeyed. This is a fudge that causes a compile-
740: time error if the vectors coptable or poptable, which are indexed by
741: opcode, are not the correct length. It seems to be the only way to do
742: such a check at compile time, as the sizeof() operator does not work
743: in the C preprocessor. */
744:
745: case OP_TABLE_LENGTH:
746: case OP_TABLE_LENGTH +
747: ((sizeof(coptable) == OP_TABLE_LENGTH) &&
748: (sizeof(poptable) == OP_TABLE_LENGTH)):
749: break;
750:
751: /* ========================================================================== */
752: /* Reached a closing bracket. If not at the end of the pattern, carry
753: on with the next opcode. For repeating opcodes, also add the repeat
754: state. Note that KETRPOS will always be encountered at the end of the
755: subpattern, because the possessive subpattern repeats are always handled
756: using recursive calls. Thus, it never adds any new states.
757:
758: At the end of the (sub)pattern, unless we have an empty string and
759: PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
760: start of the subject, save the match data, shifting up all previous
761: matches so we always have the longest first. */
762:
763: case OP_KET:
764: case OP_KETRMIN:
765: case OP_KETRMAX:
766: case OP_KETRPOS:
767: if (code != end_code)
768: {
769: ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
770: if (codevalue != OP_KET)
771: {
772: ADD_ACTIVE(state_offset - GET(code, 1), 0);
773: }
774: }
775: else
776: {
777: if (ptr > current_subject ||
778: ((md->moptions & PCRE_NOTEMPTY) == 0 &&
779: ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
780: current_subject > start_subject + md->start_offset)))
781: {
782: if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
783: else if (match_count > 0 && ++match_count * 2 > offsetcount)
784: match_count = 0;
785: count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
786: if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
787: if (offsetcount >= 2)
788: {
789: offsets[0] = (int)(current_subject - start_subject);
790: offsets[1] = (int)(ptr - start_subject);
791: DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
1.1.1.3 ! misho 792: offsets[1] - offsets[0], (char *)current_subject));
1.1 misho 793: }
794: if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
795: {
796: DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
797: "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
798: match_count, rlevel*2-2, SP));
799: return match_count;
800: }
801: }
802: }
803: break;
804:
805: /* ========================================================================== */
806: /* These opcodes add to the current list of states without looking
807: at the current character. */
808:
809: /*-----------------------------------------------------------------*/
810: case OP_ALT:
811: do { code += GET(code, 1); } while (*code == OP_ALT);
812: ADD_ACTIVE((int)(code - start_code), 0);
813: break;
814:
815: /*-----------------------------------------------------------------*/
816: case OP_BRA:
817: case OP_SBRA:
818: do
819: {
820: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
821: code += GET(code, 1);
822: }
823: while (*code == OP_ALT);
824: break;
825:
826: /*-----------------------------------------------------------------*/
827: case OP_CBRA:
828: case OP_SCBRA:
1.1.1.2 misho 829: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
1.1 misho 830: code += GET(code, 1);
831: while (*code == OP_ALT)
832: {
833: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
834: code += GET(code, 1);
835: }
836: break;
837:
838: /*-----------------------------------------------------------------*/
839: case OP_BRAZERO:
840: case OP_BRAMINZERO:
841: ADD_ACTIVE(state_offset + 1, 0);
842: code += 1 + GET(code, 2);
843: while (*code == OP_ALT) code += GET(code, 1);
844: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
845: break;
846:
847: /*-----------------------------------------------------------------*/
848: case OP_SKIPZERO:
849: code += 1 + GET(code, 2);
850: while (*code == OP_ALT) code += GET(code, 1);
851: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
852: break;
853:
854: /*-----------------------------------------------------------------*/
855: case OP_CIRC:
856: if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
857: { ADD_ACTIVE(state_offset + 1, 0); }
858: break;
859:
860: /*-----------------------------------------------------------------*/
861: case OP_CIRCM:
862: if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
863: (ptr != end_subject && WAS_NEWLINE(ptr)))
864: { ADD_ACTIVE(state_offset + 1, 0); }
865: break;
866:
867: /*-----------------------------------------------------------------*/
868: case OP_EOD:
869: if (ptr >= end_subject)
870: {
871: if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
872: could_continue = TRUE;
873: else { ADD_ACTIVE(state_offset + 1, 0); }
874: }
875: break;
876:
877: /*-----------------------------------------------------------------*/
878: case OP_SOD:
879: if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
880: break;
881:
882: /*-----------------------------------------------------------------*/
883: case OP_SOM:
884: if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
885: break;
886:
887:
888: /* ========================================================================== */
889: /* These opcodes inspect the next subject character, and sometimes
890: the previous one as well, but do not have an argument. The variable
891: clen contains the length of the current character and is zero if we are
892: at the end of the subject. */
893:
894: /*-----------------------------------------------------------------*/
895: case OP_ANY:
896: if (clen > 0 && !IS_NEWLINE(ptr))
1.1.1.3 ! misho 897: {
! 898: if (ptr + 1 >= md->end_subject &&
! 899: (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
! 900: NLBLOCK->nltype == NLTYPE_FIXED &&
! 901: NLBLOCK->nllen == 2 &&
! 902: c == NLBLOCK->nl[0])
! 903: {
! 904: could_continue = partial_newline = TRUE;
! 905: }
! 906: else
! 907: {
! 908: ADD_NEW(state_offset + 1, 0);
! 909: }
! 910: }
1.1 misho 911: break;
912:
913: /*-----------------------------------------------------------------*/
914: case OP_ALLANY:
915: if (clen > 0)
916: { ADD_NEW(state_offset + 1, 0); }
917: break;
918:
919: /*-----------------------------------------------------------------*/
920: case OP_EODN:
921: if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
922: could_continue = TRUE;
923: else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
924: { ADD_ACTIVE(state_offset + 1, 0); }
925: break;
926:
927: /*-----------------------------------------------------------------*/
928: case OP_DOLL:
929: if ((md->moptions & PCRE_NOTEOL) == 0)
930: {
931: if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
932: could_continue = TRUE;
933: else if (clen == 0 ||
934: ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
935: (ptr == end_subject - md->nllen)
936: ))
937: { ADD_ACTIVE(state_offset + 1, 0); }
1.1.1.3 ! misho 938: else if (ptr + 1 >= md->end_subject &&
! 939: (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
! 940: NLBLOCK->nltype == NLTYPE_FIXED &&
! 941: NLBLOCK->nllen == 2 &&
! 942: c == NLBLOCK->nl[0])
! 943: {
! 944: if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
! 945: {
! 946: reset_could_continue = TRUE;
! 947: ADD_NEW_DATA(-(state_offset + 1), 0, 1);
! 948: }
! 949: else could_continue = partial_newline = TRUE;
! 950: }
1.1 misho 951: }
952: break;
953:
954: /*-----------------------------------------------------------------*/
955: case OP_DOLLM:
956: if ((md->moptions & PCRE_NOTEOL) == 0)
957: {
958: if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
959: could_continue = TRUE;
960: else if (clen == 0 ||
961: ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
962: { ADD_ACTIVE(state_offset + 1, 0); }
1.1.1.3 ! misho 963: else if (ptr + 1 >= md->end_subject &&
! 964: (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
! 965: NLBLOCK->nltype == NLTYPE_FIXED &&
! 966: NLBLOCK->nllen == 2 &&
! 967: c == NLBLOCK->nl[0])
! 968: {
! 969: if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
! 970: {
! 971: reset_could_continue = TRUE;
! 972: ADD_NEW_DATA(-(state_offset + 1), 0, 1);
! 973: }
! 974: else could_continue = partial_newline = TRUE;
! 975: }
1.1 misho 976: }
977: else if (IS_NEWLINE(ptr))
978: { ADD_ACTIVE(state_offset + 1, 0); }
979: break;
980:
981: /*-----------------------------------------------------------------*/
982:
983: case OP_DIGIT:
984: case OP_WHITESPACE:
985: case OP_WORDCHAR:
986: if (clen > 0 && c < 256 &&
987: ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
988: { ADD_NEW(state_offset + 1, 0); }
989: break;
990:
991: /*-----------------------------------------------------------------*/
992: case OP_NOT_DIGIT:
993: case OP_NOT_WHITESPACE:
994: case OP_NOT_WORDCHAR:
995: if (clen > 0 && (c >= 256 ||
996: ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
997: { ADD_NEW(state_offset + 1, 0); }
998: break;
999:
1000: /*-----------------------------------------------------------------*/
1001: case OP_WORD_BOUNDARY:
1002: case OP_NOT_WORD_BOUNDARY:
1003: {
1004: int left_word, right_word;
1005:
1006: if (ptr > start_subject)
1007: {
1.1.1.2 misho 1008: const pcre_uchar *temp = ptr - 1;
1.1 misho 1009: if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1.1.1.2 misho 1010: #ifdef SUPPORT_UTF
1011: if (utf) { BACKCHAR(temp); }
1.1 misho 1012: #endif
1013: GETCHARTEST(d, temp);
1014: #ifdef SUPPORT_UCP
1015: if ((md->poptions & PCRE_UCP) != 0)
1016: {
1017: if (d == '_') left_word = TRUE; else
1018: {
1019: int cat = UCD_CATEGORY(d);
1020: left_word = (cat == ucp_L || cat == ucp_N);
1021: }
1022: }
1023: else
1024: #endif
1025: left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1026: }
1027: else left_word = FALSE;
1028:
1029: if (clen > 0)
1030: {
1031: #ifdef SUPPORT_UCP
1032: if ((md->poptions & PCRE_UCP) != 0)
1033: {
1034: if (c == '_') right_word = TRUE; else
1035: {
1036: int cat = UCD_CATEGORY(c);
1037: right_word = (cat == ucp_L || cat == ucp_N);
1038: }
1039: }
1040: else
1041: #endif
1042: right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1043: }
1044: else right_word = FALSE;
1045:
1046: if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1047: { ADD_ACTIVE(state_offset + 1, 0); }
1048: }
1049: break;
1050:
1051:
1052: /*-----------------------------------------------------------------*/
1053: /* Check the next character by Unicode property. We will get here only
1054: if the support is in the binary; otherwise a compile-time error occurs.
1055: */
1056:
1057: #ifdef SUPPORT_UCP
1058: case OP_PROP:
1059: case OP_NOTPROP:
1060: if (clen > 0)
1061: {
1062: BOOL OK;
1063: const ucd_record * prop = GET_UCD(c);
1064: switch(code[1])
1065: {
1066: case PT_ANY:
1067: OK = TRUE;
1068: break;
1069:
1070: case PT_LAMP:
1071: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1072: prop->chartype == ucp_Lt;
1073: break;
1074:
1075: case PT_GC:
1.1.1.2 misho 1076: OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1.1 misho 1077: break;
1078:
1079: case PT_PC:
1080: OK = prop->chartype == code[2];
1081: break;
1082:
1083: case PT_SC:
1084: OK = prop->script == code[2];
1085: break;
1086:
1087: /* These are specials for combination cases. */
1088:
1089: case PT_ALNUM:
1.1.1.2 misho 1090: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1091: PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1.1 misho 1092: break;
1093:
1094: case PT_SPACE: /* Perl space */
1.1.1.2 misho 1095: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1 misho 1096: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1097: break;
1098:
1099: case PT_PXSPACE: /* POSIX space */
1.1.1.2 misho 1100: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1 misho 1101: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1102: c == CHAR_FF || c == CHAR_CR;
1103: break;
1104:
1105: case PT_WORD:
1.1.1.2 misho 1106: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1107: PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1.1 misho 1108: c == CHAR_UNDERSCORE;
1109: break;
1110:
1111: /* Should never occur, but keep compilers from grumbling. */
1112:
1113: default:
1114: OK = codevalue != OP_PROP;
1115: break;
1116: }
1117:
1118: if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1119: }
1120: break;
1121: #endif
1122:
1123:
1124:
1125: /* ========================================================================== */
1126: /* These opcodes likewise inspect the subject character, but have an
1127: argument that is not a data character. It is one of these opcodes:
1128: OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1129: OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1130:
1131: case OP_TYPEPLUS:
1132: case OP_TYPEMINPLUS:
1133: case OP_TYPEPOSPLUS:
1134: count = current_state->count; /* Already matched */
1135: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1136: if (clen > 0)
1137: {
1.1.1.3 ! misho 1138: if (d == OP_ANY && ptr + 1 >= md->end_subject &&
! 1139: (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
! 1140: NLBLOCK->nltype == NLTYPE_FIXED &&
! 1141: NLBLOCK->nllen == 2 &&
! 1142: c == NLBLOCK->nl[0])
! 1143: {
! 1144: could_continue = partial_newline = TRUE;
! 1145: }
! 1146: else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1.1 misho 1147: (c < 256 &&
1148: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1149: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1150: {
1151: if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1152: {
1153: active_count--; /* Remove non-match possibility */
1154: next_active_state--;
1155: }
1156: count++;
1157: ADD_NEW(state_offset, count);
1158: }
1159: }
1160: break;
1161:
1162: /*-----------------------------------------------------------------*/
1163: case OP_TYPEQUERY:
1164: case OP_TYPEMINQUERY:
1165: case OP_TYPEPOSQUERY:
1166: ADD_ACTIVE(state_offset + 2, 0);
1167: if (clen > 0)
1168: {
1.1.1.3 ! misho 1169: if (d == OP_ANY && ptr + 1 >= md->end_subject &&
! 1170: (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
! 1171: NLBLOCK->nltype == NLTYPE_FIXED &&
! 1172: NLBLOCK->nllen == 2 &&
! 1173: c == NLBLOCK->nl[0])
! 1174: {
! 1175: could_continue = partial_newline = TRUE;
! 1176: }
! 1177: else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1.1 misho 1178: (c < 256 &&
1179: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1180: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1181: {
1182: if (codevalue == OP_TYPEPOSQUERY)
1183: {
1184: active_count--; /* Remove non-match possibility */
1185: next_active_state--;
1186: }
1187: ADD_NEW(state_offset + 2, 0);
1188: }
1189: }
1190: break;
1191:
1192: /*-----------------------------------------------------------------*/
1193: case OP_TYPESTAR:
1194: case OP_TYPEMINSTAR:
1195: case OP_TYPEPOSSTAR:
1196: ADD_ACTIVE(state_offset + 2, 0);
1197: if (clen > 0)
1198: {
1.1.1.3 ! misho 1199: if (d == OP_ANY && ptr + 1 >= md->end_subject &&
! 1200: (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
! 1201: NLBLOCK->nltype == NLTYPE_FIXED &&
! 1202: NLBLOCK->nllen == 2 &&
! 1203: c == NLBLOCK->nl[0])
! 1204: {
! 1205: could_continue = partial_newline = TRUE;
! 1206: }
! 1207: else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1.1 misho 1208: (c < 256 &&
1209: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1210: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1211: {
1212: if (codevalue == OP_TYPEPOSSTAR)
1213: {
1214: active_count--; /* Remove non-match possibility */
1215: next_active_state--;
1216: }
1217: ADD_NEW(state_offset, 0);
1218: }
1219: }
1220: break;
1221:
1222: /*-----------------------------------------------------------------*/
1223: case OP_TYPEEXACT:
1224: count = current_state->count; /* Number already matched */
1225: if (clen > 0)
1226: {
1.1.1.3 ! misho 1227: if (d == OP_ANY && ptr + 1 >= md->end_subject &&
! 1228: (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
! 1229: NLBLOCK->nltype == NLTYPE_FIXED &&
! 1230: NLBLOCK->nllen == 2 &&
! 1231: c == NLBLOCK->nl[0])
! 1232: {
! 1233: could_continue = partial_newline = TRUE;
! 1234: }
! 1235: else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1.1 misho 1236: (c < 256 &&
1237: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1238: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1239: {
1240: if (++count >= GET2(code, 1))
1.1.1.2 misho 1241: { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1.1 misho 1242: else
1243: { ADD_NEW(state_offset, count); }
1244: }
1245: }
1246: break;
1247:
1248: /*-----------------------------------------------------------------*/
1249: case OP_TYPEUPTO:
1250: case OP_TYPEMINUPTO:
1251: case OP_TYPEPOSUPTO:
1.1.1.2 misho 1252: ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1.1 misho 1253: count = current_state->count; /* Number already matched */
1254: if (clen > 0)
1255: {
1.1.1.3 ! misho 1256: if (d == OP_ANY && ptr + 1 >= md->end_subject &&
! 1257: (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
! 1258: NLBLOCK->nltype == NLTYPE_FIXED &&
! 1259: NLBLOCK->nllen == 2 &&
! 1260: c == NLBLOCK->nl[0])
! 1261: {
! 1262: could_continue = partial_newline = TRUE;
! 1263: }
! 1264: else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1.1 misho 1265: (c < 256 &&
1266: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1267: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1268: {
1269: if (codevalue == OP_TYPEPOSUPTO)
1270: {
1271: active_count--; /* Remove non-match possibility */
1272: next_active_state--;
1273: }
1274: if (++count >= GET2(code, 1))
1.1.1.2 misho 1275: { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1.1 misho 1276: else
1277: { ADD_NEW(state_offset, count); }
1278: }
1279: }
1280: break;
1281:
1282: /* ========================================================================== */
1283: /* These are virtual opcodes that are used when something like
1284: OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_VSPACE, or OP_HSPACE
1285: as its argument. It keeps the code above fast for the other cases. The argument
1286: is in the d variable. */
1287:
1288: #ifdef SUPPORT_UCP
1289: case OP_PROP_EXTRA + OP_TYPEPLUS:
1290: case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1291: case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1292: count = current_state->count; /* Already matched */
1293: if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1294: if (clen > 0)
1295: {
1296: BOOL OK;
1297: const ucd_record * prop = GET_UCD(c);
1298: switch(code[2])
1299: {
1300: case PT_ANY:
1301: OK = TRUE;
1302: break;
1303:
1304: case PT_LAMP:
1305: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1306: prop->chartype == ucp_Lt;
1307: break;
1308:
1309: case PT_GC:
1.1.1.2 misho 1310: OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1.1 misho 1311: break;
1312:
1313: case PT_PC:
1314: OK = prop->chartype == code[3];
1315: break;
1316:
1317: case PT_SC:
1318: OK = prop->script == code[3];
1319: break;
1320:
1321: /* These are specials for combination cases. */
1322:
1323: case PT_ALNUM:
1.1.1.2 misho 1324: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1325: PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1.1 misho 1326: break;
1327:
1328: case PT_SPACE: /* Perl space */
1.1.1.2 misho 1329: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1 misho 1330: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1331: break;
1332:
1333: case PT_PXSPACE: /* POSIX space */
1.1.1.2 misho 1334: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1 misho 1335: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1336: c == CHAR_FF || c == CHAR_CR;
1337: break;
1338:
1339: case PT_WORD:
1.1.1.2 misho 1340: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1341: PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1.1 misho 1342: c == CHAR_UNDERSCORE;
1343: break;
1344:
1345: /* Should never occur, but keep compilers from grumbling. */
1346:
1347: default:
1348: OK = codevalue != OP_PROP;
1349: break;
1350: }
1351:
1352: if (OK == (d == OP_PROP))
1353: {
1354: if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1355: {
1356: active_count--; /* Remove non-match possibility */
1357: next_active_state--;
1358: }
1359: count++;
1360: ADD_NEW(state_offset, count);
1361: }
1362: }
1363: break;
1364:
1365: /*-----------------------------------------------------------------*/
1366: case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1367: case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1368: case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1369: count = current_state->count; /* Already matched */
1370: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1371: if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1372: {
1.1.1.2 misho 1373: const pcre_uchar *nptr = ptr + clen;
1.1 misho 1374: int ncount = 0;
1375: if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1376: {
1377: active_count--; /* Remove non-match possibility */
1378: next_active_state--;
1379: }
1380: while (nptr < end_subject)
1381: {
1382: int nd;
1383: int ndlen = 1;
1384: GETCHARLEN(nd, nptr, ndlen);
1385: if (UCD_CATEGORY(nd) != ucp_M) break;
1386: ncount++;
1387: nptr += ndlen;
1388: }
1389: count++;
1390: ADD_NEW_DATA(-state_offset, count, ncount);
1391: }
1392: break;
1393: #endif
1394:
1395: /*-----------------------------------------------------------------*/
1396: case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1397: case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1398: case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1399: count = current_state->count; /* Already matched */
1400: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1401: if (clen > 0)
1402: {
1403: int ncount = 0;
1404: switch (c)
1405: {
1406: case 0x000b:
1407: case 0x000c:
1408: case 0x0085:
1409: case 0x2028:
1410: case 0x2029:
1411: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1412: goto ANYNL01;
1413:
1414: case 0x000d:
1415: if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1416: /* Fall through */
1417:
1418: ANYNL01:
1419: case 0x000a:
1420: if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1421: {
1422: active_count--; /* Remove non-match possibility */
1423: next_active_state--;
1424: }
1425: count++;
1426: ADD_NEW_DATA(-state_offset, count, ncount);
1427: break;
1428:
1429: default:
1430: break;
1431: }
1432: }
1433: break;
1434:
1435: /*-----------------------------------------------------------------*/
1436: case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1437: case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1438: case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1439: count = current_state->count; /* Already matched */
1440: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1441: if (clen > 0)
1442: {
1443: BOOL OK;
1444: switch (c)
1445: {
1446: case 0x000a:
1447: case 0x000b:
1448: case 0x000c:
1449: case 0x000d:
1450: case 0x0085:
1451: case 0x2028:
1452: case 0x2029:
1453: OK = TRUE;
1454: break;
1455:
1456: default:
1457: OK = FALSE;
1458: break;
1459: }
1460:
1461: if (OK == (d == OP_VSPACE))
1462: {
1463: if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1464: {
1465: active_count--; /* Remove non-match possibility */
1466: next_active_state--;
1467: }
1468: count++;
1469: ADD_NEW_DATA(-state_offset, count, 0);
1470: }
1471: }
1472: break;
1473:
1474: /*-----------------------------------------------------------------*/
1475: case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1476: case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1477: case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1478: count = current_state->count; /* Already matched */
1479: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1480: if (clen > 0)
1481: {
1482: BOOL OK;
1483: switch (c)
1484: {
1485: case 0x09: /* HT */
1486: case 0x20: /* SPACE */
1487: case 0xa0: /* NBSP */
1488: case 0x1680: /* OGHAM SPACE MARK */
1489: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1490: case 0x2000: /* EN QUAD */
1491: case 0x2001: /* EM QUAD */
1492: case 0x2002: /* EN SPACE */
1493: case 0x2003: /* EM SPACE */
1494: case 0x2004: /* THREE-PER-EM SPACE */
1495: case 0x2005: /* FOUR-PER-EM SPACE */
1496: case 0x2006: /* SIX-PER-EM SPACE */
1497: case 0x2007: /* FIGURE SPACE */
1498: case 0x2008: /* PUNCTUATION SPACE */
1499: case 0x2009: /* THIN SPACE */
1500: case 0x200A: /* HAIR SPACE */
1501: case 0x202f: /* NARROW NO-BREAK SPACE */
1502: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1503: case 0x3000: /* IDEOGRAPHIC SPACE */
1504: OK = TRUE;
1505: break;
1506:
1507: default:
1508: OK = FALSE;
1509: break;
1510: }
1511:
1512: if (OK == (d == OP_HSPACE))
1513: {
1514: if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1515: {
1516: active_count--; /* Remove non-match possibility */
1517: next_active_state--;
1518: }
1519: count++;
1520: ADD_NEW_DATA(-state_offset, count, 0);
1521: }
1522: }
1523: break;
1524:
1525: /*-----------------------------------------------------------------*/
1526: #ifdef SUPPORT_UCP
1527: case OP_PROP_EXTRA + OP_TYPEQUERY:
1528: case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1529: case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1530: count = 4;
1531: goto QS1;
1532:
1533: case OP_PROP_EXTRA + OP_TYPESTAR:
1534: case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1535: case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1536: count = 0;
1537:
1538: QS1:
1539:
1540: ADD_ACTIVE(state_offset + 4, 0);
1541: if (clen > 0)
1542: {
1543: BOOL OK;
1544: const ucd_record * prop = GET_UCD(c);
1545: switch(code[2])
1546: {
1547: case PT_ANY:
1548: OK = TRUE;
1549: break;
1550:
1551: case PT_LAMP:
1552: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1553: prop->chartype == ucp_Lt;
1554: break;
1555:
1556: case PT_GC:
1.1.1.2 misho 1557: OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1.1 misho 1558: break;
1559:
1560: case PT_PC:
1561: OK = prop->chartype == code[3];
1562: break;
1563:
1564: case PT_SC:
1565: OK = prop->script == code[3];
1566: break;
1567:
1568: /* These are specials for combination cases. */
1569:
1570: case PT_ALNUM:
1.1.1.2 misho 1571: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1572: PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1.1 misho 1573: break;
1574:
1575: case PT_SPACE: /* Perl space */
1.1.1.2 misho 1576: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1 misho 1577: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1578: break;
1579:
1580: case PT_PXSPACE: /* POSIX space */
1.1.1.2 misho 1581: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1 misho 1582: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1583: c == CHAR_FF || c == CHAR_CR;
1584: break;
1585:
1586: case PT_WORD:
1.1.1.2 misho 1587: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1588: PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1.1 misho 1589: c == CHAR_UNDERSCORE;
1590: break;
1591:
1592: /* Should never occur, but keep compilers from grumbling. */
1593:
1594: default:
1595: OK = codevalue != OP_PROP;
1596: break;
1597: }
1598:
1599: if (OK == (d == OP_PROP))
1600: {
1601: if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1602: codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1603: {
1604: active_count--; /* Remove non-match possibility */
1605: next_active_state--;
1606: }
1607: ADD_NEW(state_offset + count, 0);
1608: }
1609: }
1610: break;
1611:
1612: /*-----------------------------------------------------------------*/
1613: case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1614: case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1615: case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1616: count = 2;
1617: goto QS2;
1618:
1619: case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1620: case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1621: case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1622: count = 0;
1623:
1624: QS2:
1625:
1626: ADD_ACTIVE(state_offset + 2, 0);
1627: if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1628: {
1.1.1.2 misho 1629: const pcre_uchar *nptr = ptr + clen;
1.1 misho 1630: int ncount = 0;
1631: if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1632: codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1633: {
1634: active_count--; /* Remove non-match possibility */
1635: next_active_state--;
1636: }
1637: while (nptr < end_subject)
1638: {
1639: int nd;
1640: int ndlen = 1;
1641: GETCHARLEN(nd, nptr, ndlen);
1642: if (UCD_CATEGORY(nd) != ucp_M) break;
1643: ncount++;
1644: nptr += ndlen;
1645: }
1646: ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1647: }
1648: break;
1649: #endif
1650:
1651: /*-----------------------------------------------------------------*/
1652: case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1653: case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1654: case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1655: count = 2;
1656: goto QS3;
1657:
1658: case OP_ANYNL_EXTRA + OP_TYPESTAR:
1659: case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1660: case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1661: count = 0;
1662:
1663: QS3:
1664: ADD_ACTIVE(state_offset + 2, 0);
1665: if (clen > 0)
1666: {
1667: int ncount = 0;
1668: switch (c)
1669: {
1670: case 0x000b:
1671: case 0x000c:
1672: case 0x0085:
1673: case 0x2028:
1674: case 0x2029:
1675: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1676: goto ANYNL02;
1677:
1678: case 0x000d:
1679: if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1680: /* Fall through */
1681:
1682: ANYNL02:
1683: case 0x000a:
1684: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1685: codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1686: {
1687: active_count--; /* Remove non-match possibility */
1688: next_active_state--;
1689: }
1690: ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1691: break;
1692:
1693: default:
1694: break;
1695: }
1696: }
1697: break;
1698:
1699: /*-----------------------------------------------------------------*/
1700: case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1701: case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1702: case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1703: count = 2;
1704: goto QS4;
1705:
1706: case OP_VSPACE_EXTRA + OP_TYPESTAR:
1707: case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1708: case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1709: count = 0;
1710:
1711: QS4:
1712: ADD_ACTIVE(state_offset + 2, 0);
1713: if (clen > 0)
1714: {
1715: BOOL OK;
1716: switch (c)
1717: {
1718: case 0x000a:
1719: case 0x000b:
1720: case 0x000c:
1721: case 0x000d:
1722: case 0x0085:
1723: case 0x2028:
1724: case 0x2029:
1725: OK = TRUE;
1726: break;
1727:
1728: default:
1729: OK = FALSE;
1730: break;
1731: }
1732: if (OK == (d == OP_VSPACE))
1733: {
1734: if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1735: codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1736: {
1737: active_count--; /* Remove non-match possibility */
1738: next_active_state--;
1739: }
1740: ADD_NEW_DATA(-(state_offset + count), 0, 0);
1741: }
1742: }
1743: break;
1744:
1745: /*-----------------------------------------------------------------*/
1746: case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1747: case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1748: case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1749: count = 2;
1750: goto QS5;
1751:
1752: case OP_HSPACE_EXTRA + OP_TYPESTAR:
1753: case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1754: case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1755: count = 0;
1756:
1757: QS5:
1758: ADD_ACTIVE(state_offset + 2, 0);
1759: if (clen > 0)
1760: {
1761: BOOL OK;
1762: switch (c)
1763: {
1764: case 0x09: /* HT */
1765: case 0x20: /* SPACE */
1766: case 0xa0: /* NBSP */
1767: case 0x1680: /* OGHAM SPACE MARK */
1768: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1769: case 0x2000: /* EN QUAD */
1770: case 0x2001: /* EM QUAD */
1771: case 0x2002: /* EN SPACE */
1772: case 0x2003: /* EM SPACE */
1773: case 0x2004: /* THREE-PER-EM SPACE */
1774: case 0x2005: /* FOUR-PER-EM SPACE */
1775: case 0x2006: /* SIX-PER-EM SPACE */
1776: case 0x2007: /* FIGURE SPACE */
1777: case 0x2008: /* PUNCTUATION SPACE */
1778: case 0x2009: /* THIN SPACE */
1779: case 0x200A: /* HAIR SPACE */
1780: case 0x202f: /* NARROW NO-BREAK SPACE */
1781: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1782: case 0x3000: /* IDEOGRAPHIC SPACE */
1783: OK = TRUE;
1784: break;
1785:
1786: default:
1787: OK = FALSE;
1788: break;
1789: }
1790:
1791: if (OK == (d == OP_HSPACE))
1792: {
1793: if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1794: codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1795: {
1796: active_count--; /* Remove non-match possibility */
1797: next_active_state--;
1798: }
1799: ADD_NEW_DATA(-(state_offset + count), 0, 0);
1800: }
1801: }
1802: break;
1803:
1804: /*-----------------------------------------------------------------*/
1805: #ifdef SUPPORT_UCP
1806: case OP_PROP_EXTRA + OP_TYPEEXACT:
1807: case OP_PROP_EXTRA + OP_TYPEUPTO:
1808: case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1809: case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1810: if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1.1.1.2 misho 1811: { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1.1 misho 1812: count = current_state->count; /* Number already matched */
1813: if (clen > 0)
1814: {
1815: BOOL OK;
1816: const ucd_record * prop = GET_UCD(c);
1.1.1.2 misho 1817: switch(code[1 + IMM2_SIZE + 1])
1.1 misho 1818: {
1819: case PT_ANY:
1820: OK = TRUE;
1821: break;
1822:
1823: case PT_LAMP:
1824: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1825: prop->chartype == ucp_Lt;
1826: break;
1827:
1828: case PT_GC:
1.1.1.2 misho 1829: OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1.1 misho 1830: break;
1831:
1832: case PT_PC:
1.1.1.2 misho 1833: OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1.1 misho 1834: break;
1835:
1836: case PT_SC:
1.1.1.2 misho 1837: OK = prop->script == code[1 + IMM2_SIZE + 2];
1.1 misho 1838: break;
1839:
1840: /* These are specials for combination cases. */
1841:
1842: case PT_ALNUM:
1.1.1.2 misho 1843: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1844: PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1.1 misho 1845: break;
1846:
1847: case PT_SPACE: /* Perl space */
1.1.1.2 misho 1848: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1 misho 1849: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1850: break;
1851:
1852: case PT_PXSPACE: /* POSIX space */
1.1.1.2 misho 1853: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1 misho 1854: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1855: c == CHAR_FF || c == CHAR_CR;
1856: break;
1857:
1858: case PT_WORD:
1.1.1.2 misho 1859: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1860: PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1.1 misho 1861: c == CHAR_UNDERSCORE;
1862: break;
1863:
1864: /* Should never occur, but keep compilers from grumbling. */
1865:
1866: default:
1867: OK = codevalue != OP_PROP;
1868: break;
1869: }
1870:
1871: if (OK == (d == OP_PROP))
1872: {
1873: if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1874: {
1875: active_count--; /* Remove non-match possibility */
1876: next_active_state--;
1877: }
1878: if (++count >= GET2(code, 1))
1.1.1.2 misho 1879: { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1.1 misho 1880: else
1881: { ADD_NEW(state_offset, count); }
1882: }
1883: }
1884: break;
1885:
1886: /*-----------------------------------------------------------------*/
1887: case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1888: case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1889: case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1890: case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1891: if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1.1.1.2 misho 1892: { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1.1 misho 1893: count = current_state->count; /* Number already matched */
1894: if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1895: {
1.1.1.2 misho 1896: const pcre_uchar *nptr = ptr + clen;
1.1 misho 1897: int ncount = 0;
1898: if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1899: {
1900: active_count--; /* Remove non-match possibility */
1901: next_active_state--;
1902: }
1903: while (nptr < end_subject)
1904: {
1905: int nd;
1906: int ndlen = 1;
1907: GETCHARLEN(nd, nptr, ndlen);
1908: if (UCD_CATEGORY(nd) != ucp_M) break;
1909: ncount++;
1910: nptr += ndlen;
1911: }
1.1.1.3 ! misho 1912: if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
! 1913: reset_could_continue = TRUE;
1.1 misho 1914: if (++count >= GET2(code, 1))
1.1.1.2 misho 1915: { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1.1 misho 1916: else
1917: { ADD_NEW_DATA(-state_offset, count, ncount); }
1918: }
1919: break;
1920: #endif
1921:
1922: /*-----------------------------------------------------------------*/
1923: case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1924: case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1925: case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1926: case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1927: if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1.1.1.2 misho 1928: { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1.1 misho 1929: count = current_state->count; /* Number already matched */
1930: if (clen > 0)
1931: {
1932: int ncount = 0;
1933: switch (c)
1934: {
1935: case 0x000b:
1936: case 0x000c:
1937: case 0x0085:
1938: case 0x2028:
1939: case 0x2029:
1940: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1941: goto ANYNL03;
1942:
1943: case 0x000d:
1944: if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1945: /* Fall through */
1946:
1947: ANYNL03:
1948: case 0x000a:
1949: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1950: {
1951: active_count--; /* Remove non-match possibility */
1952: next_active_state--;
1953: }
1954: if (++count >= GET2(code, 1))
1.1.1.2 misho 1955: { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1.1 misho 1956: else
1957: { ADD_NEW_DATA(-state_offset, count, ncount); }
1958: break;
1959:
1960: default:
1961: break;
1962: }
1963: }
1964: break;
1965:
1966: /*-----------------------------------------------------------------*/
1967: case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1968: case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1969: case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1970: case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1971: if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1.1.1.2 misho 1972: { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1.1 misho 1973: count = current_state->count; /* Number already matched */
1974: if (clen > 0)
1975: {
1976: BOOL OK;
1977: switch (c)
1978: {
1979: case 0x000a:
1980: case 0x000b:
1981: case 0x000c:
1982: case 0x000d:
1983: case 0x0085:
1984: case 0x2028:
1985: case 0x2029:
1986: OK = TRUE;
1987: break;
1988:
1989: default:
1990: OK = FALSE;
1991: }
1992:
1993: if (OK == (d == OP_VSPACE))
1994: {
1995: if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1996: {
1997: active_count--; /* Remove non-match possibility */
1998: next_active_state--;
1999: }
2000: if (++count >= GET2(code, 1))
1.1.1.2 misho 2001: { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1.1 misho 2002: else
2003: { ADD_NEW_DATA(-state_offset, count, 0); }
2004: }
2005: }
2006: break;
2007:
2008: /*-----------------------------------------------------------------*/
2009: case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2010: case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2011: case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2012: case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2013: if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1.1.1.2 misho 2014: { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1.1 misho 2015: count = current_state->count; /* Number already matched */
2016: if (clen > 0)
2017: {
2018: BOOL OK;
2019: switch (c)
2020: {
2021: case 0x09: /* HT */
2022: case 0x20: /* SPACE */
2023: case 0xa0: /* NBSP */
2024: case 0x1680: /* OGHAM SPACE MARK */
2025: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2026: case 0x2000: /* EN QUAD */
2027: case 0x2001: /* EM QUAD */
2028: case 0x2002: /* EN SPACE */
2029: case 0x2003: /* EM SPACE */
2030: case 0x2004: /* THREE-PER-EM SPACE */
2031: case 0x2005: /* FOUR-PER-EM SPACE */
2032: case 0x2006: /* SIX-PER-EM SPACE */
2033: case 0x2007: /* FIGURE SPACE */
2034: case 0x2008: /* PUNCTUATION SPACE */
2035: case 0x2009: /* THIN SPACE */
2036: case 0x200A: /* HAIR SPACE */
2037: case 0x202f: /* NARROW NO-BREAK SPACE */
2038: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2039: case 0x3000: /* IDEOGRAPHIC SPACE */
2040: OK = TRUE;
2041: break;
2042:
2043: default:
2044: OK = FALSE;
2045: break;
2046: }
2047:
2048: if (OK == (d == OP_HSPACE))
2049: {
2050: if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2051: {
2052: active_count--; /* Remove non-match possibility */
2053: next_active_state--;
2054: }
2055: if (++count >= GET2(code, 1))
1.1.1.2 misho 2056: { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1.1 misho 2057: else
2058: { ADD_NEW_DATA(-state_offset, count, 0); }
2059: }
2060: }
2061: break;
2062:
2063: /* ========================================================================== */
2064: /* These opcodes are followed by a character that is usually compared
2065: to the current subject character; it is loaded into d. We still get
2066: here even if there is no subject character, because in some cases zero
2067: repetitions are permitted. */
2068:
2069: /*-----------------------------------------------------------------*/
2070: case OP_CHAR:
2071: if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2072: break;
2073:
2074: /*-----------------------------------------------------------------*/
2075: case OP_CHARI:
2076: if (clen == 0) break;
2077:
1.1.1.2 misho 2078: #ifdef SUPPORT_UTF
2079: if (utf)
1.1 misho 2080: {
2081: if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2082: {
2083: unsigned int othercase;
1.1.1.2 misho 2084: if (c < 128)
2085: othercase = fcc[c];
2086: else
2087: /* If we have Unicode property support, we can use it to test the
2088: other case of the character. */
1.1 misho 2089: #ifdef SUPPORT_UCP
1.1.1.2 misho 2090: othercase = UCD_OTHERCASE(c);
1.1 misho 2091: #else
1.1.1.2 misho 2092: othercase = NOTACHAR;
1.1 misho 2093: #endif
2094:
2095: if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2096: }
2097: }
2098: else
1.1.1.2 misho 2099: #endif /* SUPPORT_UTF */
2100: /* Not UTF mode */
1.1 misho 2101: {
1.1.1.2 misho 2102: if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2103: { ADD_NEW(state_offset + 2, 0); }
1.1 misho 2104: }
2105: break;
2106:
2107:
2108: #ifdef SUPPORT_UCP
2109: /*-----------------------------------------------------------------*/
2110: /* This is a tricky one because it can match more than one character.
2111: Find out how many characters to skip, and then set up a negative state
2112: to wait for them to pass before continuing. */
2113:
2114: case OP_EXTUNI:
2115: if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2116: {
1.1.1.2 misho 2117: const pcre_uchar *nptr = ptr + clen;
1.1 misho 2118: int ncount = 0;
2119: while (nptr < end_subject)
2120: {
2121: int nclen = 1;
2122: GETCHARLEN(c, nptr, nclen);
2123: if (UCD_CATEGORY(c) != ucp_M) break;
2124: ncount++;
2125: nptr += nclen;
2126: }
1.1.1.3 ! misho 2127: if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
! 2128: reset_could_continue = TRUE;
1.1 misho 2129: ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2130: }
2131: break;
2132: #endif
2133:
2134: /*-----------------------------------------------------------------*/
2135: /* This is a tricky one like EXTUNI because it too can match more than one
2136: character (when CR is followed by LF). In this case, set up a negative
2137: state to wait for one character to pass before continuing. */
2138:
2139: case OP_ANYNL:
2140: if (clen > 0) switch(c)
2141: {
2142: case 0x000b:
2143: case 0x000c:
2144: case 0x0085:
2145: case 0x2028:
2146: case 0x2029:
2147: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2148:
2149: case 0x000a:
2150: ADD_NEW(state_offset + 1, 0);
2151: break;
2152:
2153: case 0x000d:
1.1.1.3 ! misho 2154: if (ptr + 1 >= end_subject)
! 2155: {
! 2156: ADD_NEW(state_offset + 1, 0);
! 2157: if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
! 2158: reset_could_continue = TRUE;
! 2159: }
! 2160: else if (ptr[1] == 0x0a)
1.1 misho 2161: {
2162: ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2163: }
2164: else
2165: {
2166: ADD_NEW(state_offset + 1, 0);
2167: }
2168: break;
2169: }
2170: break;
2171:
2172: /*-----------------------------------------------------------------*/
2173: case OP_NOT_VSPACE:
2174: if (clen > 0) switch(c)
2175: {
2176: case 0x000a:
2177: case 0x000b:
2178: case 0x000c:
2179: case 0x000d:
2180: case 0x0085:
2181: case 0x2028:
2182: case 0x2029:
2183: break;
2184:
2185: default:
2186: ADD_NEW(state_offset + 1, 0);
2187: break;
2188: }
2189: break;
2190:
2191: /*-----------------------------------------------------------------*/
2192: case OP_VSPACE:
2193: if (clen > 0) switch(c)
2194: {
2195: case 0x000a:
2196: case 0x000b:
2197: case 0x000c:
2198: case 0x000d:
2199: case 0x0085:
2200: case 0x2028:
2201: case 0x2029:
2202: ADD_NEW(state_offset + 1, 0);
2203: break;
2204:
2205: default: break;
2206: }
2207: break;
2208:
2209: /*-----------------------------------------------------------------*/
2210: case OP_NOT_HSPACE:
2211: if (clen > 0) switch(c)
2212: {
2213: case 0x09: /* HT */
2214: case 0x20: /* SPACE */
2215: case 0xa0: /* NBSP */
2216: case 0x1680: /* OGHAM SPACE MARK */
2217: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2218: case 0x2000: /* EN QUAD */
2219: case 0x2001: /* EM QUAD */
2220: case 0x2002: /* EN SPACE */
2221: case 0x2003: /* EM SPACE */
2222: case 0x2004: /* THREE-PER-EM SPACE */
2223: case 0x2005: /* FOUR-PER-EM SPACE */
2224: case 0x2006: /* SIX-PER-EM SPACE */
2225: case 0x2007: /* FIGURE SPACE */
2226: case 0x2008: /* PUNCTUATION SPACE */
2227: case 0x2009: /* THIN SPACE */
2228: case 0x200A: /* HAIR SPACE */
2229: case 0x202f: /* NARROW NO-BREAK SPACE */
2230: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2231: case 0x3000: /* IDEOGRAPHIC SPACE */
2232: break;
2233:
2234: default:
2235: ADD_NEW(state_offset + 1, 0);
2236: break;
2237: }
2238: break;
2239:
2240: /*-----------------------------------------------------------------*/
2241: case OP_HSPACE:
2242: if (clen > 0) switch(c)
2243: {
2244: case 0x09: /* HT */
2245: case 0x20: /* SPACE */
2246: case 0xa0: /* NBSP */
2247: case 0x1680: /* OGHAM SPACE MARK */
2248: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2249: case 0x2000: /* EN QUAD */
2250: case 0x2001: /* EM QUAD */
2251: case 0x2002: /* EN SPACE */
2252: case 0x2003: /* EM SPACE */
2253: case 0x2004: /* THREE-PER-EM SPACE */
2254: case 0x2005: /* FOUR-PER-EM SPACE */
2255: case 0x2006: /* SIX-PER-EM SPACE */
2256: case 0x2007: /* FIGURE SPACE */
2257: case 0x2008: /* PUNCTUATION SPACE */
2258: case 0x2009: /* THIN SPACE */
2259: case 0x200A: /* HAIR SPACE */
2260: case 0x202f: /* NARROW NO-BREAK SPACE */
2261: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2262: case 0x3000: /* IDEOGRAPHIC SPACE */
2263: ADD_NEW(state_offset + 1, 0);
2264: break;
2265: }
2266: break;
2267:
2268: /*-----------------------------------------------------------------*/
1.1.1.3 ! misho 2269: /* Match a negated single character casefully. */
1.1 misho 2270:
2271: case OP_NOT:
2272: if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2273: break;
2274:
2275: /*-----------------------------------------------------------------*/
1.1.1.3 ! misho 2276: /* Match a negated single character caselessly. */
1.1 misho 2277:
2278: case OP_NOTI:
1.1.1.3 ! misho 2279: if (clen > 0)
! 2280: {
! 2281: unsigned int otherd;
! 2282: #ifdef SUPPORT_UTF
! 2283: if (utf && d >= 128)
! 2284: {
! 2285: #ifdef SUPPORT_UCP
! 2286: otherd = UCD_OTHERCASE(d);
! 2287: #endif /* SUPPORT_UCP */
! 2288: }
! 2289: else
! 2290: #endif /* SUPPORT_UTF */
! 2291: otherd = TABLE_GET(d, fcc, d);
! 2292: if (c != d && c != otherd)
! 2293: { ADD_NEW(state_offset + dlen + 1, 0); }
! 2294: }
1.1 misho 2295: break;
2296:
2297: /*-----------------------------------------------------------------*/
2298: case OP_PLUSI:
2299: case OP_MINPLUSI:
2300: case OP_POSPLUSI:
2301: case OP_NOTPLUSI:
2302: case OP_NOTMINPLUSI:
2303: case OP_NOTPOSPLUSI:
2304: caseless = TRUE;
2305: codevalue -= OP_STARI - OP_STAR;
2306:
2307: /* Fall through */
2308: case OP_PLUS:
2309: case OP_MINPLUS:
2310: case OP_POSPLUS:
2311: case OP_NOTPLUS:
2312: case OP_NOTMINPLUS:
2313: case OP_NOTPOSPLUS:
2314: count = current_state->count; /* Already matched */
2315: if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2316: if (clen > 0)
2317: {
2318: unsigned int otherd = NOTACHAR;
2319: if (caseless)
2320: {
1.1.1.2 misho 2321: #ifdef SUPPORT_UTF
2322: if (utf && d >= 128)
1.1 misho 2323: {
2324: #ifdef SUPPORT_UCP
2325: otherd = UCD_OTHERCASE(d);
2326: #endif /* SUPPORT_UCP */
2327: }
2328: else
1.1.1.2 misho 2329: #endif /* SUPPORT_UTF */
2330: otherd = TABLE_GET(d, fcc, d);
1.1 misho 2331: }
2332: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2333: {
2334: if (count > 0 &&
2335: (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2336: {
2337: active_count--; /* Remove non-match possibility */
2338: next_active_state--;
2339: }
2340: count++;
2341: ADD_NEW(state_offset, count);
2342: }
2343: }
2344: break;
2345:
2346: /*-----------------------------------------------------------------*/
2347: case OP_QUERYI:
2348: case OP_MINQUERYI:
2349: case OP_POSQUERYI:
2350: case OP_NOTQUERYI:
2351: case OP_NOTMINQUERYI:
2352: case OP_NOTPOSQUERYI:
2353: caseless = TRUE;
2354: codevalue -= OP_STARI - OP_STAR;
2355: /* Fall through */
2356: case OP_QUERY:
2357: case OP_MINQUERY:
2358: case OP_POSQUERY:
2359: case OP_NOTQUERY:
2360: case OP_NOTMINQUERY:
2361: case OP_NOTPOSQUERY:
2362: ADD_ACTIVE(state_offset + dlen + 1, 0);
2363: if (clen > 0)
2364: {
2365: unsigned int otherd = NOTACHAR;
2366: if (caseless)
2367: {
1.1.1.2 misho 2368: #ifdef SUPPORT_UTF
2369: if (utf && d >= 128)
1.1 misho 2370: {
2371: #ifdef SUPPORT_UCP
2372: otherd = UCD_OTHERCASE(d);
2373: #endif /* SUPPORT_UCP */
2374: }
2375: else
1.1.1.2 misho 2376: #endif /* SUPPORT_UTF */
2377: otherd = TABLE_GET(d, fcc, d);
1.1 misho 2378: }
2379: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2380: {
2381: if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2382: {
2383: active_count--; /* Remove non-match possibility */
2384: next_active_state--;
2385: }
2386: ADD_NEW(state_offset + dlen + 1, 0);
2387: }
2388: }
2389: break;
2390:
2391: /*-----------------------------------------------------------------*/
2392: case OP_STARI:
2393: case OP_MINSTARI:
2394: case OP_POSSTARI:
2395: case OP_NOTSTARI:
2396: case OP_NOTMINSTARI:
2397: case OP_NOTPOSSTARI:
2398: caseless = TRUE;
2399: codevalue -= OP_STARI - OP_STAR;
2400: /* Fall through */
2401: case OP_STAR:
2402: case OP_MINSTAR:
2403: case OP_POSSTAR:
2404: case OP_NOTSTAR:
2405: case OP_NOTMINSTAR:
2406: case OP_NOTPOSSTAR:
2407: ADD_ACTIVE(state_offset + dlen + 1, 0);
2408: if (clen > 0)
2409: {
2410: unsigned int otherd = NOTACHAR;
2411: if (caseless)
2412: {
1.1.1.2 misho 2413: #ifdef SUPPORT_UTF
2414: if (utf && d >= 128)
1.1 misho 2415: {
2416: #ifdef SUPPORT_UCP
2417: otherd = UCD_OTHERCASE(d);
2418: #endif /* SUPPORT_UCP */
2419: }
2420: else
1.1.1.2 misho 2421: #endif /* SUPPORT_UTF */
2422: otherd = TABLE_GET(d, fcc, d);
1.1 misho 2423: }
2424: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2425: {
2426: if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2427: {
2428: active_count--; /* Remove non-match possibility */
2429: next_active_state--;
2430: }
2431: ADD_NEW(state_offset, 0);
2432: }
2433: }
2434: break;
2435:
2436: /*-----------------------------------------------------------------*/
2437: case OP_EXACTI:
2438: case OP_NOTEXACTI:
2439: caseless = TRUE;
2440: codevalue -= OP_STARI - OP_STAR;
2441: /* Fall through */
2442: case OP_EXACT:
2443: case OP_NOTEXACT:
2444: count = current_state->count; /* Number already matched */
2445: if (clen > 0)
2446: {
2447: unsigned int otherd = NOTACHAR;
2448: if (caseless)
2449: {
1.1.1.2 misho 2450: #ifdef SUPPORT_UTF
2451: if (utf && d >= 128)
1.1 misho 2452: {
2453: #ifdef SUPPORT_UCP
2454: otherd = UCD_OTHERCASE(d);
2455: #endif /* SUPPORT_UCP */
2456: }
2457: else
1.1.1.2 misho 2458: #endif /* SUPPORT_UTF */
2459: otherd = TABLE_GET(d, fcc, d);
1.1 misho 2460: }
2461: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2462: {
2463: if (++count >= GET2(code, 1))
1.1.1.2 misho 2464: { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
1.1 misho 2465: else
2466: { ADD_NEW(state_offset, count); }
2467: }
2468: }
2469: break;
2470:
2471: /*-----------------------------------------------------------------*/
2472: case OP_UPTOI:
2473: case OP_MINUPTOI:
2474: case OP_POSUPTOI:
2475: case OP_NOTUPTOI:
2476: case OP_NOTMINUPTOI:
2477: case OP_NOTPOSUPTOI:
2478: caseless = TRUE;
2479: codevalue -= OP_STARI - OP_STAR;
2480: /* Fall through */
2481: case OP_UPTO:
2482: case OP_MINUPTO:
2483: case OP_POSUPTO:
2484: case OP_NOTUPTO:
2485: case OP_NOTMINUPTO:
2486: case OP_NOTPOSUPTO:
1.1.1.2 misho 2487: ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
1.1 misho 2488: count = current_state->count; /* Number already matched */
2489: if (clen > 0)
2490: {
2491: unsigned int otherd = NOTACHAR;
2492: if (caseless)
2493: {
1.1.1.2 misho 2494: #ifdef SUPPORT_UTF
2495: if (utf && d >= 128)
1.1 misho 2496: {
2497: #ifdef SUPPORT_UCP
2498: otherd = UCD_OTHERCASE(d);
2499: #endif /* SUPPORT_UCP */
2500: }
2501: else
1.1.1.2 misho 2502: #endif /* SUPPORT_UTF */
2503: otherd = TABLE_GET(d, fcc, d);
1.1 misho 2504: }
2505: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2506: {
2507: if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2508: {
2509: active_count--; /* Remove non-match possibility */
2510: next_active_state--;
2511: }
2512: if (++count >= GET2(code, 1))
1.1.1.2 misho 2513: { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
1.1 misho 2514: else
2515: { ADD_NEW(state_offset, count); }
2516: }
2517: }
2518: break;
2519:
2520:
2521: /* ========================================================================== */
2522: /* These are the class-handling opcodes */
2523:
2524: case OP_CLASS:
2525: case OP_NCLASS:
2526: case OP_XCLASS:
2527: {
2528: BOOL isinclass = FALSE;
2529: int next_state_offset;
1.1.1.2 misho 2530: const pcre_uchar *ecode;
1.1 misho 2531:
2532: /* For a simple class, there is always just a 32-byte table, and we
2533: can set isinclass from it. */
2534:
2535: if (codevalue != OP_XCLASS)
2536: {
1.1.1.2 misho 2537: ecode = code + 1 + (32 / sizeof(pcre_uchar));
1.1 misho 2538: if (clen > 0)
2539: {
2540: isinclass = (c > 255)? (codevalue == OP_NCLASS) :
1.1.1.2 misho 2541: ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
1.1 misho 2542: }
2543: }
2544:
2545: /* An extended class may have a table or a list of single characters,
2546: ranges, or both, and it may be positive or negative. There's a
2547: function that sorts all this out. */
2548:
2549: else
2550: {
2551: ecode = code + GET(code, 1);
1.1.1.2 misho 2552: if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
1.1 misho 2553: }
2554:
2555: /* At this point, isinclass is set for all kinds of class, and ecode
2556: points to the byte after the end of the class. If there is a
2557: quantifier, this is where it will be. */
2558:
2559: next_state_offset = (int)(ecode - start_code);
2560:
2561: switch (*ecode)
2562: {
2563: case OP_CRSTAR:
2564: case OP_CRMINSTAR:
2565: ADD_ACTIVE(next_state_offset + 1, 0);
2566: if (isinclass) { ADD_NEW(state_offset, 0); }
2567: break;
2568:
2569: case OP_CRPLUS:
2570: case OP_CRMINPLUS:
2571: count = current_state->count; /* Already matched */
2572: if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2573: if (isinclass) { count++; ADD_NEW(state_offset, count); }
2574: break;
2575:
2576: case OP_CRQUERY:
2577: case OP_CRMINQUERY:
2578: ADD_ACTIVE(next_state_offset + 1, 0);
2579: if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2580: break;
2581:
2582: case OP_CRRANGE:
2583: case OP_CRMINRANGE:
2584: count = current_state->count; /* Already matched */
2585: if (count >= GET2(ecode, 1))
1.1.1.2 misho 2586: { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
1.1 misho 2587: if (isinclass)
2588: {
1.1.1.2 misho 2589: int max = GET2(ecode, 1 + IMM2_SIZE);
1.1 misho 2590: if (++count >= max && max != 0) /* Max 0 => no limit */
1.1.1.2 misho 2591: { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
1.1 misho 2592: else
2593: { ADD_NEW(state_offset, count); }
2594: }
2595: break;
2596:
2597: default:
2598: if (isinclass) { ADD_NEW(next_state_offset, 0); }
2599: break;
2600: }
2601: }
2602: break;
2603:
2604: /* ========================================================================== */
2605: /* These are the opcodes for fancy brackets of various kinds. We have
2606: to use recursion in order to handle them. The "always failing" assertion
2607: (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2608: though the other "backtracking verbs" are not supported. */
2609:
2610: case OP_FAIL:
2611: forced_fail++; /* Count FAILs for multiple states */
2612: break;
2613:
2614: case OP_ASSERT:
2615: case OP_ASSERT_NOT:
2616: case OP_ASSERTBACK:
2617: case OP_ASSERTBACK_NOT:
2618: {
2619: int rc;
2620: int local_offsets[2];
2621: int local_workspace[1000];
1.1.1.2 misho 2622: const pcre_uchar *endasscode = code + GET(code, 1);
1.1 misho 2623:
2624: while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2625:
2626: rc = internal_dfa_exec(
2627: md, /* static match data */
2628: code, /* this subexpression's code */
2629: ptr, /* where we currently are */
2630: (int)(ptr - start_subject), /* start offset */
2631: local_offsets, /* offset vector */
2632: sizeof(local_offsets)/sizeof(int), /* size of same */
2633: local_workspace, /* workspace vector */
2634: sizeof(local_workspace)/sizeof(int), /* size of same */
2635: rlevel); /* function recursion level */
2636:
2637: if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2638: if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2639: { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2640: }
2641: break;
2642:
2643: /*-----------------------------------------------------------------*/
2644: case OP_COND:
2645: case OP_SCOND:
2646: {
2647: int local_offsets[1000];
2648: int local_workspace[1000];
2649: int codelink = GET(code, 1);
2650: int condcode;
2651:
2652: /* Because of the way auto-callout works during compile, a callout item
2653: is inserted between OP_COND and an assertion condition. This does not
2654: happen for the other conditions. */
2655:
2656: if (code[LINK_SIZE+1] == OP_CALLOUT)
2657: {
2658: rrc = 0;
1.1.1.2 misho 2659: if (PUBL(callout) != NULL)
1.1 misho 2660: {
1.1.1.2 misho 2661: PUBL(callout_block) cb;
1.1 misho 2662: cb.version = 1; /* Version 1 of the callout block */
2663: cb.callout_number = code[LINK_SIZE+2];
2664: cb.offset_vector = offsets;
1.1.1.2 misho 2665: #ifdef COMPILE_PCRE8
1.1 misho 2666: cb.subject = (PCRE_SPTR)start_subject;
1.1.1.2 misho 2667: #else
2668: cb.subject = (PCRE_SPTR16)start_subject;
2669: #endif
1.1 misho 2670: cb.subject_length = (int)(end_subject - start_subject);
2671: cb.start_match = (int)(current_subject - start_subject);
2672: cb.current_position = (int)(ptr - start_subject);
2673: cb.pattern_position = GET(code, LINK_SIZE + 3);
2674: cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2675: cb.capture_top = 1;
2676: cb.capture_last = -1;
2677: cb.callout_data = md->callout_data;
2678: cb.mark = NULL; /* No (*MARK) support */
1.1.1.2 misho 2679: if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
1.1 misho 2680: }
2681: if (rrc > 0) break; /* Fail this thread */
1.1.1.2 misho 2682: code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
1.1 misho 2683: }
2684:
2685: condcode = code[LINK_SIZE+1];
2686:
2687: /* Back reference conditions are not supported */
2688:
2689: if (condcode == OP_CREF || condcode == OP_NCREF)
2690: return PCRE_ERROR_DFA_UCOND;
2691:
2692: /* The DEFINE condition is always false */
2693:
2694: if (condcode == OP_DEF)
2695: { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2696:
2697: /* The only supported version of OP_RREF is for the value RREF_ANY,
2698: which means "test if in any recursion". We can't test for specifically
2699: recursed groups. */
2700:
2701: else if (condcode == OP_RREF || condcode == OP_NRREF)
2702: {
1.1.1.2 misho 2703: int value = GET2(code, LINK_SIZE + 2);
1.1 misho 2704: if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2705: if (md->recursive != NULL)
1.1.1.2 misho 2706: { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
1.1 misho 2707: else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2708: }
2709:
2710: /* Otherwise, the condition is an assertion */
2711:
2712: else
2713: {
2714: int rc;
1.1.1.2 misho 2715: const pcre_uchar *asscode = code + LINK_SIZE + 1;
2716: const pcre_uchar *endasscode = asscode + GET(asscode, 1);
1.1 misho 2717:
2718: while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2719:
2720: rc = internal_dfa_exec(
2721: md, /* fixed match data */
2722: asscode, /* this subexpression's code */
2723: ptr, /* where we currently are */
2724: (int)(ptr - start_subject), /* start offset */
2725: local_offsets, /* offset vector */
2726: sizeof(local_offsets)/sizeof(int), /* size of same */
2727: local_workspace, /* workspace vector */
2728: sizeof(local_workspace)/sizeof(int), /* size of same */
2729: rlevel); /* function recursion level */
2730:
2731: if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2732: if ((rc >= 0) ==
2733: (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2734: { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2735: else
2736: { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2737: }
2738: }
2739: break;
2740:
2741: /*-----------------------------------------------------------------*/
2742: case OP_RECURSE:
2743: {
2744: dfa_recursion_info *ri;
2745: int local_offsets[1000];
2746: int local_workspace[1000];
1.1.1.2 misho 2747: const pcre_uchar *callpat = start_code + GET(code, 1);
1.1 misho 2748: int recno = (callpat == md->start_code)? 0 :
2749: GET2(callpat, 1 + LINK_SIZE);
2750: int rc;
2751:
2752: DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2753:
2754: /* Check for repeating a recursion without advancing the subject
2755: pointer. This should catch convoluted mutual recursions. (Some simple
2756: cases are caught at compile time.) */
2757:
2758: for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2759: if (recno == ri->group_num && ptr == ri->subject_position)
2760: return PCRE_ERROR_RECURSELOOP;
2761:
2762: /* Remember this recursion and where we started it so as to
2763: catch infinite loops. */
2764:
2765: new_recursive.group_num = recno;
2766: new_recursive.subject_position = ptr;
2767: new_recursive.prevrec = md->recursive;
2768: md->recursive = &new_recursive;
2769:
2770: rc = internal_dfa_exec(
2771: md, /* fixed match data */
2772: callpat, /* this subexpression's code */
2773: ptr, /* where we currently are */
2774: (int)(ptr - start_subject), /* start offset */
2775: local_offsets, /* offset vector */
2776: sizeof(local_offsets)/sizeof(int), /* size of same */
2777: local_workspace, /* workspace vector */
2778: sizeof(local_workspace)/sizeof(int), /* size of same */
2779: rlevel); /* function recursion level */
2780:
2781: md->recursive = new_recursive.prevrec; /* Done this recursion */
2782:
2783: DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2784: rc));
2785:
2786: /* Ran out of internal offsets */
2787:
2788: if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2789:
2790: /* For each successful matched substring, set up the next state with a
2791: count of characters to skip before trying it. Note that the count is in
2792: characters, not bytes. */
2793:
2794: if (rc > 0)
2795: {
2796: for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2797: {
2798: int charcount = local_offsets[rc+1] - local_offsets[rc];
1.1.1.2 misho 2799: #ifdef SUPPORT_UTF
1.1.1.3 ! misho 2800: if (utf)
! 2801: {
! 2802: const pcre_uchar *p = start_subject + local_offsets[rc];
! 2803: const pcre_uchar *pp = start_subject + local_offsets[rc+1];
! 2804: while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
! 2805: }
1.1.1.2 misho 2806: #endif
1.1 misho 2807: if (charcount > 0)
2808: {
2809: ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2810: }
2811: else
2812: {
2813: ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2814: }
2815: }
2816: }
2817: else if (rc != PCRE_ERROR_NOMATCH) return rc;
2818: }
2819: break;
2820:
2821: /*-----------------------------------------------------------------*/
2822: case OP_BRAPOS:
2823: case OP_SBRAPOS:
2824: case OP_CBRAPOS:
2825: case OP_SCBRAPOS:
2826: case OP_BRAPOSZERO:
2827: {
2828: int charcount, matched_count;
1.1.1.2 misho 2829: const pcre_uchar *local_ptr = ptr;
1.1 misho 2830: BOOL allow_zero;
2831:
2832: if (codevalue == OP_BRAPOSZERO)
2833: {
2834: allow_zero = TRUE;
2835: codevalue = *(++code); /* Codevalue will be one of above BRAs */
2836: }
2837: else allow_zero = FALSE;
2838:
2839: /* Loop to match the subpattern as many times as possible as if it were
2840: a complete pattern. */
2841:
2842: for (matched_count = 0;; matched_count++)
2843: {
2844: int local_offsets[2];
2845: int local_workspace[1000];
2846:
2847: int rc = internal_dfa_exec(
2848: md, /* fixed match data */
2849: code, /* this subexpression's code */
2850: local_ptr, /* where we currently are */
2851: (int)(ptr - start_subject), /* start offset */
2852: local_offsets, /* offset vector */
2853: sizeof(local_offsets)/sizeof(int), /* size of same */
2854: local_workspace, /* workspace vector */
2855: sizeof(local_workspace)/sizeof(int), /* size of same */
2856: rlevel); /* function recursion level */
2857:
2858: /* Failed to match */
2859:
2860: if (rc < 0)
2861: {
2862: if (rc != PCRE_ERROR_NOMATCH) return rc;
2863: break;
2864: }
2865:
2866: /* Matched: break the loop if zero characters matched. */
2867:
2868: charcount = local_offsets[1] - local_offsets[0];
2869: if (charcount == 0) break;
2870: local_ptr += charcount; /* Advance temporary position ptr */
2871: }
2872:
2873: /* At this point we have matched the subpattern matched_count
2874: times, and local_ptr is pointing to the character after the end of the
2875: last match. */
2876:
2877: if (matched_count > 0 || allow_zero)
2878: {
1.1.1.2 misho 2879: const pcre_uchar *end_subpattern = code;
1.1 misho 2880: int next_state_offset;
2881:
2882: do { end_subpattern += GET(end_subpattern, 1); }
2883: while (*end_subpattern == OP_ALT);
2884: next_state_offset =
2885: (int)(end_subpattern - start_code + LINK_SIZE + 1);
2886:
2887: /* Optimization: if there are no more active states, and there
2888: are no new states yet set up, then skip over the subject string
2889: right here, to save looping. Otherwise, set up the new state to swing
2890: into action when the end of the matched substring is reached. */
2891:
2892: if (i + 1 >= active_count && new_count == 0)
2893: {
2894: ptr = local_ptr;
2895: clen = 0;
2896: ADD_NEW(next_state_offset, 0);
2897: }
2898: else
2899: {
1.1.1.2 misho 2900: const pcre_uchar *p = ptr;
2901: const pcre_uchar *pp = local_ptr;
1.1 misho 2902: charcount = (int)(pp - p);
1.1.1.2 misho 2903: #ifdef SUPPORT_UTF
1.1.1.3 ! misho 2904: if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
1.1.1.2 misho 2905: #endif
1.1 misho 2906: ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2907: }
2908: }
2909: }
2910: break;
2911:
2912: /*-----------------------------------------------------------------*/
2913: case OP_ONCE:
2914: case OP_ONCE_NC:
2915: {
2916: int local_offsets[2];
2917: int local_workspace[1000];
2918:
2919: int rc = internal_dfa_exec(
2920: md, /* fixed match data */
2921: code, /* this subexpression's code */
2922: ptr, /* where we currently are */
2923: (int)(ptr - start_subject), /* start offset */
2924: local_offsets, /* offset vector */
2925: sizeof(local_offsets)/sizeof(int), /* size of same */
2926: local_workspace, /* workspace vector */
2927: sizeof(local_workspace)/sizeof(int), /* size of same */
2928: rlevel); /* function recursion level */
2929:
2930: if (rc >= 0)
2931: {
1.1.1.2 misho 2932: const pcre_uchar *end_subpattern = code;
1.1 misho 2933: int charcount = local_offsets[1] - local_offsets[0];
2934: int next_state_offset, repeat_state_offset;
2935:
2936: do { end_subpattern += GET(end_subpattern, 1); }
2937: while (*end_subpattern == OP_ALT);
2938: next_state_offset =
2939: (int)(end_subpattern - start_code + LINK_SIZE + 1);
2940:
2941: /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2942: arrange for the repeat state also to be added to the relevant list.
2943: Calculate the offset, or set -1 for no repeat. */
2944:
2945: repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2946: *end_subpattern == OP_KETRMIN)?
2947: (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2948:
2949: /* If we have matched an empty string, add the next state at the
2950: current character pointer. This is important so that the duplicate
2951: checking kicks in, which is what breaks infinite loops that match an
2952: empty string. */
2953:
2954: if (charcount == 0)
2955: {
2956: ADD_ACTIVE(next_state_offset, 0);
2957: }
2958:
2959: /* Optimization: if there are no more active states, and there
2960: are no new states yet set up, then skip over the subject string
2961: right here, to save looping. Otherwise, set up the new state to swing
2962: into action when the end of the matched substring is reached. */
2963:
2964: else if (i + 1 >= active_count && new_count == 0)
2965: {
2966: ptr += charcount;
2967: clen = 0;
2968: ADD_NEW(next_state_offset, 0);
2969:
2970: /* If we are adding a repeat state at the new character position,
2971: we must fudge things so that it is the only current state.
2972: Otherwise, it might be a duplicate of one we processed before, and
2973: that would cause it to be skipped. */
2974:
2975: if (repeat_state_offset >= 0)
2976: {
2977: next_active_state = active_states;
2978: active_count = 0;
2979: i = -1;
2980: ADD_ACTIVE(repeat_state_offset, 0);
2981: }
2982: }
2983: else
2984: {
1.1.1.2 misho 2985: #ifdef SUPPORT_UTF
1.1.1.3 ! misho 2986: if (utf)
! 2987: {
! 2988: const pcre_uchar *p = start_subject + local_offsets[0];
! 2989: const pcre_uchar *pp = start_subject + local_offsets[1];
! 2990: while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
! 2991: }
1.1.1.2 misho 2992: #endif
1.1 misho 2993: ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2994: if (repeat_state_offset >= 0)
2995: { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2996: }
2997: }
2998: else if (rc != PCRE_ERROR_NOMATCH) return rc;
2999: }
3000: break;
3001:
3002:
3003: /* ========================================================================== */
3004: /* Handle callouts */
3005:
3006: case OP_CALLOUT:
3007: rrc = 0;
1.1.1.2 misho 3008: if (PUBL(callout) != NULL)
1.1 misho 3009: {
1.1.1.2 misho 3010: PUBL(callout_block) cb;
1.1 misho 3011: cb.version = 1; /* Version 1 of the callout block */
3012: cb.callout_number = code[1];
3013: cb.offset_vector = offsets;
1.1.1.2 misho 3014: #ifdef COMPILE_PCRE8
1.1 misho 3015: cb.subject = (PCRE_SPTR)start_subject;
1.1.1.2 misho 3016: #else
3017: cb.subject = (PCRE_SPTR16)start_subject;
3018: #endif
1.1 misho 3019: cb.subject_length = (int)(end_subject - start_subject);
3020: cb.start_match = (int)(current_subject - start_subject);
3021: cb.current_position = (int)(ptr - start_subject);
3022: cb.pattern_position = GET(code, 2);
3023: cb.next_item_length = GET(code, 2 + LINK_SIZE);
3024: cb.capture_top = 1;
3025: cb.capture_last = -1;
3026: cb.callout_data = md->callout_data;
3027: cb.mark = NULL; /* No (*MARK) support */
1.1.1.2 misho 3028: if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
1.1 misho 3029: }
3030: if (rrc == 0)
1.1.1.2 misho 3031: { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
1.1 misho 3032: break;
3033:
3034:
3035: /* ========================================================================== */
3036: default: /* Unsupported opcode */
3037: return PCRE_ERROR_DFA_UITEM;
3038: }
3039:
3040: NEXT_ACTIVE_STATE: continue;
3041:
3042: } /* End of loop scanning active states */
3043:
3044: /* We have finished the processing at the current subject character. If no
3045: new states have been set for the next character, we have found all the
3046: matches that we are going to find. If we are at the top level and partial
3047: matching has been requested, check for appropriate conditions.
3048:
3049: The "forced_fail" variable counts the number of (*F) encountered for the
3050: character. If it is equal to the original active_count (saved in
3051: workspace[1]) it means that (*F) was found on every active state. In this
3052: case we don't want to give a partial match.
3053:
3054: The "could_continue" variable is true if a state could have continued but
3055: for the fact that the end of the subject was reached. */
3056:
3057: if (new_count <= 0)
3058: {
3059: if (rlevel == 1 && /* Top level, and */
1.1.1.3 ! misho 3060: could_continue && /* Some could go on, and */
1.1 misho 3061: forced_fail != workspace[1] && /* Not all forced fail & */
3062: ( /* either... */
3063: (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
3064: || /* or... */
3065: ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
3066: match_count < 0) /* no matches */
3067: ) && /* And... */
1.1.1.3 ! misho 3068: (
! 3069: partial_newline || /* Either partial NL */
! 3070: ( /* or ... */
! 3071: ptr >= end_subject && /* End of subject and */
! 3072: ptr > md->start_used_ptr) /* Inspected non-empty string */
! 3073: )
! 3074: )
1.1 misho 3075: {
3076: if (offsetcount >= 2)
3077: {
3078: offsets[0] = (int)(md->start_used_ptr - start_subject);
3079: offsets[1] = (int)(end_subject - start_subject);
3080: }
3081: match_count = PCRE_ERROR_PARTIAL;
3082: }
3083:
3084: DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3085: "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
3086: rlevel*2-2, SP));
3087: break; /* In effect, "return", but see the comment below */
3088: }
3089:
3090: /* One or more states are active for the next character. */
3091:
3092: ptr += clen; /* Advance to next subject character */
3093: } /* Loop to move along the subject string */
3094:
3095: /* Control gets here from "break" a few lines above. We do it this way because
3096: if we use "return" above, we have compiler trouble. Some compilers warn if
3097: there's nothing here because they think the function doesn't return a value. On
3098: the other hand, if we put a dummy statement here, some more clever compilers
3099: complain that it can't be reached. Sigh. */
3100:
3101: return match_count;
3102: }
3103:
3104:
3105:
3106:
3107: /*************************************************
3108: * Execute a Regular Expression - DFA engine *
3109: *************************************************/
3110:
3111: /* This external function applies a compiled re to a subject string using a DFA
3112: engine. This function calls the internal function multiple times if the pattern
3113: is not anchored.
3114:
3115: Arguments:
3116: argument_re points to the compiled expression
3117: extra_data points to extra data or is NULL
3118: subject points to the subject string
3119: length length of subject string (may contain binary zeros)
3120: start_offset where to start in the subject string
3121: options option bits
3122: offsets vector of match offsets
3123: offsetcount size of same
3124: workspace workspace vector
3125: wscount size of same
3126:
3127: Returns: > 0 => number of match offset pairs placed in offsets
3128: = 0 => offsets overflowed; longest matches are present
3129: -1 => failed to match
3130: < -1 => some kind of unexpected problem
3131: */
3132:
1.1.1.2 misho 3133: #ifdef COMPILE_PCRE8
1.1 misho 3134: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3135: pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3136: const char *subject, int length, int start_offset, int options, int *offsets,
3137: int offsetcount, int *workspace, int wscount)
1.1.1.2 misho 3138: #else
3139: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3140: pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3141: PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3142: int offsetcount, int *workspace, int wscount)
3143: #endif
1.1 misho 3144: {
/* Overview of the body:
   (1) reject bad arguments, a wrong magic number or mode, and an implausible
       restart workspace;
   (2) copy study/callout/table data and the fixed match parameters into the
       local dfa_match_data block;
   (3) work out the newline convention and, in UTF mode, validate the subject;
   (4) collect the first-character / required-character / start-bitmap hints;
   (5) run the "bumpalong" loop, calling internal_dfa_exec() at successive
       start positions until it returns something other than
       PCRE_ERROR_NOMATCH, the match is anchored, or the subject is exhausted.
*/
1.1.1.2 misho 3145: REAL_PCRE *re = (REAL_PCRE *)argument_re;
1.1 misho 3146: dfa_match_data match_block;
3147: dfa_match_data *md = &match_block;
1.1.1.2 misho 3148: BOOL utf, anchored, startline, firstline;
3149: const pcre_uchar *current_subject, *end_subject;
1.1 misho 3150: const pcre_study_data *study = NULL;
3151:
/* Start-of-match optimization hints. The *2 variants hold the other-case form
of the character when caseless matching applies; otherwise they equal the
primary value, which is how the loops below choose the one-char or two-char
scan. */
1.1.1.2 misho 3152: const pcre_uchar *req_char_ptr;
3153: const pcre_uint8 *start_bits = NULL;
3154: BOOL has_first_char = FALSE;
3155: BOOL has_req_char = FALSE;
3156: pcre_uchar first_char = 0;
3157: pcre_uchar first_char2 = 0;
3158: pcre_uchar req_char = 0;
3159: pcre_uchar req_char2 = 0;
1.1 misho 3160: int newline;
3161:
3162: /* Plausibility checks */
3163:
/* Note that a negative length is not tested separately: because start_offset
must be >= 0, a negative length always fails the "start_offset > length" test
and is reported as PCRE_ERROR_BADOFFSET. */
3164: if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3165: if (re == NULL || subject == NULL || workspace == NULL ||
3166: (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3167: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3168: if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3169: if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3170:
1.1.1.3 ! misho 3171: /* Check that the first field in the block is the magic number. If it is not,
! 3172: return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
! 3173: REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
! 3174: means that the pattern is likely compiled with different endianness. */
! 3175:
! 3176: if (re->magic_number != MAGIC_NUMBER)
! 3177: return re->magic_number == REVERSED_MAGIC_NUMBER?
! 3178: PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
/* The pattern must carry the PCRE_MODE flag bit, otherwise PCRE_ERROR_BADMODE.
NOTE(review): PCRE_MODE presumably identifies the 8-bit vs 16-bit library the
pattern was compiled with - confirm against pcre_internal.h. */
! 3179: if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
! 3180:
! 3181: /* If restarting after a partial match, do some sanity checks on the contents
! 3182: of the workspace. */
! 3183:
/* "& (-2)" masks off the lowest bit, so workspace[0] must be 0 or 1.
workspace[1] must be at least 1 and no larger than the number of state blocks
that fit in the workspace after its first two ints. */
! 3184: if ((options & PCRE_DFA_RESTART) != 0)
! 3185: {
! 3186: if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
! 3187: workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
! 3188: return PCRE_ERROR_DFA_BADRESTART;
! 3189: }
! 3190:
! 3191: /* Set up study, callout, and table data */
1.1 misho 3192:
3193: md->tables = re->tables;
3194: md->callout_data = NULL;
3195:
/* Values supplied in extra_data override the defaults set just above. Match
limits are not supported here: either limit flag makes the call fail with
PCRE_ERROR_DFA_UMLIMIT. */
3196: if (extra_data != NULL)
3197: {
3198: unsigned int flags = extra_data->flags;
3199: if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3200: study = (const pcre_study_data *)extra_data->study_data;
3201: if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3202: if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3203: return PCRE_ERROR_DFA_UMLIMIT;
3204: if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3205: md->callout_data = extra_data->callout_data;
3206: if ((flags & PCRE_EXTRA_TABLES) != 0)
3207: md->tables = extra_data->tables;
3208: }
3209:
3210: /* Set some local values */
3211:
/* req_char_ptr starts one position before the first start point so that the
"p > req_char_ptr" test in the bumpalong loop succeeds the first time round. */
1.1.1.2 misho 3212: current_subject = (const pcre_uchar *)subject + start_offset;
3213: end_subject = (const pcre_uchar *)subject + length;
3214: req_char_ptr = current_subject - 1;
3215:
3216: #ifdef SUPPORT_UTF
3217: /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3218: utf = (re->options & PCRE_UTF8) != 0;
1.1 misho 3219: #else
1.1.1.2 misho 3220: utf = FALSE;
1.1 misho 3221: #endif
3222:
/* Anchoring can come from the match-time options or the compiled pattern.
PCRE_DFA_RESTART also counts as anchored, so a restarted match is attempted at
exactly one position (see the "rc != PCRE_ERROR_NOMATCH || anchored" test
after the internal_dfa_exec() call). */
3223: anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3224: (re->options & PCRE_ANCHORED) != 0;
3225:
3226: /* The remaining fixed data for passing around. */
3227:
/* start_code is computed as the address just past the name table:
name_table_offset + name_count * name_entry_size units from the start of the
compiled block. */
1.1.1.2 misho 3228: md->start_code = (const pcre_uchar *)argument_re +
1.1 misho 3229: re->name_table_offset + re->name_count * re->name_entry_size;
1.1.1.2 misho 3230: md->start_subject = (const pcre_uchar *)subject;
1.1 misho 3231: md->end_subject = end_subject;
3232: md->start_offset = start_offset;
3233: md->moptions = options;
3234: md->poptions = re->options;
3235:
3236: /* If the BSR option is not set at match time, copy what was set
3237: at compile time. */
3238:
3239: if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3240: {
3241: if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3242: md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3243: #ifdef BSR_ANYCRLF
3244: else md->moptions |= PCRE_BSR_ANYCRLF;
3245: #endif
3246: }
3247:
3248: /* Handle different types of newline. The three bits give eight cases. If
3249: nothing is set at run time, whatever was used at compile time applies. */
3250:
/* The local "newline" value is either a single character, a CR/LF pair packed
as (CHAR_CR << 8) | CHAR_NL, or a negative marker: -1 for "any", -2 for
"anycrlf". It is decoded into md->nltype / md->nllen / md->nl[] just below. */
3251: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3252: PCRE_NEWLINE_BITS)
3253: {
3254: case 0: newline = NEWLINE; break; /* Compile-time default */
3255: case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3256: case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3257: case PCRE_NEWLINE_CR+
3258: PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3259: case PCRE_NEWLINE_ANY: newline = -1; break;
3260: case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3261: default: return PCRE_ERROR_BADNEWLINE;
3262: }
3263:
3264: if (newline == -2)
3265: {
3266: md->nltype = NLTYPE_ANYCRLF;
3267: }
3268: else if (newline < 0)
3269: {
3270: md->nltype = NLTYPE_ANY;
3271: }
3272: else
3273: {
3274: md->nltype = NLTYPE_FIXED;
/* A value above 255 is a packed two-character newline; split it into nl[0]
(high byte) and nl[1] (low byte). */
3275: if (newline > 255)
3276: {
3277: md->nllen = 2;
3278: md->nl[0] = (newline >> 8) & 255;
3279: md->nl[1] = newline & 255;
3280: }
3281: else
3282: {
3283: md->nllen = 1;
3284: md->nl[0] = newline;
3285: }
3286: }
3287:
3288: /* Check a UTF string if required. On failure the error offset and the reason
3289: code are passed back in offsets[0] and offsets[1] when offsetcount >= 2. */
3290:
1.1.1.2 misho 3291: #ifdef SUPPORT_UTF
3292: if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
1.1 misho 3293: {
3294: int erroroffset;
1.1.1.2 misho 3295: int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
1.1 misho 3296: if (errorcode != 0)
3297: {
3298: if (offsetcount >= 2)
3299: {
3300: offsets[0] = erroroffset;
3301: offsets[1] = errorcode;
3302: }
/* With PCRE_PARTIAL_HARD, error codes up to PCRE_UTF8_ERR5 are reported as
PCRE_ERROR_SHORTUTF8 instead of PCRE_ERROR_BADUTF8. NOTE(review): those codes
are presumably the "subject ends in a truncated character" cases - confirm
against pcre.h. */
3303: return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3304: PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3305: }
/* The subject is valid; also require that a start_offset strictly inside the
subject does not land on a code unit for which NOT_FIRSTCHAR() is true. */
3306: if (start_offset > 0 && start_offset < length &&
1.1.1.2 misho 3307: NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
1.1 misho 3308: return PCRE_ERROR_BADUTF8_OFFSET;
3309: }
3310: #endif
3311:
3312: /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3313: is a feature that makes it possible to save compiled regex and re-use them
3314: in other programs later. */
3315:
1.1.1.2 misho 3316: if (md->tables == NULL) md->tables = PRIV(default_tables);
1.1 misho 3317:
1.1.1.2 misho 3318: /* The "must be at the start of a line" flags are used in a loop when finding
3319: where to start. */
1.1 misho 3320:
3321: startline = (re->flags & PCRE_STARTLINE) != 0;
3322: firstline = (re->options & PCRE_FIRSTLINE) != 0;
3323:
3324: /* Set up the first character to match, if available. The first_char value is
3325: never set for an anchored regular expression, but the anchoring may be forced
3326: at run time, so we have to test for anchoring. The first char may be unset for
3327: an unanchored pattern, of course. If there's no first char and the pattern was
3328: studied, there may be a bitmap of possible first characters. */
3329:
3330: if (!anchored)
3331: {
3332: if ((re->flags & PCRE_FIRSTSET) != 0)
3333: {
1.1.1.2 misho 3334: has_first_char = TRUE;
3335: first_char = first_char2 = (pcre_uchar)(re->first_char);
3336: if ((re->flags & PCRE_FCH_CASELESS) != 0)
3337: {
/* Caseless: look up the other case in the fcc table; in a non-8-bit UTF build
with UCP support, characters above 127 use UCD_OTHERCASE() instead. */
3338: first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3339: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3340: if (utf && first_char > 127)
3341: first_char2 = UCD_OTHERCASE(first_char);
3342: #endif
3343: }
1.1 misho 3344: }
3345: else
3346: {
3347: if (!startline && study != NULL &&
3348: (study->flags & PCRE_STUDY_MAPPED) != 0)
3349: start_bits = study->start_bits;
3350: }
3351: }
3352:
3353: /* For anchored or unanchored matches, there may be a "last known required
3354: character" set. */
3355:
3356: if ((re->flags & PCRE_REQCHSET) != 0)
3357: {
1.1.1.2 misho 3358: has_req_char = TRUE;
3359: req_char = req_char2 = (pcre_uchar)(re->req_char);
3360: if ((re->flags & PCRE_RCH_CASELESS) != 0)
3361: {
3362: req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3363: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3364: if (utf && req_char > 127)
3365: req_char2 = UCD_OTHERCASE(req_char);
3366: #endif
3367: }
1.1 misho 3368: }
3369:
3370: /* Call the main matching function, looping for a non-anchored regex after a
3371: failed match. If not restarting, perform certain optimizations at the start of
3372: a match. */
3373:
3374: for (;;)
3375: {
3376: int rc;
3377:
3378: if ((options & PCRE_DFA_RESTART) == 0)
3379: {
1.1.1.2 misho 3380: const pcre_uchar *save_end_subject = end_subject;
1.1 misho 3381:
3382: /* If firstline is TRUE, the start of the match is constrained to the first
3383: line of a multiline string. Implement this by temporarily adjusting
3384: end_subject so that we stop scanning at a newline. If the match fails at
3385: the newline, later code breaks this loop. */
3386:
3387: if (firstline)
3388: {
1.1.1.2 misho 3389: PCRE_PUCHAR t = current_subject;
3390: #ifdef SUPPORT_UTF
3391: if (utf)
1.1 misho 3392: {
3393: while (t < md->end_subject && !IS_NEWLINE(t))
3394: {
3395: t++;
1.1.1.2 misho 3396: ACROSSCHAR(t < end_subject, *t, t++);
1.1 misho 3397: }
3398: }
3399: else
3400: #endif
3401: while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3402: end_subject = t;
3403: }
3404:
3405: /* There are some optimizations that avoid running the match if a known
3406: starting point is not found. However, there is an option that disables
3407: these, for testing and for ensuring that all callouts do actually occur.
3408: The option can be set in the regex by (*NO_START_OPT) or passed in
3409: match-time options. */
3410:
3411: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3412: {
1.1.1.2 misho 3413: /* Advance to a known first char. */
1.1 misho 3414:
1.1.1.2 misho 3415: if (has_first_char)
1.1 misho 3416: {
/* Two scan loops: one accepting either case form, one for a single value. */
1.1.1.2 misho 3417: if (first_char != first_char2)
1.1 misho 3418: while (current_subject < end_subject &&
1.1.1.2 misho 3419: *current_subject != first_char && *current_subject != first_char2)
1.1 misho 3420: current_subject++;
3421: else
3422: while (current_subject < end_subject &&
1.1.1.2 misho 3423: *current_subject != first_char)
1.1 misho 3424: current_subject++;
3425: }
3426:
3427: /* Or to just after a linebreak for a multiline match if possible */
3428:
3429: else if (startline)
3430: {
/* Only skip forward once we have moved past the original start position; the
very first attempt is always made at start_offset itself. This guard also
makes the current_subject[-1] access below safe. */
3431: if (current_subject > md->start_subject + start_offset)
3432: {
1.1.1.2 misho 3433: #ifdef SUPPORT_UTF
3434: if (utf)
1.1 misho 3435: {
3436: while (current_subject < end_subject &&
3437: !WAS_NEWLINE(current_subject))
3438: {
3439: current_subject++;
1.1.1.2 misho 3440: ACROSSCHAR(current_subject < end_subject, *current_subject,
3441: current_subject++);
1.1 misho 3442: }
3443: }
3444: else
3445: #endif
3446: while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3447: current_subject++;
3448:
3449: /* If we have just passed a CR and the newline option is ANY or
3450: ANYCRLF, and we are now at a LF, advance the match position by one
3451: more character. */
3452:
3453: if (current_subject[-1] == CHAR_CR &&
3454: (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3455: current_subject < end_subject &&
3456: *current_subject == CHAR_NL)
3457: current_subject++;
3458: }
3459: }
3460:
3461: /* Or to a non-unique first char after study */
3462:
3463: else if (start_bits != NULL)
3464: {
3465: while (current_subject < end_subject)
3466: {
3467: register unsigned int c = *current_subject;
/* Outside the 8-bit library, clamp wide code units to 255 before indexing the
bitmap (presumably a 256-bit map - confirm against pcre_study_data). */
1.1.1.2 misho 3468: #ifndef COMPILE_PCRE8
3469: if (c > 255) c = 255;
3470: #endif
1.1 misho 3471: if ((start_bits[c/8] & (1 << (c&7))) == 0)
3472: {
3473: current_subject++;
1.1.1.2 misho 3474: #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3475: /* In non 8-bit mode, the iteration will stop for
3476: characters > 255 at the beginning or not stop at all. */
3477: if (utf)
3478: ACROSSCHAR(current_subject < end_subject, *current_subject,
3479: current_subject++);
1.1 misho 3480: #endif
3481: }
3482: else break;
3483: }
3484: }
3485: }
3486:
3487: /* Restore fudged end_subject */
3488:
3489: end_subject = save_end_subject;
3490:
3491: /* The following two optimizations are disabled for partial matching or if
3492: disabling is explicitly requested (and of course, by the test above, this
3493: code is not obeyed when restarting after a partial match). */
3494:
3495: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3496: (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3497: {
3498: /* If the pattern was studied, a minimum subject length may be set. This
3499: is a lower bound; no actual string of that length may actually match the
3500: pattern. Although the value is, strictly, in characters, we treat it as
3501: bytes to avoid spending too much time in this optimization. */
3502:
3503: if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3504: (pcre_uint32)(end_subject - current_subject) < study->minlength)
3505: return PCRE_ERROR_NOMATCH;
3506:
1.1.1.2 misho 3507: /* If req_char is set, we know that that character must appear in the
3508: subject for the match to succeed. If the first character is set, req_char
1.1 misho 3509: must be later in the subject; otherwise the test starts at the match
3510: point. This optimization can save a huge amount of work in patterns with
3511: nested unlimited repeats that aren't going to match. Writing separate
3512: code for cased/caseless versions makes it go faster, as does using an
3513: autoincrement and backing off on a match.
3514:
3515: HOWEVER: when the subject string is very, very long, searching to its end
3516: can take a long time, and give bad performance on quite ordinary
3517: patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3518: string... so we don't do this when the string is sufficiently long. */
3519:
1.1.1.2 misho 3520: if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
1.1 misho 3521: {
1.1.1.2 misho 3522: register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
1.1 misho 3523:
3524: /* We don't need to repeat the search if we haven't yet reached the
3525: place we found it at last time. */
3526:
1.1.1.2 misho 3527: if (p > req_char_ptr)
1.1 misho 3528: {
1.1.1.2 misho 3529: if (req_char != req_char2)
1.1 misho 3530: {
3531: while (p < end_subject)
3532: {
3533: register int pp = *p++;
1.1.1.2 misho 3534: if (pp == req_char || pp == req_char2) { p--; break; }
1.1 misho 3535: }
3536: }
3537: else
3538: {
3539: while (p < end_subject)
3540: {
1.1.1.2 misho 3541: if (*p++ == req_char) { p--; break; }
1.1 misho 3542: }
3543: }
3544:
3545: /* If we can't find the required character, break the matching loop,
3546: which will cause a return of PCRE_ERROR_NOMATCH. */
3547:
3548: if (p >= end_subject) break;
3549:
3550: /* If we have found the required character, save the point where we
3551: found it, so that we don't search again next time round the loop if
3552: the start hasn't passed this character yet. */
3553:
1.1.1.2 misho 3554: req_char_ptr = p;
1.1 misho 3555: }
3556: }
3557: }
3558: } /* End of optimizations that are done when not restarting */
3559:
3560: /* OK, now we can do the business */
3561:
/* start_used_ptr and recursive are reset in the match block before every
attempt; the caller's offsets and workspace vectors are passed straight
through to the internal matcher. */
3562: md->start_used_ptr = current_subject;
3563: md->recursive = NULL;
3564:
3565: rc = internal_dfa_exec(
3566: md, /* fixed match data */
3567: md->start_code, /* this subexpression's code */
3568: current_subject, /* where we currently are */
3569: start_offset, /* start offset in subject */
3570: offsets, /* offset vector */
3571: offsetcount, /* size of same */
3572: workspace, /* workspace vector */
3573: wscount, /* size of same */
3574: 0); /* function recurse level */
3575:
3576: /* Anything other than "no match" means we are done, always; otherwise, carry
3577: on only if not anchored. */
3578:
3579: if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3580:
3581: /* Advance to the next subject character unless we are at the end of a line
3582: and firstline is set. */
3583:
3584: if (firstline && IS_NEWLINE(current_subject)) break;
3585: current_subject++;
1.1.1.2 misho 3586: #ifdef SUPPORT_UTF
3587: if (utf)
1.1 misho 3588: {
1.1.1.2 misho 3589: ACROSSCHAR(current_subject < end_subject, *current_subject,
3590: current_subject++);
1.1 misho 3591: }
1.1.1.2 misho 3592: #endif
/* Note ">" rather than ">=": one more attempt is made with current_subject
equal to end_subject, i.e. on the empty string at the very end. */
1.1 misho 3593: if (current_subject > end_subject) break;
3594:
3595: /* If we have just passed a CR and we are now at a LF, and the pattern does
3596: not contain any explicit matches for \r or \n, and the newline option is CRLF
3597: or ANY or ANYCRLF, advance the match position by one more character. */
3598:
3599: if (current_subject[-1] == CHAR_CR &&
3600: current_subject < end_subject &&
3601: *current_subject == CHAR_NL &&
3602: (re->flags & PCRE_HASCRORLF) == 0 &&
3603: (md->nltype == NLTYPE_ANY ||
3604: md->nltype == NLTYPE_ANYCRLF ||
3605: md->nllen == 2))
3606: current_subject++;
3607:
3608: } /* "Bumpalong" loop */
3609:
/* Reached only via a "break" out of the bumpalong loop: required character
not found, firstline stopped at a newline, or the subject was exhausted. */
3610: return PCRE_ERROR_NOMATCH;
3611: }
3612:
3613: /* End of pcre_dfa_exec.c */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>