Annotation of embedaddon/pcre/pcre_exec.c, revision 1.1.1.1
1.1 misho 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /* PCRE is a library of functions to support regular expressions whose syntax
6: and semantics are as close as possible to those of the Perl 5 language.
7:
8: Written by Philip Hazel
9: Copyright (c) 1997-2011 University of Cambridge
10:
11: -----------------------------------------------------------------------------
12: Redistribution and use in source and binary forms, with or without
13: modification, are permitted provided that the following conditions are met:
14:
15: * Redistributions of source code must retain the above copyright notice,
16: this list of conditions and the following disclaimer.
17:
18: * Redistributions in binary form must reproduce the above copyright
19: notice, this list of conditions and the following disclaimer in the
20: documentation and/or other materials provided with the distribution.
21:
22: * Neither the name of the University of Cambridge nor the names of its
23: contributors may be used to endorse or promote products derived from
24: this software without specific prior written permission.
25:
26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36: POSSIBILITY OF SUCH DAMAGE.
37: -----------------------------------------------------------------------------
38: */
39:
40:
41: /* This module contains pcre_exec(), the externally visible function that does
42: pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43: possible. There are also some static supporting functions. */
44:
45: #ifdef HAVE_CONFIG_H
46: #include "config.h"
47: #endif
48:
49: #define NLBLOCK md /* Block containing newline information */
50: #define PSSTART start_subject /* Field containing processed string start */
51: #define PSEND end_subject /* Field containing processed string end */
52:
53: #include "pcre_internal.h"
54:
55: /* Undefine some potentially clashing cpp symbols */
56:
57: #undef min
58: #undef max
59:
60: /* Values for setting in md->match_function_type to indicate two special types
61: of call to match(). We do it this way to save on using another stack variable,
62: as stack usage is to be discouraged. */
63:
64: #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65: #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66:
67: /* Non-error returns from the match() function. Error returns are externally
68: defined PCRE_ERROR_xxx codes, which are all negative. */
69:
70: #define MATCH_MATCH 1
71: #define MATCH_NOMATCH 0
72:
73: /* Special internal returns from the match() function. Make them sufficiently
74: negative to avoid the external error codes. */
75:
76: #define MATCH_ACCEPT (-999)
77: #define MATCH_COMMIT (-998)
78: #define MATCH_KETRPOS (-997)
79: #define MATCH_ONCE (-996)
80: #define MATCH_PRUNE (-995)
81: #define MATCH_SKIP (-994)
82: #define MATCH_SKIP_ARG (-993)
83: #define MATCH_THEN (-992)
84:
85: /* Maximum number of ints of offset to save on the stack for recursive calls.
86: If the offset vector is bigger, malloc is used. This should be a multiple of 3,
87: because the offset vector is always a multiple of 3 long. */
88:
89: #define REC_STACK_SAVE_MAX 30
90:
91: /* Min and max values for the common repeats; for the maxima, 0 => infinity */
92:
93: static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
94: static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
95:
96:
97:
98: #ifdef PCRE_DEBUG
99: /*************************************************
100: * Debugging function to print chars *
101: *************************************************/
102:
103: /* Print a sequence of chars in printable format, stopping at the end of the
104: subject if requested.
105:
106: Arguments:
107: p points to characters
108: length number to print
109: is_subject TRUE if printing from within md->start_subject
110: md pointer to matching data block, if is_subject is TRUE
111:
112: Returns: nothing
113: */
114:
115: static void
116: pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
117: {
118: int remaining = length;  /* chars still to print; clamped below when printing subject text */
119: if (is_subject && md->end_subject - p < remaining) remaining = md->end_subject - p;
120: for (; remaining > 0; remaining--, p++)  /* a non-positive count prints nothing */
121: { unsigned int ch = *p; if (isprint(ch)) printf("%c", ch); else printf("\\x%02x", ch); }
122: }
123: #endif
124:
125:
126:
127: /*************************************************
128: * Match a back-reference *
129: *************************************************/
130:
131: /* Normally, if a back reference hasn't been set, the length that is passed is
132: negative, so the match always fails. However, in JavaScript compatibility mode,
133: the length passed is zero. Note that in caseless UTF-8 mode, the number of
134: subject bytes matched may be different to the number of reference bytes.
135:
136: Arguments:
137: offset index into the offset vector
138: eptr pointer into the subject
139: length length of reference to be matched (number of bytes)
140: md points to match data block
141: caseless TRUE if caseless
142:
143: Returns: < 0 if not matched, otherwise the number of subject bytes matched
144: */
145:
146: static int
147: match_ref(int offset, register USPTR eptr, int length, match_data *md,
148: BOOL caseless)
149: {
150: USPTR eptr_start = eptr;  /* kept so the number of subject bytes consumed can be returned */
151: register USPTR p = md->start_subject + md->offset_vector[offset];  /* start of the referenced text */
152:
153: #ifdef PCRE_DEBUG
154: if (eptr >= md->end_subject)
155: printf("matching subject <null>");
156: else
157: {
158: printf("matching subject ");
159: pchars(eptr, length, TRUE, md);
160: }
161: printf(" against backref ");
162: pchars(p, length, FALSE, md);
163: printf("\n");
164: #endif
165:
166: /* Always fail if reference not set (and not JavaScript compatible). */
167:
168: if (length < 0) return -1;
169:
170: /* Separate the caseless case for speed. In UTF-8 mode we can only do this
171: properly if Unicode properties are supported. Otherwise, we can check only
172: ASCII characters. */
173:
174: if (caseless)
175: {
176: #ifdef SUPPORT_UTF8
177: #ifdef SUPPORT_UCP
178: if (md->utf8)
179: {
180: /* Match characters up to the end of the reference. NOTE: the number of
181: bytes matched may differ, because there are some characters whose upper and
182: lower case versions code as different numbers of bytes. For example, U+023A
183: (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
184: a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
185: the latter. It is important, therefore, to check the length along the
186: reference, not along the subject (earlier code did this wrong). */
187:
188: USPTR endptr = p + length;
189: while (p < endptr)
190: {
191: int c, d;
192: if (eptr >= md->end_subject) return -1;  /* subject exhausted before the reference */
193: GETCHARINC(c, eptr);
194: GETCHARINC(d, p);
195: if (c != d && c != UCD_OTHERCASE(d)) return -1;
196: }
197: }
198: else
199: #endif
200: #endif
201:
202: /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
203: is no UCP support. */
204: {
205: if (length > md->end_subject - eptr) return -1;  /* compare lengths; never form eptr+length beyond the subject */
206: while (length-- > 0)
207: { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
208: }
209: }
210:
211: /* In the caseful case, we can just compare the bytes, whether or not we
212: are in UTF-8 mode. */
213:
214: else
215: {
216: if (length > md->end_subject - eptr) return -1;  /* same overflow-safe bounds check as the caseless path */
217: while (length-- > 0) if (*p++ != *eptr++) return -1;
218: }
219:
220: return (int)(eptr - eptr_start);
221: }
222:
223:
224:
225: /***************************************************************************
226: ****************************************************************************
227: RECURSION IN THE match() FUNCTION
228:
229: The match() function is highly recursive, though not every recursive call
230: increases the recursive depth. Nevertheless, some regular expressions can cause
231: it to recurse to a great depth. I was writing for Unix, so I just let it call
232: itself recursively. This uses the stack for saving everything that has to be
233: saved for a recursive call. On Unix, the stack can be large, and this works
234: fine.
235:
236: It turns out that on some non-Unix-like systems there are problems with
237: programs that use a lot of stack. (This despite the fact that every last chip
238: has oodles of memory these days, and techniques for extending the stack have
239: been known for decades.) So....
240:
241: There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
242: calls by keeping local variables that need to be preserved in blocks of memory
243: obtained from malloc() instead of on the stack. Macros are used to
244: achieve this so that the actual code doesn't look very different to what it
245: always used to.
246:
247: The original heap-recursive code used longjmp(). However, it seems that this
248: can be very slow on some operating systems. Following a suggestion from Stan
249: Switzer, the use of longjmp() has been abolished, at the cost of having to
250: provide a unique number for each call to RMATCH. There is no way of generating
251: a sequence of numbers at compile time in C. I have given them names, to make
252: them stand out more clearly.
253:
254: Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
255: FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
256: tests. Furthermore, not using longjmp() means that local dynamic variables
257: don't have indeterminate values; this has meant that the frame size can be
258: reduced because the result can be "passed back" by straight setting of the
259: variable instead of being passed in the frame.
260: ****************************************************************************
261: ***************************************************************************/
262:
263: /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
264: below must be updated in sync. */
265:
266: enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
267: RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
268: RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
269: RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
270: RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
271: RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
272: RM61, RM62, RM63, RM64, RM65, RM66 };
273:
274: /* These versions of the macros use the stack, as normal. There are debugging
275: versions and production versions. Note that the "rw" argument of RMATCH isn't
276: actually used in this definition. */
277:
278: #ifndef NO_RECURSE
279: #define REGISTER register
280:
281: #ifdef PCRE_DEBUG
282: #define RMATCH(ra,rb,rc,rd,re,rw) \
283: { \
284: printf("match() called in line %d\n", __LINE__); \
285: rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
286: printf("to line %d\n", __LINE__); \
287: }
288: #define RRETURN(ra) \
289: { \
290: printf("match() returned %d from line %d ", ra, __LINE__); \
291: return ra; \
292: }
293: #else
294: #define RMATCH(ra,rb,rc,rd,re,rw) \
295: rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
296: #define RRETURN(ra) return ra
297: #endif
298:
299: #else
300:
301:
302: /* These versions of the macros manage a private stack on the heap. Note that
303: the "rd" argument of RMATCH isn't actually used in this definition. It's the md
304: argument of match(), which never changes. */
305:
306: #define REGISTER
307:
308: #define RMATCH(ra,rb,rc,rd,re,rw)\
309: {\
310: heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
311: if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
312: frame->Xwhere = rw; \
313: newframe->Xeptr = ra;\
314: newframe->Xecode = rb;\
315: newframe->Xmstart = mstart;\
316: newframe->Xoffset_top = rc;\
317: newframe->Xeptrb = re;\
318: newframe->Xrdepth = frame->Xrdepth + 1;\
319: newframe->Xprevframe = frame;\
320: frame = newframe;\
321: DPRINTF(("restarting from line %d\n", __LINE__));\
322: goto HEAP_RECURSE;\
323: L_##rw:\
324: DPRINTF(("jumped back to line %d\n", __LINE__));\
325: }
326:
327: #define RRETURN(ra)\
328: {\
329: heapframe *oldframe = frame;\
330: if (oldframe == NULL) return ra;  /* no frame at all (top-level allocation failed): nothing to pop or free */\
331: frame = oldframe->Xprevframe; (pcre_stack_free)(oldframe);  /* pop and release the current frame */\
332: if (frame != NULL)\
333: {\
334: rrc = ra;  /* hand the result to the resumed frame */\
335: goto HEAP_RETURN;\
336: }\
337: return ra;  /* popped the top-level frame: really return from match() */\
338: }
339:
340:
341: /* Structure for remembering the local variables in a private frame */
342:
343: typedef struct heapframe {
344: struct heapframe *Xprevframe;  /* frame to resume on return; NULL marks the top level */
345:
346: /* Function arguments that may change */
347:
348: USPTR Xeptr;  /* the arguments of match() are copied into these fields when a frame is set up */
349: const uschar *Xecode;
350: USPTR Xmstart;
351: int Xoffset_top;
352: eptrblock *Xeptrb;
353: unsigned int Xrdepth;  /* RMATCH sets this to the parent frame's value plus one */
354:
355: /* Function local variables */
356:
357: USPTR Xcallpat;
358: #ifdef SUPPORT_UTF8
359: USPTR Xcharptr;
360: #endif
361: USPTR Xdata;
362: USPTR Xnext;
363: USPTR Xpp;
364: USPTR Xprev;
365: USPTR Xsaved_eptr;
366:
367: recursion_info Xnew_recursive;
368:
369: BOOL Xcur_is_word;  /* doubles as allow_zero inside match() */
370: BOOL Xcondition;  /* doubles as cbegroup and condassert inside match() */
371: BOOL Xprev_is_word;  /* doubles as matched_once inside match() */
372:
373: #ifdef SUPPORT_UCP
374: int Xprop_type;
375: int Xprop_value;
376: int Xprop_fail_result;
377: int Xoclength;
378: uschar Xocchars[8];
379: #endif
380:
381: int Xcodelink;  /* doubles as code_offset inside match() */
382: int Xctype;
383: unsigned int Xfc;  /* in the stack-recursive build fc is just an alias for c */
384: int Xfi;  /* in the stack-recursive build fi is just an alias for i */
385: int Xlength;
386: int Xmax;
387: int Xmin;
388: int Xnumber;
389: int Xoffset;
390: int Xop;
391: int Xsave_capture_last;
392: int Xsave_offset1, Xsave_offset2, Xsave_offset3;
393: int Xstacksave[REC_STACK_SAVE_MAX];  /* sized by REC_STACK_SAVE_MAX, the most offset ints saved without malloc */
394:
395: eptrblock Xnewptrb;  /* chained onto eptrb when match() is entered with MATCH_CBEGROUP set */
396:
397: /* Where to jump back to */
398:
399: int Xwhere;  /* RMATCH stores its "rw" call number (an RMn value) here before "recursing" */
400:
401: } heapframe;
402:
403: #endif
404:
405:
406: /***************************************************************************
407: ***************************************************************************/
408:
409:
410:
411: /*************************************************
412: * Match from current position *
413: *************************************************/
414:
415: /* This function is called recursively in many circumstances. Whenever it
416: returns a negative (error) response, the outer incarnation must also return the
417: same response. */
418:
419: /* These macros pack up tests that are used for partial matching, and which
420: appear several times in the code. We set the "hit end" flag if the pointer is
421: at the end of the subject and also past the start of the subject (i.e.
422: something has been matched). For hard partial matching, we then return
423: immediately. The second one is used when we already know we are past the end of
424: the subject. */
425:
426: #define CHECK_PARTIAL()\
427: if (md->partial != 0 && eptr >= md->end_subject && \
428: eptr > md->start_used_ptr) \
429: { \
430: md->hitend = TRUE; \
431: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
432: }
433:
434: #define SCHECK_PARTIAL()\
435: if (md->partial != 0 && eptr > md->start_used_ptr) \
436: { \
437: md->hitend = TRUE; \
438: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
439: }
440:
441:
442: /* Performance note: It might be tempting to extract commonly used fields from
443: the md structure (e.g. utf8, end_subject) into individual variables to improve
444: performance. Tests using gcc on a SPARC disproved this; in the first case, it
445: made performance worse.
446:
447: Arguments:
448: eptr pointer to current character in subject
449: ecode pointer to current position in compiled code
450: mstart pointer to the current match start position (can be modified
451: by encountering \K)
452: offset_top current top pointer
453: md pointer to "static" info for the match
454: eptrb pointer to chain of blocks containing eptr at start of
455: brackets - for testing for empty matches
456: rdepth the recursion depth
457:
458: Returns: MATCH_MATCH if matched ) these values are >= 0
459: MATCH_NOMATCH if failed to match )
460: a negative MATCH_xxx value for PRUNE, SKIP, etc
461: a negative PCRE_ERROR_xxx value if aborted by an error condition
462: (e.g. stopped by repeated call or recursion limit)
463: */
464:
465: static int
466: match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
467: int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth)
468: {
469: /* These variables do not need to be preserved over recursion in this function,
470: so they can be ordinary variables in all cases. Mark some of them with
471: "register" because they are used a lot in loops. */
472:
473: register int rrc; /* Returns from recursive calls */
474: register int i; /* Used for loops not involving calls to RMATCH() */
475: register unsigned int c; /* Character values not kept over RMATCH() calls */
476: register BOOL utf8; /* Local copy of UTF-8 flag for speed */
477:
478: BOOL minimize, possessive; /* Quantifier options */
479: BOOL caseless;
480: int condcode;
481:
482: /* When recursion is not being used, all "local" variables that have to be
483: preserved over calls to RMATCH() are part of a "frame" which is obtained from
484: heap storage. Set up the top-level frame here; others are obtained from the
485: heap whenever RMATCH() does a "recursion". See the macro definitions above. */
486:
487: #ifdef NO_RECURSE
488: heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
489: if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
490: frame->Xprevframe = NULL; /* Marks the top level */
491:
492: /* Copy in the original argument variables */
493:
494: frame->Xeptr = eptr;
495: frame->Xecode = ecode;
496: frame->Xmstart = mstart;
497: frame->Xoffset_top = offset_top;
498: frame->Xeptrb = eptrb;
499: frame->Xrdepth = rdepth;
500:
501: /* This is where control jumps back to in order to effect "recursion" */
502:
503: HEAP_RECURSE:
504:
505: /* Macros make the argument variables come from the current frame */
506:
507: #define eptr frame->Xeptr
508: #define ecode frame->Xecode
509: #define mstart frame->Xmstart
510: #define offset_top frame->Xoffset_top
511: #define eptrb frame->Xeptrb
512: #define rdepth frame->Xrdepth
513:
514: /* Ditto for the local variables */
515:
516: #ifdef SUPPORT_UTF8
517: #define charptr frame->Xcharptr
518: #endif
519: #define callpat frame->Xcallpat
520: #define codelink frame->Xcodelink
521: #define data frame->Xdata
522: #define next frame->Xnext
523: #define pp frame->Xpp
524: #define prev frame->Xprev
525: #define saved_eptr frame->Xsaved_eptr
526:
527: #define new_recursive frame->Xnew_recursive
528:
529: #define cur_is_word frame->Xcur_is_word
530: #define condition frame->Xcondition
531: #define prev_is_word frame->Xprev_is_word
532:
533: #ifdef SUPPORT_UCP
534: #define prop_type frame->Xprop_type
535: #define prop_value frame->Xprop_value
536: #define prop_fail_result frame->Xprop_fail_result
537: #define oclength frame->Xoclength
538: #define occhars frame->Xocchars
539: #endif
540:
541: #define ctype frame->Xctype
542: #define fc frame->Xfc
543: #define fi frame->Xfi
544: #define length frame->Xlength
545: #define max frame->Xmax
546: #define min frame->Xmin
547: #define number frame->Xnumber
548: #define offset frame->Xoffset
549: #define op frame->Xop
550: #define save_capture_last frame->Xsave_capture_last
551: #define save_offset1 frame->Xsave_offset1
552: #define save_offset2 frame->Xsave_offset2
553: #define save_offset3 frame->Xsave_offset3
554: #define stacksave frame->Xstacksave
555:
556: #define newptrb frame->Xnewptrb
557:
558: /* When recursion is being used, local variables are allocated on the stack and
559: get preserved during recursion in the normal way. In this environment, fi and
560: i, and fc and c, can be the same variables. */
561:
562: #else /* NO_RECURSE not defined */
563: #define fi i
564: #define fc c
565:
566: /* Many of the following variables are used only in small blocks of the code.
567: My normal style of coding would have declared them within each of those blocks.
568: However, in order to accommodate the version of this code that uses an external
569: "stack" implemented on the heap, it is easier to declare them all here, so the
570: declarations can be cut out in a block. The only declarations within blocks
571: below are for variables that do not have to be preserved over a recursive call
572: to RMATCH(). */
573:
574: #ifdef SUPPORT_UTF8
575: const uschar *charptr;
576: #endif
577: const uschar *callpat;
578: const uschar *data;
579: const uschar *next;
580: USPTR pp;
581: const uschar *prev;
582: USPTR saved_eptr;
583:
584: recursion_info new_recursive;
585:
586: BOOL cur_is_word;
587: BOOL condition;
588: BOOL prev_is_word;
589:
590: #ifdef SUPPORT_UCP
591: int prop_type;
592: int prop_value;
593: int prop_fail_result;
594: int oclength;
595: uschar occhars[8];
596: #endif
597:
598: int codelink;
599: int ctype;
600: int length;
601: int max;
602: int min;
603: int number;
604: int offset;
605: int op;
606: int save_capture_last;
607: int save_offset1, save_offset2, save_offset3;
608: int stacksave[REC_STACK_SAVE_MAX];
609:
610: eptrblock newptrb;
611: #endif /* NO_RECURSE */
612:
613: /* To save space on the stack and in the heap frame, I have doubled up on some
614: of the local variables that are used only in localised parts of the code, but
615: still need to be preserved over recursive calls of match(). These macros define
616: the alternative names that are used. */
617:
618: #define allow_zero cur_is_word
619: #define cbegroup condition
620: #define code_offset codelink
621: #define condassert condition
622: #define matched_once prev_is_word
623:
624: /* These statements are here to stop the compiler complaining about uninitialized
625: variables. */
626:
627: #ifdef SUPPORT_UCP
628: prop_value = 0;
629: prop_fail_result = 0;
630: #endif
631:
632:
633: /* This label is used for tail recursion, which is used in a few cases even
634: when NO_RECURSE is not defined, in order to reduce the amount of stack that is
635: used. Thanks to Ian Taylor for noticing this possibility and sending the
636: original patch. */
637:
638: TAIL_RECURSE:
639:
640: /* OK, now we can get on with the real code of the function. Recursive calls
641: are specified by the macro RMATCH and RRETURN is used to return. When
642: NO_RECURSE is *not* defined, these just turn into a recursive call to match()
643: and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
644: defined). However, RMATCH isn't like a function call because it's quite a
645: complicated macro. It has to be used in one particular way. This shouldn't,
646: however, impact performance when true recursion is being used. */
647:
648: #ifdef SUPPORT_UTF8
649: utf8 = md->utf8; /* Local copy of the flag */
650: #else
651: utf8 = FALSE;
652: #endif
653:
654: /* First check that we haven't called match() too many times, or that we
655: haven't exceeded the recursive call limit. */
656:
657: if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
658: if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
659:
660: /* At the start of a group with an unlimited repeat that may match an empty
661: string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
662: done this way to save having to use another function argument, which would take
663: up space on the stack. See also MATCH_CONDASSERT below.
664:
665: When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
666: such remembered pointers, to be checked when we hit the closing ket, in order
667: to break infinite loops that match no characters. When match() is called in
668: other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
669: NOT be used with tail recursion, because the memory block that is used is on
670: the stack, so a new one may be required for each match(). */
671:
672: if (md->match_function_type == MATCH_CBEGROUP)
673: {
674: newptrb.epb_saved_eptr = eptr;
675: newptrb.epb_prev = eptrb;
676: eptrb = &newptrb;
677: md->match_function_type = 0;
678: }
679:
680: /* Now start processing the opcodes. */
681:
682: for (;;)
683: {
684: minimize = possessive = FALSE;
685: op = *ecode;
686:
687: switch(op)
688: {
689: case OP_MARK:
690: md->nomatch_mark = ecode + 2;
691: md->mark = NULL; /* In case previously set by assertion */
692: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
693: eptrb, RM55);
694: if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
695: md->mark == NULL) md->mark = ecode + 2;
696:
697: /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
698: argument, and we must check whether that argument matches this MARK's
699: argument. It is passed back in md->start_match_ptr (an overloading of that
700: variable). If it does match, we reset that variable to the current subject
701: position and return MATCH_SKIP. Otherwise, pass back the return code
702: unaltered. */
703:
704: else if (rrc == MATCH_SKIP_ARG &&
705: strcmp((char *)(ecode + 2), (char *)(md->start_match_ptr)) == 0)
706: {
707: md->start_match_ptr = eptr;
708: RRETURN(MATCH_SKIP);
709: }
710: RRETURN(rrc);
711:
712: case OP_FAIL:
713: RRETURN(MATCH_NOMATCH);
714:
715: /* COMMIT overrides PRUNE, SKIP, and THEN */
716:
717: case OP_COMMIT:
718: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
719: eptrb, RM52);
720: if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
721: rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
722: rrc != MATCH_THEN)
723: RRETURN(rrc);
724: RRETURN(MATCH_COMMIT);
725:
726: /* PRUNE overrides THEN */
727:
728: case OP_PRUNE:
729: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
730: eptrb, RM51);
731: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
732: RRETURN(MATCH_PRUNE);
733:
734: case OP_PRUNE_ARG:
735: md->nomatch_mark = ecode + 2;
736: md->mark = NULL; /* In case previously set by assertion */
737: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
738: eptrb, RM56);
739: if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
740: md->mark == NULL) md->mark = ecode + 2;
741: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
742: RRETURN(MATCH_PRUNE);
743:
744: /* SKIP overrides PRUNE and THEN */
745:
746: case OP_SKIP:
747: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
748: eptrb, RM53);
749: if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
750: RRETURN(rrc);
751: md->start_match_ptr = eptr; /* Pass back current position */
752: RRETURN(MATCH_SKIP);
753:
754: /* Note that, for Perl compatibility, SKIP with an argument does NOT set
755: nomatch_mark. There is a flag that disables this opcode when re-matching a
756: pattern that ended with a SKIP for which there was not a matching MARK. */
757:
758: case OP_SKIP_ARG:
759: if (md->ignore_skip_arg)
760: {
761: ecode += _pcre_OP_lengths[*ecode] + ecode[1];
762: break;
763: }
764: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
765: eptrb, RM57);
766: if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
767: RRETURN(rrc);
768:
769: /* Pass back the current skip name by overloading md->start_match_ptr and
770: returning the special MATCH_SKIP_ARG return code. This will either be
771: caught by a matching MARK, or get to the top, where it causes a rematch
772: with the md->ignore_skip_arg flag set. */
773:
774: md->start_match_ptr = ecode + 2;
775: RRETURN(MATCH_SKIP_ARG);
776:
777: /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
778: the branch in which it occurs can be determined. Overload the start of
779: match pointer to do this. */
780:
781: case OP_THEN:
782: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
783: eptrb, RM54);
784: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
785: md->start_match_ptr = ecode;
786: RRETURN(MATCH_THEN);
787:
788: case OP_THEN_ARG:
789: md->nomatch_mark = ecode + 2;
790: md->mark = NULL; /* In case previously set by assertion */
791: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top,
792: md, eptrb, RM58);
793: if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
794: md->mark == NULL) md->mark = ecode + 2;
795: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
796: md->start_match_ptr = ecode;
797: RRETURN(MATCH_THEN);
798:
799: /* Handle an atomic group that does not contain any capturing parentheses.
800: This can be handled like an assertion. Prior to 8.13, all atomic groups
801: were handled this way. In 8.13, the code was changed as below for ONCE, so
802: that backups pass through the group and thereby reset captured values.
803: However, this uses a lot more stack, so in 8.20, atomic groups that do not
804: contain any captures generate OP_ONCE_NC, which can be handled in the old,
805: less stack intensive way.
806:
807: Check the alternative branches in turn - the matching won't pass the KET
808: for this kind of subpattern. If any one branch matches, we carry on as at
809: the end of a normal bracket, leaving the subject pointer, but resetting
810: the start-of-match value in case it was changed by \K. */
811:
812: case OP_ONCE_NC:
813: prev = ecode;
814: saved_eptr = eptr;
815: do
816: {
817: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
818: if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
819: {
820: mstart = md->start_match_ptr;
821: break;
822: }
823: if (rrc == MATCH_THEN)
824: {
825: next = ecode + GET(ecode,1);
826: if (md->start_match_ptr < next &&
827: (*ecode == OP_ALT || *next == OP_ALT))
828: rrc = MATCH_NOMATCH;
829: }
830:
831: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
832: ecode += GET(ecode,1);
833: }
834: while (*ecode == OP_ALT);
835:
836: /* If hit the end of the group (which could be repeated), fail */
837:
838: if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
839:
840: /* Continue as from after the group, updating the offsets high water
841: mark, since extracts may have been taken. */
842:
843: do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
844:
845: offset_top = md->end_offset_top;
846: eptr = md->end_match_ptr;
847:
848: /* For a non-repeating ket, just continue at this level. This also
849: happens for a repeating ket if no characters were matched in the group.
850: This is the forcible breaking of infinite loops as implemented in Perl
851: 5.005. */
852:
853: if (*ecode == OP_KET || eptr == saved_eptr)
854: {
855: ecode += 1+LINK_SIZE;
856: break;
857: }
858:
859: /* The repeating kets try the rest of the pattern or restart from the
860: preceding bracket, in the appropriate order. The second "call" of match()
861: uses tail recursion, to avoid using another stack frame. */
862:
863: if (*ecode == OP_KETRMIN)
864: {
865: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
866: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
867: ecode = prev;
868: goto TAIL_RECURSE;
869: }
870: else /* OP_KETRMAX */
871: {
872: md->match_function_type = MATCH_CBEGROUP;
873: RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
874: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
875: ecode += 1 + LINK_SIZE;
876: goto TAIL_RECURSE;
877: }
878: /* Control never gets here */
879:
880: /* Handle a capturing bracket, other than those that are possessive with an
881: unlimited repeat. If there is space in the offset vector, save the current
882: subject position in the working slot at the top of the vector. We mustn't
883: change the current values of the data slot, because they may be set from a
884: previous iteration of this group, and be referred to by a reference inside
885: the group. A failure to match might occur after the group has succeeded,
886: if something later on doesn't match. For this reason, we need to restore
887: the working value and also the values of the final offsets, in case they
888: were set by a previous iteration of the same bracket.
889:
890: If there isn't enough space in the offset vector, treat this as if it were
891: a non-capturing bracket. Don't worry about setting the flag for the error
892: case here; that is handled in the code for KET. */
893:
894: case OP_CBRA:
895: case OP_SCBRA:
896: number = GET2(ecode, 1+LINK_SIZE);
897: offset = number << 1;
898:
899: #ifdef PCRE_DEBUG
900: printf("start bracket %d\n", number);
901: printf("subject=");
902: pchars(eptr, 16, TRUE, md);
903: printf("\n");
904: #endif
905:
906: if (offset < md->offset_max)
907: {
908: save_offset1 = md->offset_vector[offset];
909: save_offset2 = md->offset_vector[offset+1];
910: save_offset3 = md->offset_vector[md->offset_end - number];
911: save_capture_last = md->capture_last;
912:
913: DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
914: md->offset_vector[md->offset_end - number] =
915: (int)(eptr - md->start_subject);
916:
917: for (;;)
918: {
919: if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
920: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
921: eptrb, RM1);
922: if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
923:
924: /* If we backed up to a THEN, check whether it is within the current
925: branch by comparing the address of the THEN that is passed back with
926: the end of the branch. If it is within the current branch, and the
927: branch is one of two or more alternatives (it either starts or ends
928: with OP_ALT), we have reached the limit of THEN's action, so convert
929: the return code to NOMATCH, which will cause normal backtracking to
930: happen from now on. Otherwise, THEN is passed back to an outer
931: alternative. This implements Perl's treatment of parenthesized groups,
932: where a group not containing | does not affect the current alternative,
933: that is, (X) is NOT the same as (X|(*F)). */
934:
935: if (rrc == MATCH_THEN)
936: {
937: next = ecode + GET(ecode,1);
938: if (md->start_match_ptr < next &&
939: (*ecode == OP_ALT || *next == OP_ALT))
940: rrc = MATCH_NOMATCH;
941: }
942:
943: /* Anything other than NOMATCH is passed back. */
944:
945: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
946: md->capture_last = save_capture_last;
947: ecode += GET(ecode, 1);
948: if (*ecode != OP_ALT) break;
949: }
950:
951: DPRINTF(("bracket %d failed\n", number));
952: md->offset_vector[offset] = save_offset1;
953: md->offset_vector[offset+1] = save_offset2;
954: md->offset_vector[md->offset_end - number] = save_offset3;
955:
956: /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
957:
958: RRETURN(rrc);
959: }
960:
961: /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
962: as a non-capturing bracket. */
963:
964: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
965: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
966:
967: DPRINTF(("insufficient capture room: treat as non-capturing\n"));
968:
969: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
970: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
971:
972: /* Non-capturing or atomic group, except for possessive with unlimited
973: repeat and ONCE group with no captures. Loop for all the alternatives.
974:
975: When we get to the final alternative within the brackets, we used to return
976: the result of a recursive call to match() whatever happened so it was
977: possible to reduce stack usage by turning this into a tail recursion,
978: except in the case of a possibly empty group. However, now that there is
979: the possibility of (*THEN) occurring in the final alternative, this
980: optimization is no longer always possible.
981:
982: We can optimize if we know there are no (*THEN)s in the pattern; at present
983: this is the best that can be done.
984:
985: MATCH_ONCE is returned when the end of an atomic group is successfully
986: reached, but subsequent matching fails. It passes back up the tree (causing
987: captured values to be reset) until the original atomic group level is
988: reached. This is tested by comparing md->once_target with the start of the
989: group. At this point, the return is converted into MATCH_NOMATCH so that
990: previous backup points can be taken. */
991:
992: case OP_ONCE:
993: case OP_BRA:
994: case OP_SBRA:
995: DPRINTF(("start non-capturing bracket\n"));
996:
997: for (;;)
998: {
999: if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1000:
1001: /* If this is not a possibly empty group, and there are no (*THEN)s in
1002: the pattern, and this is the final alternative, optimize as described
1003: above. */
1004:
1005: else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1006: {
1007: ecode += _pcre_OP_lengths[*ecode];
1008: goto TAIL_RECURSE;
1009: }
1010:
1011: /* In all other cases, we have to make another call to match(). */
1012:
1013: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
1014: RM2);
1015:
1016: /* See comment in the code for capturing groups above about handling
1017: THEN. */
1018:
1019: if (rrc == MATCH_THEN)
1020: {
1021: next = ecode + GET(ecode,1);
1022: if (md->start_match_ptr < next &&
1023: (*ecode == OP_ALT || *next == OP_ALT))
1024: rrc = MATCH_NOMATCH;
1025: }
1026:
1027: if (rrc != MATCH_NOMATCH)
1028: {
1029: if (rrc == MATCH_ONCE)
1030: {
1031: const uschar *scode = ecode;
1032: if (*scode != OP_ONCE) /* If not at start, find it */
1033: {
1034: while (*scode == OP_ALT) scode += GET(scode, 1);
1035: scode -= GET(scode, 1);
1036: }
1037: if (md->once_target == scode) rrc = MATCH_NOMATCH;
1038: }
1039: RRETURN(rrc);
1040: }
1041: ecode += GET(ecode, 1);
1042: if (*ecode != OP_ALT) break;
1043: }
1044:
1045: RRETURN(MATCH_NOMATCH);
1046:
1047: /* Handle possessive capturing brackets with an unlimited repeat. We come
1048: here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1049: handled similarly to the normal case above. However, the matching is
1050: different. The end of these brackets will always be OP_KETRPOS, which
1051: returns MATCH_KETRPOS without going further in the pattern. By this means
1052: we can handle the group by iteration rather than recursion, thereby
1053: reducing the amount of stack needed. */
1054:
1055: case OP_CBRAPOS:
1056: case OP_SCBRAPOS:
1057: allow_zero = FALSE;
1058:
1059: POSSESSIVE_CAPTURE:
1060: number = GET2(ecode, 1+LINK_SIZE);
1061: offset = number << 1;
1062:
1063: #ifdef PCRE_DEBUG
1064: printf("start possessive bracket %d\n", number);
1065: printf("subject=");
1066: pchars(eptr, 16, TRUE, md);
1067: printf("\n");
1068: #endif
1069:
1070: if (offset < md->offset_max)
1071: {
1072: matched_once = FALSE;
1073: code_offset = (int)(ecode - md->start_code);
1074:
1075: save_offset1 = md->offset_vector[offset];
1076: save_offset2 = md->offset_vector[offset+1];
1077: save_offset3 = md->offset_vector[md->offset_end - number];
1078: save_capture_last = md->capture_last;
1079:
1080: DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1081:
1082: /* Each time round the loop, save the current subject position for use
1083: when the group matches. For MATCH_MATCH, the group has matched, so we
1084: restart it with a new subject starting position, remembering that we had
1085: at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1086: usual. If we haven't matched any alternatives in any iteration, check to
1087: see if a previous iteration matched. If so, the group has matched;
1088: continue from afterwards. Otherwise it has failed; restore the previous
1089: capture values before returning NOMATCH. */
1090:
1091: for (;;)
1092: {
1093: md->offset_vector[md->offset_end - number] =
1094: (int)(eptr - md->start_subject);
1095: if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1096: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1097: eptrb, RM63);
1098: if (rrc == MATCH_KETRPOS)
1099: {
1100: offset_top = md->end_offset_top;
1101: eptr = md->end_match_ptr;
1102: ecode = md->start_code + code_offset;
1103: save_capture_last = md->capture_last;
1104: matched_once = TRUE;
1105: continue;
1106: }
1107:
1108: /* See comment in the code for capturing groups above about handling
1109: THEN. */
1110:
1111: if (rrc == MATCH_THEN)
1112: {
1113: next = ecode + GET(ecode,1);
1114: if (md->start_match_ptr < next &&
1115: (*ecode == OP_ALT || *next == OP_ALT))
1116: rrc = MATCH_NOMATCH;
1117: }
1118:
1119: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1120: md->capture_last = save_capture_last;
1121: ecode += GET(ecode, 1);
1122: if (*ecode != OP_ALT) break;
1123: }
1124:
1125: if (!matched_once)
1126: {
1127: md->offset_vector[offset] = save_offset1;
1128: md->offset_vector[offset+1] = save_offset2;
1129: md->offset_vector[md->offset_end - number] = save_offset3;
1130: }
1131:
1132: if (allow_zero || matched_once)
1133: {
1134: ecode += 1 + LINK_SIZE;
1135: break;
1136: }
1137:
1138: RRETURN(MATCH_NOMATCH);
1139: }
1140:
1141: /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1142: as a non-capturing bracket. */
1143:
1144: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1145: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1146:
1147: DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1148:
1149: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1150: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1151:
1152: /* Non-capturing possessive bracket with unlimited repeat. We come here
1153: from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1154: without the capturing complication. It is written out separately for speed
1155: and cleanliness. */
1156:
1157: case OP_BRAPOS:
1158: case OP_SBRAPOS:
1159: allow_zero = FALSE;
1160:
1161: POSSESSIVE_NON_CAPTURE:
1162: matched_once = FALSE;
1163: code_offset = (int)(ecode - md->start_code);
1164:
1165: for (;;)
1166: {
1167: if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1168: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1169: eptrb, RM48);
1170: if (rrc == MATCH_KETRPOS)
1171: {
1172: offset_top = md->end_offset_top;
1173: eptr = md->end_match_ptr;
1174: ecode = md->start_code + code_offset;
1175: matched_once = TRUE;
1176: continue;
1177: }
1178:
1179: /* See comment in the code for capturing groups above about handling
1180: THEN. */
1181:
1182: if (rrc == MATCH_THEN)
1183: {
1184: next = ecode + GET(ecode,1);
1185: if (md->start_match_ptr < next &&
1186: (*ecode == OP_ALT || *next == OP_ALT))
1187: rrc = MATCH_NOMATCH;
1188: }
1189:
1190: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1191: ecode += GET(ecode, 1);
1192: if (*ecode != OP_ALT) break;
1193: }
1194:
1195: if (matched_once || allow_zero)
1196: {
1197: ecode += 1 + LINK_SIZE;
1198: break;
1199: }
1200: RRETURN(MATCH_NOMATCH);
1201:
1202: /* Control never reaches here. */
1203:
1204: /* Conditional group: compilation checked that there are no more than
1205: two branches. If the condition is false, skipping the first branch takes us
1206: past the end if there is only one branch, but that's OK because that is
1207: exactly what going to the ket would do. */
1208:
1209: case OP_COND:
1210: case OP_SCOND:
1211: codelink = GET(ecode, 1);
1212:
1213: /* Because of the way auto-callout works during compile, a callout item is
1214: inserted between OP_COND and an assertion condition. */
1215:
1216: if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1217: {
1218: if (pcre_callout != NULL)
1219: {
1220: pcre_callout_block cb;
1221: cb.version = 2; /* Version 2 of the callout block */
1222: cb.callout_number = ecode[LINK_SIZE+2];
1223: cb.offset_vector = md->offset_vector;
1224: cb.subject = (PCRE_SPTR)md->start_subject;
1225: cb.subject_length = (int)(md->end_subject - md->start_subject);
1226: cb.start_match = (int)(mstart - md->start_subject);
1227: cb.current_position = (int)(eptr - md->start_subject);
1228: cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1229: cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1230: cb.capture_top = offset_top/2;
1231: cb.capture_last = md->capture_last;
1232: cb.callout_data = md->callout_data;
1233: cb.mark = md->nomatch_mark;
1234: if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1235: if (rrc < 0) RRETURN(rrc);
1236: }
1237: ecode += _pcre_OP_lengths[OP_CALLOUT];
1238: }
1239:
1240: condcode = ecode[LINK_SIZE+1];
1241:
1242: /* Now see what the actual condition is */
1243:
1244: if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1245: {
1246: if (md->recursive == NULL) /* Not recursing => FALSE */
1247: {
1248: condition = FALSE;
1249: ecode += GET(ecode, 1);
1250: }
1251: else
1252: {
1253: int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1254: condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1255:
1256: /* If the test is for recursion into a specific subpattern, and it is
1257: false, but the test was set up by name, scan the table to see if the
1258: name refers to any other numbers, and test them. The condition is true
1259: if any one is set. */
1260:
1261: if (!condition && condcode == OP_NRREF)
1262: {
1263: uschar *slotA = md->name_table;
1264: for (i = 0; i < md->name_count; i++)
1265: {
1266: if (GET2(slotA, 0) == recno) break;
1267: slotA += md->name_entry_size;
1268: }
1269:
1270: /* Found a name for the number - there can be only one; duplicate
1271: names for different numbers are allowed, but not vice versa. First
1272: scan down for duplicates. */
1273:
1274: if (i < md->name_count)
1275: {
1276: uschar *slotB = slotA;
1277: while (slotB > md->name_table)
1278: {
1279: slotB -= md->name_entry_size;
1280: if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1281: {
1282: condition = GET2(slotB, 0) == md->recursive->group_num;
1283: if (condition) break;
1284: }
1285: else break;
1286: }
1287:
1288: /* Scan up for duplicates */
1289:
1290: if (!condition)
1291: {
1292: slotB = slotA;
1293: for (i++; i < md->name_count; i++)
1294: {
1295: slotB += md->name_entry_size;
1296: if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1297: {
1298: condition = GET2(slotB, 0) == md->recursive->group_num;
1299: if (condition) break;
1300: }
1301: else break;
1302: }
1303: }
1304: }
1305: }
1306:
1307: /* Choose branch according to the condition */
1308:
1309: ecode += condition? 3 : GET(ecode, 1);
1310: }
1311: }
1312:
1313: else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1314: {
1315: offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1316: condition = offset < offset_top && md->offset_vector[offset] >= 0;
1317:
1318: /* If the numbered capture is unset, but the reference was by name,
1319: scan the table to see if the name refers to any other numbers, and test
1320: them. The condition is true if any one is set. This is tediously similar
1321: to the code above, but not close enough to try to amalgamate. */
1322:
1323: if (!condition && condcode == OP_NCREF)
1324: {
1325: int refno = offset >> 1;
1326: uschar *slotA = md->name_table;
1327:
1328: for (i = 0; i < md->name_count; i++)
1329: {
1330: if (GET2(slotA, 0) == refno) break;
1331: slotA += md->name_entry_size;
1332: }
1333:
1334: /* Found a name for the number - there can be only one; duplicate names
1335: for different numbers are allowed, but not vice versa. First scan down
1336: for duplicates. */
1337:
1338: if (i < md->name_count)
1339: {
1340: uschar *slotB = slotA;
1341: while (slotB > md->name_table)
1342: {
1343: slotB -= md->name_entry_size;
1344: if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1345: {
1346: offset = GET2(slotB, 0) << 1;
1347: condition = offset < offset_top &&
1348: md->offset_vector[offset] >= 0;
1349: if (condition) break;
1350: }
1351: else break;
1352: }
1353:
1354: /* Scan up for duplicates */
1355:
1356: if (!condition)
1357: {
1358: slotB = slotA;
1359: for (i++; i < md->name_count; i++)
1360: {
1361: slotB += md->name_entry_size;
1362: if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1363: {
1364: offset = GET2(slotB, 0) << 1;
1365: condition = offset < offset_top &&
1366: md->offset_vector[offset] >= 0;
1367: if (condition) break;
1368: }
1369: else break;
1370: }
1371: }
1372: }
1373: }
1374:
1375: /* Choose branch according to the condition */
1376:
1377: ecode += condition? 3 : GET(ecode, 1);
1378: }
1379:
1380: else if (condcode == OP_DEF) /* DEFINE - always false */
1381: {
1382: condition = FALSE;
1383: ecode += GET(ecode, 1);
1384: }
1385:
1386: /* The condition is an assertion. Call match() to evaluate it - setting
1387: md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1388: an assertion. */
1389:
1390: else
1391: {
1392: md->match_function_type = MATCH_CONDASSERT;
1393: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1394: if (rrc == MATCH_MATCH)
1395: {
1396: if (md->end_offset_top > offset_top)
1397: offset_top = md->end_offset_top; /* Captures may have happened */
1398: condition = TRUE;
1399: ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1400: while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1401: }
1402:
1403: /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1404: assertion; it is therefore treated as NOMATCH. */
1405:
1406: else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1407: {
1408: RRETURN(rrc); /* Need braces because of following else */
1409: }
1410: else
1411: {
1412: condition = FALSE;
1413: ecode += codelink;
1414: }
1415: }
1416:
1417: /* We are now at the branch that is to be obeyed. As there is only one, can
1418: use tail recursion to avoid using another stack frame, except when there is
1419: unlimited repeat of a possibly empty group. In the latter case, a recursive
1420: call to match() is always required, unless the second alternative doesn't
1421: exist, in which case we can just plough on. Note that, for compatibility
1422: with Perl, the | in a conditional group is NOT treated as creating two
1423: alternatives. If a THEN is encountered in the branch, it propagates out to
1424: the enclosing alternative (unless nested in a deeper set of alternatives,
1425: of course). */
1426:
1427: if (condition || *ecode == OP_ALT)
1428: {
1429: if (op != OP_SCOND)
1430: {
1431: ecode += 1 + LINK_SIZE;
1432: goto TAIL_RECURSE;
1433: }
1434:
1435: md->match_function_type = MATCH_CBEGROUP;
1436: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1437: RRETURN(rrc);
1438: }
1439:
1440: /* Condition false & no alternative; continue after the group. */
1441:
1442: else
1443: {
1444: ecode += 1 + LINK_SIZE;
1445: }
1446: break;
1447:
1448:
1449: /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1450: to close any currently open capturing brackets. */
1451:
1452: case OP_CLOSE:
1453: number = GET2(ecode, 1);
1454: offset = number << 1;
1455:
1456: #ifdef PCRE_DEBUG
1457: printf("end bracket %d at *ACCEPT", number);
1458: printf("\n");
1459: #endif
1460:
1461: md->capture_last = number;
1462: if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1463: {
1464: md->offset_vector[offset] =
1465: md->offset_vector[md->offset_end - number];
1466: md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1467: if (offset_top <= offset) offset_top = offset + 2;
1468: }
1469: ecode += 3;
1470: break;
1471:
1472:
1473: /* End of the pattern, either real or forced. */
1474:
1475: case OP_END:
1476: case OP_ACCEPT:
1477: case OP_ASSERT_ACCEPT:
1478:
1479: /* If we have matched an empty string, fail if not in an assertion and not
1480: in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1481: is set and we have matched at the start of the subject. In both cases,
1482: backtracking will then try other alternatives, if any. */
1483:
1484: if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1485: md->recursive == NULL &&
1486: (md->notempty ||
1487: (md->notempty_atstart &&
1488: mstart == md->start_subject + md->start_offset)))
1489: RRETURN(MATCH_NOMATCH);
1490:
1491: /* Otherwise, we have a match. */
1492:
1493: md->end_match_ptr = eptr; /* Record where we ended */
1494: md->end_offset_top = offset_top; /* and how many extracts were taken */
1495: md->start_match_ptr = mstart; /* and the start (\K can modify) */
1496:
1497: /* For some reason, the macros don't work properly if an expression is
1498: given as the argument to RRETURN when the heap is in use. */
1499:
1500: rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1501: RRETURN(rrc);
1502:
1503: /* Assertion brackets. Check the alternative branches in turn - the
1504: matching won't pass the KET for an assertion. If any one branch matches,
1505: the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1506: start of each branch to move the current point backwards, so the code at
1507: this level is identical to the lookahead case. When the assertion is part
1508: of a condition, we want to return immediately afterwards. The caller of
1509: this incarnation of the match() function will have set MATCH_CONDASSERT in
1510: md->match_function_type, and one of these opcodes will be the first opcode
1511: that is processed. We use a local variable that is preserved over calls to
1512: match() to remember this case. */
1513:
1514: case OP_ASSERT:
1515: case OP_ASSERTBACK:
1516: if (md->match_function_type == MATCH_CONDASSERT)
1517: {
1518: condassert = TRUE;
1519: md->match_function_type = 0;
1520: }
1521: else condassert = FALSE;
1522:
1523: do
1524: {
1525: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1526: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1527: {
1528: mstart = md->start_match_ptr; /* In case \K reset it */
1529: break;
1530: }
1531:
1532: /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1533: as NOMATCH. */
1534:
1535: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1536: ecode += GET(ecode, 1);
1537: }
1538: while (*ecode == OP_ALT);
1539:
1540: if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1541:
1542: /* If checking an assertion for a condition, return MATCH_MATCH. */
1543:
1544: if (condassert) RRETURN(MATCH_MATCH);
1545:
1546: /* Continue from after the assertion, updating the offsets high water
1547: mark, since extracts may have been taken during the assertion. */
1548:
1549: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1550: ecode += 1 + LINK_SIZE;
1551: offset_top = md->end_offset_top;
1552: continue;
1553:
1554: /* Negative assertion: all branches must fail to match. Encountering SKIP,
1555: PRUNE, or COMMIT means we must assume failure without checking subsequent
1556: branches. */
1557:
1558: case OP_ASSERT_NOT:
1559: case OP_ASSERTBACK_NOT:
1560: if (md->match_function_type == MATCH_CONDASSERT)
1561: {
1562: condassert = TRUE;
1563: md->match_function_type = 0;
1564: }
1565: else condassert = FALSE;
1566:
1567: do
1568: {
1569: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1570: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1571: if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1572: {
1573: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1574: break;
1575: }
1576:
1577: /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1578: as NOMATCH. */
1579:
1580: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1581: ecode += GET(ecode,1);
1582: }
1583: while (*ecode == OP_ALT);
1584:
1585: if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1586:
1587: ecode += 1 + LINK_SIZE;
1588: continue;
1589:
1590: /* Move the subject pointer back. This occurs only at the start of
1591: each branch of a lookbehind assertion. If we are too close to the start to
1592: move back, this match function fails. When working with UTF-8 we move
1593: back a number of characters, not bytes. */
1594:
1595: case OP_REVERSE:
1596: #ifdef SUPPORT_UTF8
1597: if (utf8)
1598: {
1599: i = GET(ecode, 1);
1600: while (i-- > 0)
1601: {
1602: eptr--;
1603: if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1604: BACKCHAR(eptr);
1605: }
1606: }
1607: else
1608: #endif
1609:
1610: /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1611:
1612: {
1613: eptr -= GET(ecode, 1);
1614: if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1615: }
1616:
1617: /* Save the earliest consulted character, then skip to next op code */
1618:
1619: if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1620: ecode += 1 + LINK_SIZE;
1621: break;
1622:
1623: /* The callout item calls an external function, if one is provided, passing
1624: details of the match so far. This is mainly for debugging, though the
1625: function is able to force a failure. */
1626:
1627: case OP_CALLOUT:
1628: if (pcre_callout != NULL)
1629: {
1630: pcre_callout_block cb;
1631: cb.version = 2; /* Version 2 of the callout block */
1632: cb.callout_number = ecode[1];
1633: cb.offset_vector = md->offset_vector;
1634: cb.subject = (PCRE_SPTR)md->start_subject;
1635: cb.subject_length = (int)(md->end_subject - md->start_subject);
1636: cb.start_match = (int)(mstart - md->start_subject);
1637: cb.current_position = (int)(eptr - md->start_subject);
1638: cb.pattern_position = GET(ecode, 2);
1639: cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1640: cb.capture_top = offset_top/2;
1641: cb.capture_last = md->capture_last;
1642: cb.callout_data = md->callout_data;
1643: cb.mark = md->nomatch_mark;
1644: if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1645: if (rrc < 0) RRETURN(rrc);
1646: }
1647: ecode += 2 + 2*LINK_SIZE;
1648: break;
1649:
1650: /* Recursion either matches the current regex, or some subexpression. The
1651: offset data is the offset to the starting bracket from the start of the
1652: whole pattern. (This is so that it works from duplicated subpatterns.)
1653:
1654: The state of the capturing groups is preserved over recursion, and
1655: re-instated afterwards. We don't know how many are started and not yet
1656: finished (offset_top records the completed total) so we just have to save
1657: all the potential data. There may be up to 65535 such values, which is too
1658: large to put on the stack, but using malloc for small numbers seems
1659: expensive. As a compromise, the stack is used when there are no more than
1660: REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1661:
1662: There are also other values that have to be saved. We use a chained
1663: sequence of blocks that actually live on the stack. Thanks to Robin Houston
1664: for the original version of this logic. It has, however, been hacked around
1665: a lot, so he is not to blame for the current way it works. */
1666:
1667: case OP_RECURSE:
1668: {
1669: recursion_info *ri;
1670: int recno;
1671:
1672: callpat = md->start_code + GET(ecode, 1);
1673: recno = (callpat == md->start_code)? 0 :
1674: GET2(callpat, 1 + LINK_SIZE);
1675:
1676: /* Check for repeating a recursion without advancing the subject pointer.
1677: This should catch convoluted mutual recursions. (Some simple cases are
1678: caught at compile time.) */
1679:
1680: for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1681: if (recno == ri->group_num && eptr == ri->subject_position)
1682: RRETURN(PCRE_ERROR_RECURSELOOP);
1683:
1684: /* Add to "recursing stack" */
1685:
1686: new_recursive.group_num = recno;
1687: new_recursive.subject_position = eptr;
1688: new_recursive.prevrec = md->recursive;
1689: md->recursive = &new_recursive;
1690:
1691: /* Where to continue from afterwards */
1692:
1693: ecode += 1 + LINK_SIZE;
1694:
1695: /* Now save the offset data */
1696:
1697: new_recursive.saved_max = md->offset_end;
1698: if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1699: new_recursive.offset_save = stacksave;
1700: else
1701: {
1702: new_recursive.offset_save =
1703: (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1704: if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1705: }
1706: memcpy(new_recursive.offset_save, md->offset_vector,
1707: new_recursive.saved_max * sizeof(int));
1708:
1709: /* OK, now we can do the recursion. After processing each alternative,
1710: restore the offset data. If there were nested recursions, md->recursive
1711: might be changed, so reset it before looping. */
1712:
1713: DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1714: cbegroup = (*callpat >= OP_SBRA);
1715: do
1716: {
1717: if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1718: RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1719: md, eptrb, RM6);
1720: memcpy(md->offset_vector, new_recursive.offset_save,
1721: new_recursive.saved_max * sizeof(int));
1722: md->recursive = new_recursive.prevrec;
1723: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1724: {
1725: DPRINTF(("Recursion matched\n"));
1726: if (new_recursive.offset_save != stacksave)
1727: (pcre_free)(new_recursive.offset_save);
1728:
1729: /* Set where we got to in the subject, and reset the start in case
1730: it was changed by \K. This *is* propagated back out of a recursion,
1731: for Perl compatibility. */
1732:
1733: eptr = md->end_match_ptr;
1734: mstart = md->start_match_ptr;
1735: goto RECURSION_MATCHED; /* Exit loop; end processing */
1736: }
1737:
1738: /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1739: as NOMATCH. */
1740:
1741: else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1742: {
1743: DPRINTF(("Recursion gave error %d\n", rrc));
1744: if (new_recursive.offset_save != stacksave)
1745: (pcre_free)(new_recursive.offset_save);
1746: RRETURN(rrc);
1747: }
1748:
1749: md->recursive = &new_recursive;
1750: callpat += GET(callpat, 1);
1751: }
1752: while (*callpat == OP_ALT);
1753:
1754: DPRINTF(("Recursion didn't match\n"));
1755: md->recursive = new_recursive.prevrec;
1756: if (new_recursive.offset_save != stacksave)
1757: (pcre_free)(new_recursive.offset_save);
1758: RRETURN(MATCH_NOMATCH);
1759: }
1760:
1761: RECURSION_MATCHED:
1762: break;
1763:
1764: /* An alternation is the end of a branch; scan along to find the end of the
1765: bracketed group and go to there. */
1766:
1767: case OP_ALT:
1768: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1769: break;
1770:
1771: /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1772: indicating that it may occur zero times. It may repeat infinitely, or not
1773: at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1774: with fixed upper repeat limits are compiled as a number of copies, with the
1775: optional ones preceded by BRAZERO or BRAMINZERO. */
1776:
1777: case OP_BRAZERO:
1778: next = ecode + 1;
1779: RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1780: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1781: do next += GET(next, 1); while (*next == OP_ALT);
1782: ecode = next + 1 + LINK_SIZE;
1783: break;
1784:
1785: case OP_BRAMINZERO:
1786: next = ecode + 1;
1787: do next += GET(next, 1); while (*next == OP_ALT);
1788: RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1789: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1790: ecode++;
1791: break;
1792:
1793: case OP_SKIPZERO:
1794: next = ecode+1;
1795: do next += GET(next,1); while (*next == OP_ALT);
1796: ecode = next + 1 + LINK_SIZE;
1797: break;
1798:
1799: /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1800: here; just jump to the group, with allow_zero set TRUE. */
1801:
1802: case OP_BRAPOSZERO:
1803: op = *(++ecode);
1804: allow_zero = TRUE;
1805: if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1806: goto POSSESSIVE_NON_CAPTURE;
1807:
1808: /* End of a group, repeated or non-repeating. */
1809:
1810: case OP_KET:
1811: case OP_KETRMIN:
1812: case OP_KETRMAX:
1813: case OP_KETRPOS:
1814: prev = ecode - GET(ecode, 1);
1815:
1816: /* If this was a group that remembered the subject start, in order to break
1817: infinite repeats of empty string matches, retrieve the subject start from
1818: the chain. Otherwise, set it NULL. */
1819:
1820: if (*prev >= OP_SBRA || *prev == OP_ONCE)
1821: {
1822: saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1823: eptrb = eptrb->epb_prev; /* Backup to previous group */
1824: }
1825: else saved_eptr = NULL;
1826:
1827: /* If we are at the end of an assertion group or a non-capturing atomic
1828: group, stop matching and return MATCH_MATCH, but record the current high
1829: water mark for use by positive assertions. We also need to record the match
1830: start in case it was changed by \K. */
1831:
1832: if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1833: *prev == OP_ONCE_NC)
1834: {
1835: md->end_match_ptr = eptr; /* For ONCE_NC */
1836: md->end_offset_top = offset_top;
1837: md->start_match_ptr = mstart;
1838: RRETURN(MATCH_MATCH); /* Sets md->mark */
1839: }
1840:
1841: /* For capturing groups we have to check the group number back at the start
1842: and if necessary complete handling an extraction by setting the offsets and
1843: bumping the high water mark. Whole-pattern recursion is coded as a recurse
1844: into group 0, so it won't be picked up here. Instead, we catch it when the
1845: OP_END is reached. Other recursion is handled here. We just have to record
1846: the current subject position and start match pointer and give a MATCH
1847: return. */
1848:
1849: if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1850: *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1851: {
1852: number = GET2(prev, 1+LINK_SIZE);
1853: offset = number << 1;
1854:
1855: #ifdef PCRE_DEBUG
1856: printf("end bracket %d", number);
1857: printf("\n");
1858: #endif
1859:
1860: /* Handle a recursively called group. */
1861:
1862: if (md->recursive != NULL && md->recursive->group_num == number)
1863: {
1864: md->end_match_ptr = eptr;
1865: md->start_match_ptr = mstart;
1866: RRETURN(MATCH_MATCH);
1867: }
1868:
1869: /* Deal with capturing */
1870:
1871: md->capture_last = number;
1872: if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1873: {
1874: /* If offset is greater than offset_top, it means that we are
1875: "skipping" a capturing group, and that group's offsets must be marked
1876: unset. In earlier versions of PCRE, all the offsets were unset at the
1877: start of matching, but this doesn't work because atomic groups and
1878: assertions can cause a value to be set that should later be unset.
1879: Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1880: part of the atomic group, but this is not on the final matching path,
1881: so must be unset when 2 is set. (If there is no group 2, there is no
1882: problem, because offset_top will then be 2, indicating no capture.) */
1883:
1884: if (offset > offset_top)
1885: {
1886: register int *iptr = md->offset_vector + offset_top;
1887: register int *iend = md->offset_vector + offset;
1888: while (iptr < iend) *iptr++ = -1;
1889: }
1890:
1891: /* Now make the extraction */
1892:
1893: md->offset_vector[offset] =
1894: md->offset_vector[md->offset_end - number];
1895: md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1896: if (offset_top <= offset) offset_top = offset + 2;
1897: }
1898: }
1899:
1900: /* For an ordinary non-repeating ket, just continue at this level. This
1901: also happens for a repeating ket if no characters were matched in the
1902: group. This is the forcible breaking of infinite loops as implemented in
1903: Perl 5.005. For a non-repeating atomic group that includes captures,
1904: establish a backup point by processing the rest of the pattern at a lower
1905: level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1906: original OP_ONCE level, thereby bypassing intermediate backup points, but
1907: resetting any captures that happened along the way. */
1908:
1909: if (*ecode == OP_KET || eptr == saved_eptr)
1910: {
1911: if (*prev == OP_ONCE)
1912: {
1913: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1914: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1915: md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1916: RRETURN(MATCH_ONCE);
1917: }
1918: ecode += 1 + LINK_SIZE; /* Carry on at this level */
1919: break;
1920: }
1921:
1922: /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1923: and return MATCH_KETRPOS. This makes it possible to do the repeats one
1924: at a time from the outer level, thus saving stack. */
1925:
1926: if (*ecode == OP_KETRPOS)
1927: {
1928: md->end_match_ptr = eptr;
1929: md->end_offset_top = offset_top;
1930: RRETURN(MATCH_KETRPOS);
1931: }
1932:
1933: /* The normal repeating kets try the rest of the pattern or restart from
1934: the preceding bracket, in the appropriate order. In the second case, we can
1935: use tail recursion to avoid using another stack frame, unless we have an
1936: atomic group or an unlimited repeat of a group that can match an empty
1937: string. */
1938:
1939: if (*ecode == OP_KETRMIN)
1940: {
1941: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1942: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1943: if (*prev == OP_ONCE)
1944: {
1945: RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1946: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1947: md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1948: RRETURN(MATCH_ONCE);
1949: }
1950: if (*prev >= OP_SBRA) /* Could match an empty string */
1951: {
1952: md->match_function_type = MATCH_CBEGROUP;
1953: RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1954: RRETURN(rrc);
1955: }
1956: ecode = prev;
1957: goto TAIL_RECURSE;
1958: }
1959: else /* OP_KETRMAX */
1960: {
1961: if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1962: RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1963: if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1964: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1965: if (*prev == OP_ONCE)
1966: {
1967: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1968: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1969: md->once_target = prev;
1970: RRETURN(MATCH_ONCE);
1971: }
1972: ecode += 1 + LINK_SIZE;
1973: goto TAIL_RECURSE;
1974: }
1975: /* Control never gets here */
1976:
1977: /* Not multiline mode: start of subject assertion, unless notbol. */
1978:
1979: case OP_CIRC:
1980: if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1981:
1982: /* Start of subject assertion */
1983:
1984: case OP_SOD:
1985: if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1986: ecode++;
1987: break;
1988:
1989: /* Multiline mode: start of subject unless notbol, or after any newline. */
1990:
1991: case OP_CIRCM:
1992: if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1993: if (eptr != md->start_subject &&
1994: (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1995: RRETURN(MATCH_NOMATCH);
1996: ecode++;
1997: break;
1998:
1999: /* Start of match assertion */
2000:
2001: case OP_SOM:
2002: if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2003: ecode++;
2004: break;
2005:
2006: /* Reset the start of match point */
2007:
2008: case OP_SET_SOM:
2009: mstart = eptr;
2010: ecode++;
2011: break;
2012:
2013: /* Multiline mode: assert before any newline, or before end of subject
2014: unless noteol is set. */
2015:
2016: case OP_DOLLM:
2017: if (eptr < md->end_subject)
2018: { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
2019: else
2020: {
2021: if (md->noteol) RRETURN(MATCH_NOMATCH);
2022: SCHECK_PARTIAL();
2023: }
2024: ecode++;
2025: break;
2026:
2027: /* Not multiline mode: assert before a terminating newline or before end of
2028: subject unless noteol is set. */
2029:
2030: case OP_DOLL:
2031: if (md->noteol) RRETURN(MATCH_NOMATCH);
2032: if (!md->endonly) goto ASSERT_NL_OR_EOS;
2033:
2034: /* ... else fall through for endonly */
2035:
2036: /* End of subject assertion (\z) */
2037:
2038: case OP_EOD:
2039: if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2040: SCHECK_PARTIAL();
2041: ecode++;
2042: break;
2043:
2044: /* End of subject or ending \n assertion (\Z) */
2045:
2046: case OP_EODN:
2047: ASSERT_NL_OR_EOS:
2048: if (eptr < md->end_subject &&
2049: (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2050: RRETURN(MATCH_NOMATCH);
2051:
2052: /* Either at end of string or \n before end. */
2053:
2054: SCHECK_PARTIAL();
2055: ecode++;
2056: break;
2057:
2058: /* Word boundary assertions */
2059:
2060: case OP_NOT_WORD_BOUNDARY:
2061: case OP_WORD_BOUNDARY:
2062: {
2063:
2064: /* Find out if the previous and current characters are "word" characters.
2065: It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2066: be "non-word" characters. Remember the earliest consulted character for
2067: partial matching. */
2068:
2069: #ifdef SUPPORT_UTF8
2070: if (utf8)
2071: {
2072: /* Get status of previous character */
2073:
2074: if (eptr == md->start_subject) prev_is_word = FALSE; else
2075: {
2076: USPTR lastptr = eptr - 1;
2077: while((*lastptr & 0xc0) == 0x80) lastptr--;
2078: if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2079: GETCHAR(c, lastptr);
2080: #ifdef SUPPORT_UCP
2081: if (md->use_ucp)
2082: {
2083: if (c == '_') prev_is_word = TRUE; else
2084: {
2085: int cat = UCD_CATEGORY(c);
2086: prev_is_word = (cat == ucp_L || cat == ucp_N);
2087: }
2088: }
2089: else
2090: #endif
2091: prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2092: }
2093:
2094: /* Get status of next character */
2095:
2096: if (eptr >= md->end_subject)
2097: {
2098: SCHECK_PARTIAL();
2099: cur_is_word = FALSE;
2100: }
2101: else
2102: {
2103: GETCHAR(c, eptr);
2104: #ifdef SUPPORT_UCP
2105: if (md->use_ucp)
2106: {
2107: if (c == '_') cur_is_word = TRUE; else
2108: {
2109: int cat = UCD_CATEGORY(c);
2110: cur_is_word = (cat == ucp_L || cat == ucp_N);
2111: }
2112: }
2113: else
2114: #endif
2115: cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2116: }
2117: }
2118: else
2119: #endif
2120:
2121: /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2122: consistency with the behaviour of \w we do use it in this case. */
2123:
2124: {
2125: /* Get status of previous character */
2126:
2127: if (eptr == md->start_subject) prev_is_word = FALSE; else
2128: {
2129: if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2130: #ifdef SUPPORT_UCP
2131: if (md->use_ucp)
2132: {
2133: c = eptr[-1];
2134: if (c == '_') prev_is_word = TRUE; else
2135: {
2136: int cat = UCD_CATEGORY(c);
2137: prev_is_word = (cat == ucp_L || cat == ucp_N);
2138: }
2139: }
2140: else
2141: #endif
2142: prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2143: }
2144:
2145: /* Get status of next character */
2146:
2147: if (eptr >= md->end_subject)
2148: {
2149: SCHECK_PARTIAL();
2150: cur_is_word = FALSE;
2151: }
2152: else
2153: #ifdef SUPPORT_UCP
2154: if (md->use_ucp)
2155: {
2156: c = *eptr;
2157: if (c == '_') cur_is_word = TRUE; else
2158: {
2159: int cat = UCD_CATEGORY(c);
2160: cur_is_word = (cat == ucp_L || cat == ucp_N);
2161: }
2162: }
2163: else
2164: #endif
2165: cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2166: }
2167:
2168: /* Now see if the situation is what we want */
2169:
2170: if ((*ecode++ == OP_WORD_BOUNDARY)?
2171: cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2172: RRETURN(MATCH_NOMATCH);
2173: }
2174: break;
2175:
2176: /* Match a single character type; inline for speed */
2177:
2178: case OP_ANY:
2179: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2180: /* Fall through */
2181:
2182: case OP_ALLANY:
2183: if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2184: { /* not be updated before SCHECK_PARTIAL. */
2185: SCHECK_PARTIAL();
2186: RRETURN(MATCH_NOMATCH);
2187: }
2188: eptr++;
2189: if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2190: ecode++;
2191: break;
2192:
2193: /* Match a single byte, even in UTF-8 mode. This opcode really does match
2194: any byte, even newline, independent of the setting of PCRE_DOTALL. */
2195:
2196: case OP_ANYBYTE:
2197: if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2198: { /* not be updated before SCHECK_PARTIAL. */
2199: SCHECK_PARTIAL();
2200: RRETURN(MATCH_NOMATCH);
2201: }
2202: eptr++;
2203: ecode++;
2204: break;
2205:
2206: case OP_NOT_DIGIT:
2207: if (eptr >= md->end_subject)
2208: {
2209: SCHECK_PARTIAL();
2210: RRETURN(MATCH_NOMATCH);
2211: }
2212: GETCHARINCTEST(c, eptr);
2213: if (
2214: #ifdef SUPPORT_UTF8
2215: c < 256 &&
2216: #endif
2217: (md->ctypes[c] & ctype_digit) != 0
2218: )
2219: RRETURN(MATCH_NOMATCH);
2220: ecode++;
2221: break;
2222:
2223: case OP_DIGIT:
2224: if (eptr >= md->end_subject)
2225: {
2226: SCHECK_PARTIAL();
2227: RRETURN(MATCH_NOMATCH);
2228: }
2229: GETCHARINCTEST(c, eptr);
2230: if (
2231: #ifdef SUPPORT_UTF8
2232: c >= 256 ||
2233: #endif
2234: (md->ctypes[c] & ctype_digit) == 0
2235: )
2236: RRETURN(MATCH_NOMATCH);
2237: ecode++;
2238: break;
2239:
2240: case OP_NOT_WHITESPACE:
2241: if (eptr >= md->end_subject)
2242: {
2243: SCHECK_PARTIAL();
2244: RRETURN(MATCH_NOMATCH);
2245: }
2246: GETCHARINCTEST(c, eptr);
2247: if (
2248: #ifdef SUPPORT_UTF8
2249: c < 256 &&
2250: #endif
2251: (md->ctypes[c] & ctype_space) != 0
2252: )
2253: RRETURN(MATCH_NOMATCH);
2254: ecode++;
2255: break;
2256:
2257: case OP_WHITESPACE:
2258: if (eptr >= md->end_subject)
2259: {
2260: SCHECK_PARTIAL();
2261: RRETURN(MATCH_NOMATCH);
2262: }
2263: GETCHARINCTEST(c, eptr);
2264: if (
2265: #ifdef SUPPORT_UTF8
2266: c >= 256 ||
2267: #endif
2268: (md->ctypes[c] & ctype_space) == 0
2269: )
2270: RRETURN(MATCH_NOMATCH);
2271: ecode++;
2272: break;
2273:
2274: case OP_NOT_WORDCHAR:
2275: if (eptr >= md->end_subject)
2276: {
2277: SCHECK_PARTIAL();
2278: RRETURN(MATCH_NOMATCH);
2279: }
2280: GETCHARINCTEST(c, eptr);
2281: if (
2282: #ifdef SUPPORT_UTF8
2283: c < 256 &&
2284: #endif
2285: (md->ctypes[c] & ctype_word) != 0
2286: )
2287: RRETURN(MATCH_NOMATCH);
2288: ecode++;
2289: break;
2290:
2291: case OP_WORDCHAR:
2292: if (eptr >= md->end_subject)
2293: {
2294: SCHECK_PARTIAL();
2295: RRETURN(MATCH_NOMATCH);
2296: }
2297: GETCHARINCTEST(c, eptr);
2298: if (
2299: #ifdef SUPPORT_UTF8
2300: c >= 256 ||
2301: #endif
2302: (md->ctypes[c] & ctype_word) == 0
2303: )
2304: RRETURN(MATCH_NOMATCH);
2305: ecode++;
2306: break;
2307:
2308: case OP_ANYNL:
2309: if (eptr >= md->end_subject)
2310: {
2311: SCHECK_PARTIAL();
2312: RRETURN(MATCH_NOMATCH);
2313: }
2314: GETCHARINCTEST(c, eptr);
2315: switch(c)
2316: {
2317: default: RRETURN(MATCH_NOMATCH);
2318:
2319: case 0x000d:
2320: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2321: break;
2322:
2323: case 0x000a:
2324: break;
2325:
2326: case 0x000b:
2327: case 0x000c:
2328: case 0x0085:
2329: case 0x2028:
2330: case 0x2029:
2331: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2332: break;
2333: }
2334: ecode++;
2335: break;
2336:
2337: case OP_NOT_HSPACE:
2338: if (eptr >= md->end_subject)
2339: {
2340: SCHECK_PARTIAL();
2341: RRETURN(MATCH_NOMATCH);
2342: }
2343: GETCHARINCTEST(c, eptr);
2344: switch(c)
2345: {
2346: default: break;
2347: case 0x09: /* HT */
2348: case 0x20: /* SPACE */
2349: case 0xa0: /* NBSP */
2350: case 0x1680: /* OGHAM SPACE MARK */
2351: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2352: case 0x2000: /* EN QUAD */
2353: case 0x2001: /* EM QUAD */
2354: case 0x2002: /* EN SPACE */
2355: case 0x2003: /* EM SPACE */
2356: case 0x2004: /* THREE-PER-EM SPACE */
2357: case 0x2005: /* FOUR-PER-EM SPACE */
2358: case 0x2006: /* SIX-PER-EM SPACE */
2359: case 0x2007: /* FIGURE SPACE */
2360: case 0x2008: /* PUNCTUATION SPACE */
2361: case 0x2009: /* THIN SPACE */
2362: case 0x200A: /* HAIR SPACE */
2363: case 0x202f: /* NARROW NO-BREAK SPACE */
2364: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2365: case 0x3000: /* IDEOGRAPHIC SPACE */
2366: RRETURN(MATCH_NOMATCH);
2367: }
2368: ecode++;
2369: break;
2370:
2371: case OP_HSPACE:
2372: if (eptr >= md->end_subject)
2373: {
2374: SCHECK_PARTIAL();
2375: RRETURN(MATCH_NOMATCH);
2376: }
2377: GETCHARINCTEST(c, eptr);
2378: switch(c)
2379: {
2380: default: RRETURN(MATCH_NOMATCH);
2381: case 0x09: /* HT */
2382: case 0x20: /* SPACE */
2383: case 0xa0: /* NBSP */
2384: case 0x1680: /* OGHAM SPACE MARK */
2385: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2386: case 0x2000: /* EN QUAD */
2387: case 0x2001: /* EM QUAD */
2388: case 0x2002: /* EN SPACE */
2389: case 0x2003: /* EM SPACE */
2390: case 0x2004: /* THREE-PER-EM SPACE */
2391: case 0x2005: /* FOUR-PER-EM SPACE */
2392: case 0x2006: /* SIX-PER-EM SPACE */
2393: case 0x2007: /* FIGURE SPACE */
2394: case 0x2008: /* PUNCTUATION SPACE */
2395: case 0x2009: /* THIN SPACE */
2396: case 0x200A: /* HAIR SPACE */
2397: case 0x202f: /* NARROW NO-BREAK SPACE */
2398: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2399: case 0x3000: /* IDEOGRAPHIC SPACE */
2400: break;
2401: }
2402: ecode++;
2403: break;
2404:
2405: case OP_NOT_VSPACE:
2406: if (eptr >= md->end_subject)
2407: {
2408: SCHECK_PARTIAL();
2409: RRETURN(MATCH_NOMATCH);
2410: }
2411: GETCHARINCTEST(c, eptr);
2412: switch(c)
2413: {
2414: default: break;
2415: case 0x0a: /* LF */
2416: case 0x0b: /* VT */
2417: case 0x0c: /* FF */
2418: case 0x0d: /* CR */
2419: case 0x85: /* NEL */
2420: case 0x2028: /* LINE SEPARATOR */
2421: case 0x2029: /* PARAGRAPH SEPARATOR */
2422: RRETURN(MATCH_NOMATCH);
2423: }
2424: ecode++;
2425: break;
2426:
2427: case OP_VSPACE:
2428: if (eptr >= md->end_subject)
2429: {
2430: SCHECK_PARTIAL();
2431: RRETURN(MATCH_NOMATCH);
2432: }
2433: GETCHARINCTEST(c, eptr);
2434: switch(c)
2435: {
2436: default: RRETURN(MATCH_NOMATCH);
2437: case 0x0a: /* LF */
2438: case 0x0b: /* VT */
2439: case 0x0c: /* FF */
2440: case 0x0d: /* CR */
2441: case 0x85: /* NEL */
2442: case 0x2028: /* LINE SEPARATOR */
2443: case 0x2029: /* PARAGRAPH SEPARATOR */
2444: break;
2445: }
2446: ecode++;
2447: break;
2448:
2449: #ifdef SUPPORT_UCP
2450: /* Check the next character by Unicode property. We will get here only
2451: if the support is in the binary; otherwise a compile-time error occurs. */
2452:
2453: case OP_PROP:
2454: case OP_NOTPROP:
2455: if (eptr >= md->end_subject)
2456: {
2457: SCHECK_PARTIAL();
2458: RRETURN(MATCH_NOMATCH);
2459: }
2460: GETCHARINCTEST(c, eptr);
2461: {
2462: const ucd_record *prop = GET_UCD(c);
2463:
2464: switch(ecode[1])
2465: {
2466: case PT_ANY:
2467: if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2468: break;
2469:
2470: case PT_LAMP:
2471: if ((prop->chartype == ucp_Lu ||
2472: prop->chartype == ucp_Ll ||
2473: prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2474: RRETURN(MATCH_NOMATCH);
2475: break;
2476:
2477: case PT_GC:
2478: if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2479: RRETURN(MATCH_NOMATCH);
2480: break;
2481:
2482: case PT_PC:
2483: if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2484: RRETURN(MATCH_NOMATCH);
2485: break;
2486:
2487: case PT_SC:
2488: if ((ecode[2] != prop->script) == (op == OP_PROP))
2489: RRETURN(MATCH_NOMATCH);
2490: break;
2491:
2492: /* These are specials */
2493:
2494: case PT_ALNUM:
2495: if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2496: _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2497: RRETURN(MATCH_NOMATCH);
2498: break;
2499:
2500: case PT_SPACE: /* Perl space */
2501: if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2502: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2503: == (op == OP_NOTPROP))
2504: RRETURN(MATCH_NOMATCH);
2505: break;
2506:
2507: case PT_PXSPACE: /* POSIX space */
2508: if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2509: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2510: c == CHAR_FF || c == CHAR_CR)
2511: == (op == OP_NOTPROP))
2512: RRETURN(MATCH_NOMATCH);
2513: break;
2514:
2515: case PT_WORD:
2516: if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2517: _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2518: c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2519: RRETURN(MATCH_NOMATCH);
2520: break;
2521:
2522: /* This should never occur */
2523:
2524: default:
2525: RRETURN(PCRE_ERROR_INTERNAL);
2526: }
2527:
2528: ecode += 3;
2529: }
2530: break;
2531:
2532: /* Match an extended Unicode sequence. We will get here only if the support
2533: is in the binary; otherwise a compile-time error occurs. */
2534:
2535: case OP_EXTUNI:
2536: if (eptr >= md->end_subject)
2537: {
2538: SCHECK_PARTIAL();
2539: RRETURN(MATCH_NOMATCH);
2540: }
2541: GETCHARINCTEST(c, eptr);
2542: if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2543: while (eptr < md->end_subject)
2544: {
2545: int len = 1;
2546: if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2547: if (UCD_CATEGORY(c) != ucp_M) break;
2548: eptr += len;
2549: }
2550: ecode++;
2551: break;
2552: #endif
2553:
2554:
2555: /* Match a back reference, possibly repeatedly. Look past the end of the
2556: item to see if there is repeat information following. The code is similar
2557: to that for character classes, but repeated for efficiency. Then obey
2558: similar code to character type repeats - written out again for speed.
2559: However, if the referenced string is the empty string, always treat
2560: it as matched, any number of times (otherwise there could be infinite
2561: loops). */
2562:
2563: case OP_REF:
2564: case OP_REFI:
2565: caseless = op == OP_REFI;
2566: offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2567: ecode += 3;
2568:
2569: /* If the reference is unset, there are two possibilities:
2570:
2571: (a) In the default, Perl-compatible state, set the length negative;
2572: this ensures that every attempt at a match fails. We can't just fail
2573: here, because of the possibility of quantifiers with zero minima.
2574:
2575: (b) If the JavaScript compatibility flag is set, set the length to zero
2576: so that the back reference matches an empty string.
2577:
2578: Otherwise, set the length to the length of what was matched by the
2579: referenced subpattern. */
2580:
2581: if (offset >= offset_top || md->offset_vector[offset] < 0)
2582: length = (md->jscript_compat)? 0 : -1;
2583: else
2584: length = md->offset_vector[offset+1] - md->offset_vector[offset];
2585:
2586: /* Set up for repetition, or handle the non-repeated case */
2587:
2588: switch (*ecode)
2589: {
2590: case OP_CRSTAR:
2591: case OP_CRMINSTAR:
2592: case OP_CRPLUS:
2593: case OP_CRMINPLUS:
2594: case OP_CRQUERY:
2595: case OP_CRMINQUERY:
2596: c = *ecode++ - OP_CRSTAR;
2597: minimize = (c & 1) != 0;
2598: min = rep_min[c]; /* Pick up values from tables; */
2599: max = rep_max[c]; /* zero for max => infinity */
2600: if (max == 0) max = INT_MAX;
2601: break;
2602:
2603: case OP_CRRANGE:
2604: case OP_CRMINRANGE:
2605: minimize = (*ecode == OP_CRMINRANGE);
2606: min = GET2(ecode, 1);
2607: max = GET2(ecode, 3);
2608: if (max == 0) max = INT_MAX;
2609: ecode += 5;
2610: break;
2611:
2612: default: /* No repeat follows */
2613: if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2614: {
2615: CHECK_PARTIAL();
2616: RRETURN(MATCH_NOMATCH);
2617: }
2618: eptr += length;
2619: continue; /* With the main loop */
2620: }
2621:
2622: /* Handle repeated back references. If the length of the reference is
2623: zero, just continue with the main loop. */
2624:
2625: if (length == 0) continue;
2626:
2627: /* First, ensure the minimum number of matches are present. We get back
2628: the length of the reference string explicitly rather than passing the
2629: address of eptr, so that eptr can be a register variable. */
2630:
2631: for (i = 1; i <= min; i++)
2632: {
2633: int slength;
2634: if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2635: {
2636: CHECK_PARTIAL();
2637: RRETURN(MATCH_NOMATCH);
2638: }
2639: eptr += slength;
2640: }
2641:
2642: /* If min = max, continue at the same level without recursion.
2643: They are not both allowed to be zero. */
2644:
2645: if (min == max) continue;
2646:
2647: /* If minimizing, keep trying and advancing the pointer */
2648:
2649: if (minimize)
2650: {
2651: for (fi = min;; fi++)
2652: {
2653: int slength;
2654: RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2655: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2656: if (fi >= max) RRETURN(MATCH_NOMATCH);
2657: if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2658: {
2659: CHECK_PARTIAL();
2660: RRETURN(MATCH_NOMATCH);
2661: }
2662: eptr += slength;
2663: }
2664: /* Control never gets here */
2665: }
2666:
2667: /* If maximizing, find the longest string and work backwards */
2668:
2669: else
2670: {
2671: pp = eptr;
2672: for (i = min; i < max; i++)
2673: {
2674: int slength;
2675: if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2676: {
2677: CHECK_PARTIAL();
2678: break;
2679: }
2680: eptr += slength;
2681: }
2682: while (eptr >= pp)
2683: {
2684: RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2685: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2686: eptr -= length;
2687: }
2688: RRETURN(MATCH_NOMATCH);
2689: }
2690: /* Control never gets here */
2691:
2692: /* Match a bit-mapped character class, possibly repeatedly. This op code is
2693: used when all the characters in the class have values in the range 0-255,
2694: and either the matching is caseful, or the characters are in the range
2695: 0-127 when UTF-8 processing is enabled. The only difference between
2696: OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2697: encountered.
2698:
2699: First, look past the end of the item to see if there is repeat information
2700: following. Then obey similar code to character type repeats - written out
2701: again for speed. */
2702:
2703: case OP_NCLASS:
2704: case OP_CLASS:
2705: {
2706: data = ecode + 1; /* Save for matching */
2707: ecode += 33; /* Advance past the item */
2708:
2709: switch (*ecode)
2710: {
2711: case OP_CRSTAR:
2712: case OP_CRMINSTAR:
2713: case OP_CRPLUS:
2714: case OP_CRMINPLUS:
2715: case OP_CRQUERY:
2716: case OP_CRMINQUERY:
2717: c = *ecode++ - OP_CRSTAR;
2718: minimize = (c & 1) != 0;
2719: min = rep_min[c]; /* Pick up values from tables; */
2720: max = rep_max[c]; /* zero for max => infinity */
2721: if (max == 0) max = INT_MAX;
2722: break;
2723:
2724: case OP_CRRANGE:
2725: case OP_CRMINRANGE:
2726: minimize = (*ecode == OP_CRMINRANGE);
2727: min = GET2(ecode, 1);
2728: max = GET2(ecode, 3);
2729: if (max == 0) max = INT_MAX;
2730: ecode += 5;
2731: break;
2732:
2733: default: /* No repeat follows */
2734: min = max = 1;
2735: break;
2736: }
2737:
2738: /* First, ensure the minimum number of matches are present. */
2739:
2740: #ifdef SUPPORT_UTF8
2741: /* UTF-8 mode */
2742: if (utf8)
2743: {
2744: for (i = 1; i <= min; i++)
2745: {
2746: if (eptr >= md->end_subject)
2747: {
2748: SCHECK_PARTIAL();
2749: RRETURN(MATCH_NOMATCH);
2750: }
2751: GETCHARINC(c, eptr);
2752: if (c > 255)
2753: {
2754: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2755: }
2756: else
2757: {
2758: if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2759: }
2760: }
2761: }
2762: else
2763: #endif
2764: /* Not UTF-8 mode */
2765: {
2766: for (i = 1; i <= min; i++)
2767: {
2768: if (eptr >= md->end_subject)
2769: {
2770: SCHECK_PARTIAL();
2771: RRETURN(MATCH_NOMATCH);
2772: }
2773: c = *eptr++;
2774: if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2775: }
2776: }
2777:
2778: /* If max == min we can continue with the main loop without the
2779: need to recurse. */
2780:
2781: if (min == max) continue;
2782:
2783: /* If minimizing, keep testing the rest of the expression and advancing
2784: the pointer while it matches the class. */
2785:
2786: if (minimize)
2787: {
2788: #ifdef SUPPORT_UTF8
2789: /* UTF-8 mode */
2790: if (utf8)
2791: {
2792: for (fi = min;; fi++)
2793: {
2794: RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2795: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2796: if (fi >= max) RRETURN(MATCH_NOMATCH);
2797: if (eptr >= md->end_subject)
2798: {
2799: SCHECK_PARTIAL();
2800: RRETURN(MATCH_NOMATCH);
2801: }
2802: GETCHARINC(c, eptr);
2803: if (c > 255)
2804: {
2805: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2806: }
2807: else
2808: {
2809: if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2810: }
2811: }
2812: }
2813: else
2814: #endif
2815: /* Not UTF-8 mode */
2816: {
2817: for (fi = min;; fi++)
2818: {
2819: RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2820: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2821: if (fi >= max) RRETURN(MATCH_NOMATCH);
2822: if (eptr >= md->end_subject)
2823: {
2824: SCHECK_PARTIAL();
2825: RRETURN(MATCH_NOMATCH);
2826: }
2827: c = *eptr++;
2828: if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2829: }
2830: }
2831: /* Control never gets here */
2832: }
2833:
2834: /* If maximizing, find the longest possible run, then work backwards. */
2835:
2836: else
2837: {
2838: pp = eptr;
2839:
2840: #ifdef SUPPORT_UTF8
2841: /* UTF-8 mode */
2842: if (utf8)
2843: {
2844: for (i = min; i < max; i++)
2845: {
2846: int len = 1;
2847: if (eptr >= md->end_subject)
2848: {
2849: SCHECK_PARTIAL();
2850: break;
2851: }
2852: GETCHARLEN(c, eptr, len);
2853: if (c > 255)
2854: {
2855: if (op == OP_CLASS) break;
2856: }
2857: else
2858: {
2859: if ((data[c/8] & (1 << (c&7))) == 0) break;
2860: }
2861: eptr += len;
2862: }
2863: for (;;)
2864: {
2865: RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2866: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2867: if (eptr-- == pp) break; /* Stop if tried at original pos */
2868: BACKCHAR(eptr);
2869: }
2870: }
2871: else
2872: #endif
2873: /* Not UTF-8 mode */
2874: {
2875: for (i = min; i < max; i++)
2876: {
2877: if (eptr >= md->end_subject)
2878: {
2879: SCHECK_PARTIAL();
2880: break;
2881: }
2882: c = *eptr;
2883: if ((data[c/8] & (1 << (c&7))) == 0) break;
2884: eptr++;
2885: }
2886: while (eptr >= pp)
2887: {
2888: RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2889: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2890: eptr--;
2891: }
2892: }
2893:
2894: RRETURN(MATCH_NOMATCH);
2895: }
2896: }
2897: /* Control never gets here */
2898:
2899:
2900: /* Match an extended character class. This opcode is encountered only
2901: when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2902: mode, because Unicode properties are supported in non-UTF-8 mode. */
2903:
2904: #ifdef SUPPORT_UTF8
2905: case OP_XCLASS:
2906: {
2907: data = ecode + 1 + LINK_SIZE; /* Save for matching */
2908: ecode += GET(ecode, 1); /* Advance past the item */
2909:
2910: switch (*ecode)
2911: {
2912: case OP_CRSTAR:
2913: case OP_CRMINSTAR:
2914: case OP_CRPLUS:
2915: case OP_CRMINPLUS:
2916: case OP_CRQUERY:
2917: case OP_CRMINQUERY:
2918: c = *ecode++ - OP_CRSTAR;
2919: minimize = (c & 1) != 0;
2920: min = rep_min[c]; /* Pick up values from tables; */
2921: max = rep_max[c]; /* zero for max => infinity */
2922: if (max == 0) max = INT_MAX;
2923: break;
2924:
2925: case OP_CRRANGE:
2926: case OP_CRMINRANGE:
2927: minimize = (*ecode == OP_CRMINRANGE);
2928: min = GET2(ecode, 1);
2929: max = GET2(ecode, 3);
2930: if (max == 0) max = INT_MAX;
2931: ecode += 5;
2932: break;
2933:
2934: default: /* No repeat follows */
2935: min = max = 1;
2936: break;
2937: }
2938:
2939: /* First, ensure the minimum number of matches are present. */
2940:
2941: for (i = 1; i <= min; i++)
2942: {
2943: if (eptr >= md->end_subject)
2944: {
2945: SCHECK_PARTIAL();
2946: RRETURN(MATCH_NOMATCH);
2947: }
2948: GETCHARINCTEST(c, eptr);
2949: if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2950: }
2951:
2952: /* If max == min we can continue with the main loop without the
2953: need to recurse. */
2954:
2955: if (min == max) continue;
2956:
2957: /* If minimizing, keep testing the rest of the expression and advancing
2958: the pointer while it matches the class. */
2959:
2960: if (minimize)
2961: {
2962: for (fi = min;; fi++)
2963: {
2964: RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2965: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2966: if (fi >= max) RRETURN(MATCH_NOMATCH);
2967: if (eptr >= md->end_subject)
2968: {
2969: SCHECK_PARTIAL();
2970: RRETURN(MATCH_NOMATCH);
2971: }
2972: GETCHARINCTEST(c, eptr);
2973: if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2974: }
2975: /* Control never gets here */
2976: }
2977:
2978: /* If maximizing, find the longest possible run, then work backwards. */
2979:
2980: else
2981: {
2982: pp = eptr;
2983: for (i = min; i < max; i++)
2984: {
2985: int len = 1;
2986: if (eptr >= md->end_subject)
2987: {
2988: SCHECK_PARTIAL();
2989: break;
2990: }
2991: GETCHARLENTEST(c, eptr, len);
2992: if (!_pcre_xclass(c, data)) break;
2993: eptr += len;
2994: }
2995: for(;;)
2996: {
2997: RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2998: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2999: if (eptr-- == pp) break; /* Stop if tried at original pos */
3000: if (utf8) BACKCHAR(eptr);
3001: }
3002: RRETURN(MATCH_NOMATCH);
3003: }
3004:
3005: /* Control never gets here */
3006: }
3007: #endif /* End of XCLASS */
3008:
3009: /* Match a single character, casefully */
3010:
3011: case OP_CHAR:
3012: #ifdef SUPPORT_UTF8
3013: if (utf8)
3014: {
3015: length = 1;
3016: ecode++;
3017: GETCHARLEN(fc, ecode, length);
3018: if (length > md->end_subject - eptr)
3019: {
3020: CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3021: RRETURN(MATCH_NOMATCH);
3022: }
3023: while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3024: }
3025: else
3026: #endif
3027:
3028: /* Non-UTF-8 mode */
3029: {
3030: if (md->end_subject - eptr < 1)
3031: {
3032: SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3033: RRETURN(MATCH_NOMATCH);
3034: }
3035: if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3036: ecode += 2;
3037: }
3038: break;
3039:
3040: /* Match a single character, caselessly. If we are at the end of the
3041: subject, give up immediately. */
3042:
3043: case OP_CHARI:
3044: if (eptr >= md->end_subject)
3045: {
3046: SCHECK_PARTIAL();
3047: RRETURN(MATCH_NOMATCH);
3048: }
3049:
3050: #ifdef SUPPORT_UTF8
3051: if (utf8)
3052: {
3053: length = 1;
3054: ecode++;
3055: GETCHARLEN(fc, ecode, length);
3056:
3057: /* If the pattern character's value is < 128, we have only one byte, and
3058: we know that its other case must also be one byte long, so we can use the
3059: fast lookup table. We know that there is at least one byte left in the
3060: subject. */
3061:
3062: if (fc < 128)
3063: {
3064: if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3065: }
3066:
3067: /* Otherwise we must pick up the subject character. Note that we cannot
3068: use the value of "length" to check for sufficient bytes left, because the
3069: other case of the character may have more or fewer bytes. */
3070:
3071: else
3072: {
3073: unsigned int dc;
3074: GETCHARINC(dc, eptr);
3075: ecode += length;
3076:
3077: /* If we have Unicode property support, we can use it to test the other
3078: case of the character, if there is one. */
3079:
3080: if (fc != dc)
3081: {
3082: #ifdef SUPPORT_UCP
3083: if (dc != UCD_OTHERCASE(fc))
3084: #endif
3085: RRETURN(MATCH_NOMATCH);
3086: }
3087: }
3088: }
3089: else
3090: #endif /* SUPPORT_UTF8 */
3091:
3092: /* Non-UTF-8 mode */
3093: {
3094: if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3095: ecode += 2;
3096: }
3097: break;
3098:
3099: /* Match a single character repeatedly. */
3100:
3101: case OP_EXACT:
3102: case OP_EXACTI:
3103: min = max = GET2(ecode, 1);
3104: ecode += 3;
3105: goto REPEATCHAR;
3106:
3107: case OP_POSUPTO:
3108: case OP_POSUPTOI:
3109: possessive = TRUE;
3110: /* Fall through */
3111:
3112: case OP_UPTO:
3113: case OP_UPTOI:
3114: case OP_MINUPTO:
3115: case OP_MINUPTOI:
3116: min = 0;
3117: max = GET2(ecode, 1);
3118: minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3119: ecode += 3;
3120: goto REPEATCHAR;
3121:
3122: case OP_POSSTAR:
3123: case OP_POSSTARI:
3124: possessive = TRUE;
3125: min = 0;
3126: max = INT_MAX;
3127: ecode++;
3128: goto REPEATCHAR;
3129:
3130: case OP_POSPLUS:
3131: case OP_POSPLUSI:
3132: possessive = TRUE;
3133: min = 1;
3134: max = INT_MAX;
3135: ecode++;
3136: goto REPEATCHAR;
3137:
3138: case OP_POSQUERY:
3139: case OP_POSQUERYI:
3140: possessive = TRUE;
3141: min = 0;
3142: max = 1;
3143: ecode++;
3144: goto REPEATCHAR;
3145:
3146: case OP_STAR:
3147: case OP_STARI:
3148: case OP_MINSTAR:
3149: case OP_MINSTARI:
3150: case OP_PLUS:
3151: case OP_PLUSI:
3152: case OP_MINPLUS:
3153: case OP_MINPLUSI:
3154: case OP_QUERY:
3155: case OP_QUERYI:
3156: case OP_MINQUERY:
3157: case OP_MINQUERYI:
3158: c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3159: minimize = (c & 1) != 0;
3160: min = rep_min[c]; /* Pick up values from tables; */
3161: max = rep_max[c]; /* zero for max => infinity */
3162: if (max == 0) max = INT_MAX;
3163:
3164: /* Common code for all repeated single-character matches. */
3165:
3166: REPEATCHAR:
3167: #ifdef SUPPORT_UTF8
3168: if (utf8)
3169: {
3170: length = 1;
3171: charptr = ecode;
3172: GETCHARLEN(fc, ecode, length);
3173: ecode += length;
3174:
3175: /* Handle multibyte character matching specially here. There is
3176: support for caseless matching if UCP support is present. */
3177:
3178: if (length > 1)
3179: {
3180: #ifdef SUPPORT_UCP
3181: unsigned int othercase;
3182: if (op >= OP_STARI && /* Caseless */
3183: (othercase = UCD_OTHERCASE(fc)) != fc)
3184: oclength = _pcre_ord2utf8(othercase, occhars);
3185: else oclength = 0;
3186: #endif /* SUPPORT_UCP */
3187:
3188: for (i = 1; i <= min; i++)
3189: {
3190: if (eptr <= md->end_subject - length &&
3191: memcmp(eptr, charptr, length) == 0) eptr += length;
3192: #ifdef SUPPORT_UCP
3193: else if (oclength > 0 &&
3194: eptr <= md->end_subject - oclength &&
3195: memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3196: #endif /* SUPPORT_UCP */
3197: else
3198: {
3199: CHECK_PARTIAL();
3200: RRETURN(MATCH_NOMATCH);
3201: }
3202: }
3203:
3204: if (min == max) continue;
3205:
3206: if (minimize)
3207: {
3208: for (fi = min;; fi++)
3209: {
3210: RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3211: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3212: if (fi >= max) RRETURN(MATCH_NOMATCH);
3213: if (eptr <= md->end_subject - length &&
3214: memcmp(eptr, charptr, length) == 0) eptr += length;
3215: #ifdef SUPPORT_UCP
3216: else if (oclength > 0 &&
3217: eptr <= md->end_subject - oclength &&
3218: memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3219: #endif /* SUPPORT_UCP */
3220: else
3221: {
3222: CHECK_PARTIAL();
3223: RRETURN(MATCH_NOMATCH);
3224: }
3225: }
3226: /* Control never gets here */
3227: }
3228:
3229: else /* Maximize */
3230: {
3231: pp = eptr;
3232: for (i = min; i < max; i++)
3233: {
3234: if (eptr <= md->end_subject - length &&
3235: memcmp(eptr, charptr, length) == 0) eptr += length;
3236: #ifdef SUPPORT_UCP
3237: else if (oclength > 0 &&
3238: eptr <= md->end_subject - oclength &&
3239: memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3240: #endif /* SUPPORT_UCP */
3241: else
3242: {
3243: CHECK_PARTIAL();
3244: break;
3245: }
3246: }
3247:
3248: if (possessive) continue;
3249:
3250: for(;;)
3251: {
3252: RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3253: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3254: if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3255: #ifdef SUPPORT_UCP
3256: eptr--;
3257: BACKCHAR(eptr);
3258: #else /* without SUPPORT_UCP */
3259: eptr -= length;
3260: #endif /* SUPPORT_UCP */
3261: }
3262: }
3263: /* Control never gets here */
3264: }
3265:
3266: /* If the length of a UTF-8 character is 1, we fall through here, and
3267: obey the code as for non-UTF-8 characters below, though in this case the
3268: value of fc will always be < 128. */
3269: }
3270: else
3271: #endif /* SUPPORT_UTF8 */
3272:
3273: /* When not in UTF-8 mode, load a single-byte character. */
3274:
3275: fc = *ecode++;
3276:
3277: /* The value of fc at this point is always less than 256, though we may or
3278: may not be in UTF-8 mode. The code is duplicated for the caseless and
3279: caseful cases, for speed, since matching characters is likely to be quite
3280: common. First, ensure the minimum number of matches are present. If min =
3281: max, continue at the same level without recursing. Otherwise, if
3282: minimizing, keep trying the rest of the expression and advancing one
3283: matching character if failing, up to the maximum. Alternatively, if
3284: maximizing, find the maximum number of characters and work backwards. */
3285:
3286: DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3287: max, eptr));
3288:
3289: if (op >= OP_STARI) /* Caseless */
3290: {
3291: fc = md->lcc[fc];
3292: for (i = 1; i <= min; i++)
3293: {
3294: if (eptr >= md->end_subject)
3295: {
3296: SCHECK_PARTIAL();
3297: RRETURN(MATCH_NOMATCH);
3298: }
3299: if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3300: }
3301: if (min == max) continue;
3302: if (minimize)
3303: {
3304: for (fi = min;; fi++)
3305: {
3306: RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3307: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3308: if (fi >= max) RRETURN(MATCH_NOMATCH);
3309: if (eptr >= md->end_subject)
3310: {
3311: SCHECK_PARTIAL();
3312: RRETURN(MATCH_NOMATCH);
3313: }
3314: if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3315: }
3316: /* Control never gets here */
3317: }
3318: else /* Maximize */
3319: {
3320: pp = eptr;
3321: for (i = min; i < max; i++)
3322: {
3323: if (eptr >= md->end_subject)
3324: {
3325: SCHECK_PARTIAL();
3326: break;
3327: }
3328: if (fc != md->lcc[*eptr]) break;
3329: eptr++;
3330: }
3331:
3332: if (possessive) continue;
3333:
3334: while (eptr >= pp)
3335: {
3336: RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3337: eptr--;
3338: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3339: }
3340: RRETURN(MATCH_NOMATCH);
3341: }
3342: /* Control never gets here */
3343: }
3344:
3345: /* Caseful comparisons (includes all multi-byte characters) */
3346:
3347: else
3348: {
3349: for (i = 1; i <= min; i++)
3350: {
3351: if (eptr >= md->end_subject)
3352: {
3353: SCHECK_PARTIAL();
3354: RRETURN(MATCH_NOMATCH);
3355: }
3356: if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3357: }
3358:
3359: if (min == max) continue;
3360:
3361: if (minimize)
3362: {
3363: for (fi = min;; fi++)
3364: {
3365: RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3366: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3367: if (fi >= max) RRETURN(MATCH_NOMATCH);
3368: if (eptr >= md->end_subject)
3369: {
3370: SCHECK_PARTIAL();
3371: RRETURN(MATCH_NOMATCH);
3372: }
3373: if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3374: }
3375: /* Control never gets here */
3376: }
3377: else /* Maximize */
3378: {
3379: pp = eptr;
3380: for (i = min; i < max; i++)
3381: {
3382: if (eptr >= md->end_subject)
3383: {
3384: SCHECK_PARTIAL();
3385: break;
3386: }
3387: if (fc != *eptr) break;
3388: eptr++;
3389: }
3390: if (possessive) continue;
3391:
3392: while (eptr >= pp)
3393: {
3394: RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3395: eptr--;
3396: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3397: }
3398: RRETURN(MATCH_NOMATCH);
3399: }
3400: }
3401: /* Control never gets here */
3402:
3403: /* Match a negated single one-byte character. The character we are
3404: checking can be multibyte. */
3405:
3406: case OP_NOT:
3407: case OP_NOTI:
3408: if (eptr >= md->end_subject)
3409: {
3410: SCHECK_PARTIAL();
3411: RRETURN(MATCH_NOMATCH);
3412: }
3413: ecode++;
3414: GETCHARINCTEST(c, eptr);
3415: if (op == OP_NOTI) /* The caseless case */
3416: {
3417: #ifdef SUPPORT_UTF8
3418: if (c < 256)
3419: #endif
3420: c = md->lcc[c];
3421: if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
3422: }
3423: else /* Caseful */
3424: {
3425: if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
3426: }
3427: break;
3428:
3429: /* Match a negated single one-byte character repeatedly. This is almost a
3430: repeat of the code for a repeated single character, but I haven't found a
3431: nice way of commoning these up that doesn't require a test of the
3432: positive/negative option for each character match. Maybe that wouldn't add
3433: very much to the time taken, but character matching *is* what this is all
3434: about... */
3435:
3436: case OP_NOTEXACT:
3437: case OP_NOTEXACTI:
3438: min = max = GET2(ecode, 1);
3439: ecode += 3;
3440: goto REPEATNOTCHAR;
3441:
3442: case OP_NOTUPTO:
3443: case OP_NOTUPTOI:
3444: case OP_NOTMINUPTO:
3445: case OP_NOTMINUPTOI:
3446: min = 0;
3447: max = GET2(ecode, 1);
3448: minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3449: ecode += 3;
3450: goto REPEATNOTCHAR;
3451:
3452: case OP_NOTPOSSTAR:
3453: case OP_NOTPOSSTARI:
3454: possessive = TRUE;
3455: min = 0;
3456: max = INT_MAX;
3457: ecode++;
3458: goto REPEATNOTCHAR;
3459:
3460: case OP_NOTPOSPLUS:
3461: case OP_NOTPOSPLUSI:
3462: possessive = TRUE;
3463: min = 1;
3464: max = INT_MAX;
3465: ecode++;
3466: goto REPEATNOTCHAR;
3467:
3468: case OP_NOTPOSQUERY:
3469: case OP_NOTPOSQUERYI:
3470: possessive = TRUE;
3471: min = 0;
3472: max = 1;
3473: ecode++;
3474: goto REPEATNOTCHAR;
3475:
3476: case OP_NOTPOSUPTO:
3477: case OP_NOTPOSUPTOI:
3478: possessive = TRUE;
3479: min = 0;
3480: max = GET2(ecode, 1);
3481: ecode += 3;
3482: goto REPEATNOTCHAR;
3483:
3484: case OP_NOTSTAR:
3485: case OP_NOTSTARI:
3486: case OP_NOTMINSTAR:
3487: case OP_NOTMINSTARI:
3488: case OP_NOTPLUS:
3489: case OP_NOTPLUSI:
3490: case OP_NOTMINPLUS:
3491: case OP_NOTMINPLUSI:
3492: case OP_NOTQUERY:
3493: case OP_NOTQUERYI:
3494: case OP_NOTMINQUERY:
3495: case OP_NOTMINQUERYI:
3496: c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3497: minimize = (c & 1) != 0;
3498: min = rep_min[c]; /* Pick up values from tables; */
3499: max = rep_max[c]; /* zero for max => infinity */
3500: if (max == 0) max = INT_MAX;
3501:
3502: /* Common code for all repeated single-byte matches. */
3503:
3504: REPEATNOTCHAR:
3505: fc = *ecode++;
3506:
3507: /* The code is duplicated for the caseless and caseful cases, for speed,
3508: since matching characters is likely to be quite common. First, ensure the
3509: minimum number of matches are present. If min = max, continue at the same
3510: level without recursing. Otherwise, if minimizing, keep trying the rest of
3511: the expression and advancing one matching character if failing, up to the
3512: maximum. Alternatively, if maximizing, find the maximum number of
3513: characters and work backwards. */
3514:
3515: DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3516: max, eptr));
3517:
3518: if (op >= OP_NOTSTARI) /* Caseless */
3519: {
3520: fc = md->lcc[fc];
3521:
3522: #ifdef SUPPORT_UTF8
3523: /* UTF-8 mode */
3524: if (utf8)
3525: {
3526: register unsigned int d;
3527: for (i = 1; i <= min; i++)
3528: {
3529: if (eptr >= md->end_subject)
3530: {
3531: SCHECK_PARTIAL();
3532: RRETURN(MATCH_NOMATCH);
3533: }
3534: GETCHARINC(d, eptr);
3535: if (d < 256) d = md->lcc[d];
3536: if (fc == d) RRETURN(MATCH_NOMATCH);
3537: }
3538: }
3539: else
3540: #endif
3541:
3542: /* Not UTF-8 mode */
3543: {
3544: for (i = 1; i <= min; i++)
3545: {
3546: if (eptr >= md->end_subject)
3547: {
3548: SCHECK_PARTIAL();
3549: RRETURN(MATCH_NOMATCH);
3550: }
3551: if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3552: }
3553: }
3554:
3555: if (min == max) continue;
3556:
3557: if (minimize)
3558: {
3559: #ifdef SUPPORT_UTF8
3560: /* UTF-8 mode */
3561: if (utf8)
3562: {
3563: register unsigned int d;
3564: for (fi = min;; fi++)
3565: {
3566: RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3567: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3568: if (fi >= max) RRETURN(MATCH_NOMATCH);
3569: if (eptr >= md->end_subject)
3570: {
3571: SCHECK_PARTIAL();
3572: RRETURN(MATCH_NOMATCH);
3573: }
3574: GETCHARINC(d, eptr);
3575: if (d < 256) d = md->lcc[d];
3576: if (fc == d) RRETURN(MATCH_NOMATCH);
3577: }
3578: }
3579: else
3580: #endif
3581: /* Not UTF-8 mode */
3582: {
3583: for (fi = min;; fi++)
3584: {
3585: RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3586: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3587: if (fi >= max) RRETURN(MATCH_NOMATCH);
3588: if (eptr >= md->end_subject)
3589: {
3590: SCHECK_PARTIAL();
3591: RRETURN(MATCH_NOMATCH);
3592: }
3593: if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3594: }
3595: }
3596: /* Control never gets here */
3597: }
3598:
3599: /* Maximize case */
3600:
3601: else
3602: {
3603: pp = eptr;
3604:
3605: #ifdef SUPPORT_UTF8
3606: /* UTF-8 mode */
3607: if (utf8)
3608: {
3609: register unsigned int d;
3610: for (i = min; i < max; i++)
3611: {
3612: int len = 1;
3613: if (eptr >= md->end_subject)
3614: {
3615: SCHECK_PARTIAL();
3616: break;
3617: }
3618: GETCHARLEN(d, eptr, len);
3619: if (d < 256) d = md->lcc[d];
3620: if (fc == d) break;
3621: eptr += len;
3622: }
3623: if (possessive) continue;
3624: for(;;)
3625: {
3626: RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3627: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3628: if (eptr-- == pp) break; /* Stop if tried at original pos */
3629: BACKCHAR(eptr);
3630: }
3631: }
3632: else
3633: #endif
3634: /* Not UTF-8 mode */
3635: {
3636: for (i = min; i < max; i++)
3637: {
3638: if (eptr >= md->end_subject)
3639: {
3640: SCHECK_PARTIAL();
3641: break;
3642: }
3643: if (fc == md->lcc[*eptr]) break;
3644: eptr++;
3645: }
3646: if (possessive) continue;
3647: while (eptr >= pp)
3648: {
3649: RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3650: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3651: eptr--;
3652: }
3653: }
3654:
3655: RRETURN(MATCH_NOMATCH);
3656: }
3657: /* Control never gets here */
3658: }
3659:
3660: /* Caseful comparisons */
3661:
3662: else
3663: {
3664: #ifdef SUPPORT_UTF8
3665: /* UTF-8 mode */
3666: if (utf8)
3667: {
3668: register unsigned int d;
3669: for (i = 1; i <= min; i++)
3670: {
3671: if (eptr >= md->end_subject)
3672: {
3673: SCHECK_PARTIAL();
3674: RRETURN(MATCH_NOMATCH);
3675: }
3676: GETCHARINC(d, eptr);
3677: if (fc == d) RRETURN(MATCH_NOMATCH);
3678: }
3679: }
3680: else
3681: #endif
3682: /* Not UTF-8 mode */
3683: {
3684: for (i = 1; i <= min; i++)
3685: {
3686: if (eptr >= md->end_subject)
3687: {
3688: SCHECK_PARTIAL();
3689: RRETURN(MATCH_NOMATCH);
3690: }
3691: if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3692: }
3693: }
3694:
3695: if (min == max) continue;
3696:
3697: if (minimize)
3698: {
3699: #ifdef SUPPORT_UTF8
3700: /* UTF-8 mode */
3701: if (utf8)
3702: {
3703: register unsigned int d;
3704: for (fi = min;; fi++)
3705: {
3706: RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3707: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3708: if (fi >= max) RRETURN(MATCH_NOMATCH);
3709: if (eptr >= md->end_subject)
3710: {
3711: SCHECK_PARTIAL();
3712: RRETURN(MATCH_NOMATCH);
3713: }
3714: GETCHARINC(d, eptr);
3715: if (fc == d) RRETURN(MATCH_NOMATCH);
3716: }
3717: }
3718: else
3719: #endif
3720: /* Not UTF-8 mode */
3721: {
3722: for (fi = min;; fi++)
3723: {
3724: RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3725: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3726: if (fi >= max) RRETURN(MATCH_NOMATCH);
3727: if (eptr >= md->end_subject)
3728: {
3729: SCHECK_PARTIAL();
3730: RRETURN(MATCH_NOMATCH);
3731: }
3732: if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3733: }
3734: }
3735: /* Control never gets here */
3736: }
3737:
3738: /* Maximize case */
3739:
3740: else
3741: {
3742: pp = eptr;
3743:
3744: #ifdef SUPPORT_UTF8
3745: /* UTF-8 mode */
3746: if (utf8)
3747: {
3748: register unsigned int d;
3749: for (i = min; i < max; i++)
3750: {
3751: int len = 1;
3752: if (eptr >= md->end_subject)
3753: {
3754: SCHECK_PARTIAL();
3755: break;
3756: }
3757: GETCHARLEN(d, eptr, len);
3758: if (fc == d) break;
3759: eptr += len;
3760: }
3761: if (possessive) continue;
3762: for(;;)
3763: {
3764: RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3765: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3766: if (eptr-- == pp) break; /* Stop if tried at original pos */
3767: BACKCHAR(eptr);
3768: }
3769: }
3770: else
3771: #endif
3772: /* Not UTF-8 mode */
3773: {
3774: for (i = min; i < max; i++)
3775: {
3776: if (eptr >= md->end_subject)
3777: {
3778: SCHECK_PARTIAL();
3779: break;
3780: }
3781: if (fc == *eptr) break;
3782: eptr++;
3783: }
3784: if (possessive) continue;
3785: while (eptr >= pp)
3786: {
3787: RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3788: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3789: eptr--;
3790: }
3791: }
3792:
3793: RRETURN(MATCH_NOMATCH);
3794: }
3795: }
3796: /* Control never gets here */
3797:
3798: /* Match a single character type repeatedly; several different opcodes
3799: share code. This is very similar to the code for single characters, but we
3800: repeat it in the interests of efficiency. */
3801:
3802: case OP_TYPEEXACT:
3803: min = max = GET2(ecode, 1);
3804: minimize = TRUE;
3805: ecode += 3;
3806: goto REPEATTYPE;
3807:
3808: case OP_TYPEUPTO:
3809: case OP_TYPEMINUPTO:
3810: min = 0;
3811: max = GET2(ecode, 1);
3812: minimize = *ecode == OP_TYPEMINUPTO;
3813: ecode += 3;
3814: goto REPEATTYPE;
3815:
3816: case OP_TYPEPOSSTAR:
3817: possessive = TRUE;
3818: min = 0;
3819: max = INT_MAX;
3820: ecode++;
3821: goto REPEATTYPE;
3822:
3823: case OP_TYPEPOSPLUS:
3824: possessive = TRUE;
3825: min = 1;
3826: max = INT_MAX;
3827: ecode++;
3828: goto REPEATTYPE;
3829:
3830: case OP_TYPEPOSQUERY:
3831: possessive = TRUE;
3832: min = 0;
3833: max = 1;
3834: ecode++;
3835: goto REPEATTYPE;
3836:
3837: case OP_TYPEPOSUPTO:
3838: possessive = TRUE;
3839: min = 0;
3840: max = GET2(ecode, 1);
3841: ecode += 3;
3842: goto REPEATTYPE;
3843:
3844: case OP_TYPESTAR:
3845: case OP_TYPEMINSTAR:
3846: case OP_TYPEPLUS:
3847: case OP_TYPEMINPLUS:
3848: case OP_TYPEQUERY:
3849: case OP_TYPEMINQUERY:
3850: c = *ecode++ - OP_TYPESTAR;
3851: minimize = (c & 1) != 0;
3852: min = rep_min[c]; /* Pick up values from tables; */
3853: max = rep_max[c]; /* zero for max => infinity */
3854: if (max == 0) max = INT_MAX;
3855:
3856: /* Common code for all repeated single character type matches. Note that
3857: in UTF-8 mode, '.' matches a character of any length, but for the other
3858: character types, the valid characters are all one-byte long. */
3859:
3860: REPEATTYPE:
3861: ctype = *ecode++; /* Code for the character type */
3862:
3863: #ifdef SUPPORT_UCP
3864: if (ctype == OP_PROP || ctype == OP_NOTPROP)
3865: {
3866: prop_fail_result = ctype == OP_NOTPROP;
3867: prop_type = *ecode++;
3868: prop_value = *ecode++;
3869: }
3870: else prop_type = -1;
3871: #endif
3872:
3873: /* First, ensure the minimum number of matches are present. Use inline
3874: code for maximizing the speed, and do the type test once at the start
3875: (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3876: is tidier. Also separate the UCP code, which can be the same for both UTF-8
3877: and single-bytes. */
3878:
3879: if (min > 0)
3880: {
3881: #ifdef SUPPORT_UCP
3882: if (prop_type >= 0)
3883: {
3884: switch(prop_type)
3885: {
3886: case PT_ANY:
3887: if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3888: for (i = 1; i <= min; i++)
3889: {
3890: if (eptr >= md->end_subject)
3891: {
3892: SCHECK_PARTIAL();
3893: RRETURN(MATCH_NOMATCH);
3894: }
3895: GETCHARINCTEST(c, eptr);
3896: }
3897: break;
3898:
3899: case PT_LAMP:
3900: for (i = 1; i <= min; i++)
3901: {
3902: int chartype;
3903: if (eptr >= md->end_subject)
3904: {
3905: SCHECK_PARTIAL();
3906: RRETURN(MATCH_NOMATCH);
3907: }
3908: GETCHARINCTEST(c, eptr);
3909: chartype = UCD_CHARTYPE(c);
3910: if ((chartype == ucp_Lu ||
3911: chartype == ucp_Ll ||
3912: chartype == ucp_Lt) == prop_fail_result)
3913: RRETURN(MATCH_NOMATCH);
3914: }
3915: break;
3916:
3917: case PT_GC:
3918: for (i = 1; i <= min; i++)
3919: {
3920: if (eptr >= md->end_subject)
3921: {
3922: SCHECK_PARTIAL();
3923: RRETURN(MATCH_NOMATCH);
3924: }
3925: GETCHARINCTEST(c, eptr);
3926: if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3927: RRETURN(MATCH_NOMATCH);
3928: }
3929: break;
3930:
3931: case PT_PC:
3932: for (i = 1; i <= min; i++)
3933: {
3934: if (eptr >= md->end_subject)
3935: {
3936: SCHECK_PARTIAL();
3937: RRETURN(MATCH_NOMATCH);
3938: }
3939: GETCHARINCTEST(c, eptr);
3940: if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
3941: RRETURN(MATCH_NOMATCH);
3942: }
3943: break;
3944:
3945: case PT_SC:
3946: for (i = 1; i <= min; i++)
3947: {
3948: if (eptr >= md->end_subject)
3949: {
3950: SCHECK_PARTIAL();
3951: RRETURN(MATCH_NOMATCH);
3952: }
3953: GETCHARINCTEST(c, eptr);
3954: if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
3955: RRETURN(MATCH_NOMATCH);
3956: }
3957: break;
3958:
3959: case PT_ALNUM:
3960: for (i = 1; i <= min; i++)
3961: {
3962: int category;
3963: if (eptr >= md->end_subject)
3964: {
3965: SCHECK_PARTIAL();
3966: RRETURN(MATCH_NOMATCH);
3967: }
3968: GETCHARINCTEST(c, eptr);
3969: category = UCD_CATEGORY(c);
3970: if ((category == ucp_L || category == ucp_N) == prop_fail_result)
3971: RRETURN(MATCH_NOMATCH);
3972: }
3973: break;
3974:
3975: case PT_SPACE: /* Perl space */
3976: for (i = 1; i <= min; i++)
3977: {
3978: if (eptr >= md->end_subject)
3979: {
3980: SCHECK_PARTIAL();
3981: RRETURN(MATCH_NOMATCH);
3982: }
3983: GETCHARINCTEST(c, eptr);
3984: if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3985: c == CHAR_FF || c == CHAR_CR)
3986: == prop_fail_result)
3987: RRETURN(MATCH_NOMATCH);
3988: }
3989: break;
3990:
3991: case PT_PXSPACE: /* POSIX space */
3992: for (i = 1; i <= min; i++)
3993: {
3994: if (eptr >= md->end_subject)
3995: {
3996: SCHECK_PARTIAL();
3997: RRETURN(MATCH_NOMATCH);
3998: }
3999: GETCHARINCTEST(c, eptr);
4000: if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4001: c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4002: == prop_fail_result)
4003: RRETURN(MATCH_NOMATCH);
4004: }
4005: break;
4006:
4007: case PT_WORD:
4008: for (i = 1; i <= min; i++)
4009: {
4010: int category;
4011: if (eptr >= md->end_subject)
4012: {
4013: SCHECK_PARTIAL();
4014: RRETURN(MATCH_NOMATCH);
4015: }
4016: GETCHARINCTEST(c, eptr);
4017: category = UCD_CATEGORY(c);
4018: if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4019: == prop_fail_result)
4020: RRETURN(MATCH_NOMATCH);
4021: }
4022: break;
4023:
4024: /* This should not occur */
4025:
4026: default:
4027: RRETURN(PCRE_ERROR_INTERNAL);
4028: }
4029: }
4030:
4031: /* Match extended Unicode sequences. We will get here only if the
4032: support is in the binary; otherwise a compile-time error occurs. */
4033:
4034: else if (ctype == OP_EXTUNI)
4035: {
4036: for (i = 1; i <= min; i++)
4037: {
4038: if (eptr >= md->end_subject)
4039: {
4040: SCHECK_PARTIAL();
4041: RRETURN(MATCH_NOMATCH);
4042: }
4043: GETCHARINCTEST(c, eptr);
4044: if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4045: while (eptr < md->end_subject)
4046: {
4047: int len = 1;
4048: if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4049: if (UCD_CATEGORY(c) != ucp_M) break;
4050: eptr += len;
4051: }
4052: }
4053: }
4054:
4055: else
4056: #endif /* SUPPORT_UCP */
4057:
4058: /* Handle all other cases when the coding is UTF-8 */
4059:
4060: #ifdef SUPPORT_UTF8
4061: if (utf8) switch(ctype)
4062: {
4063: case OP_ANY:
4064: for (i = 1; i <= min; i++)
4065: {
4066: if (eptr >= md->end_subject)
4067: {
4068: SCHECK_PARTIAL();
4069: RRETURN(MATCH_NOMATCH);
4070: }
4071: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4072: eptr++;
4073: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4074: }
4075: break;
4076:
4077: case OP_ALLANY:
4078: for (i = 1; i <= min; i++)
4079: {
4080: if (eptr >= md->end_subject)
4081: {
4082: SCHECK_PARTIAL();
4083: RRETURN(MATCH_NOMATCH);
4084: }
4085: eptr++;
4086: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4087: }
4088: break;
4089:
4090: case OP_ANYBYTE:
4091: if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4092: eptr += min;
4093: break;
4094:
4095: case OP_ANYNL:
4096: for (i = 1; i <= min; i++)
4097: {
4098: if (eptr >= md->end_subject)
4099: {
4100: SCHECK_PARTIAL();
4101: RRETURN(MATCH_NOMATCH);
4102: }
4103: GETCHARINC(c, eptr);
4104: switch(c)
4105: {
4106: default: RRETURN(MATCH_NOMATCH);
4107:
4108: case 0x000d:
4109: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4110: break;
4111:
4112: case 0x000a:
4113: break;
4114:
4115: case 0x000b:
4116: case 0x000c:
4117: case 0x0085:
4118: case 0x2028:
4119: case 0x2029:
4120: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4121: break;
4122: }
4123: }
4124: break;
4125:
4126: case OP_NOT_HSPACE:
4127: for (i = 1; i <= min; i++)
4128: {
4129: if (eptr >= md->end_subject)
4130: {
4131: SCHECK_PARTIAL();
4132: RRETURN(MATCH_NOMATCH);
4133: }
4134: GETCHARINC(c, eptr);
4135: switch(c)
4136: {
4137: default: break;
4138: case 0x09: /* HT */
4139: case 0x20: /* SPACE */
4140: case 0xa0: /* NBSP */
4141: case 0x1680: /* OGHAM SPACE MARK */
4142: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4143: case 0x2000: /* EN QUAD */
4144: case 0x2001: /* EM QUAD */
4145: case 0x2002: /* EN SPACE */
4146: case 0x2003: /* EM SPACE */
4147: case 0x2004: /* THREE-PER-EM SPACE */
4148: case 0x2005: /* FOUR-PER-EM SPACE */
4149: case 0x2006: /* SIX-PER-EM SPACE */
4150: case 0x2007: /* FIGURE SPACE */
4151: case 0x2008: /* PUNCTUATION SPACE */
4152: case 0x2009: /* THIN SPACE */
4153: case 0x200A: /* HAIR SPACE */
4154: case 0x202f: /* NARROW NO-BREAK SPACE */
4155: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4156: case 0x3000: /* IDEOGRAPHIC SPACE */
4157: RRETURN(MATCH_NOMATCH);
4158: }
4159: }
4160: break;
4161:
4162: case OP_HSPACE:
4163: for (i = 1; i <= min; i++)
4164: {
4165: if (eptr >= md->end_subject)
4166: {
4167: SCHECK_PARTIAL();
4168: RRETURN(MATCH_NOMATCH);
4169: }
4170: GETCHARINC(c, eptr);
4171: switch(c)
4172: {
4173: default: RRETURN(MATCH_NOMATCH);
4174: case 0x09: /* HT */
4175: case 0x20: /* SPACE */
4176: case 0xa0: /* NBSP */
4177: case 0x1680: /* OGHAM SPACE MARK */
4178: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4179: case 0x2000: /* EN QUAD */
4180: case 0x2001: /* EM QUAD */
4181: case 0x2002: /* EN SPACE */
4182: case 0x2003: /* EM SPACE */
4183: case 0x2004: /* THREE-PER-EM SPACE */
4184: case 0x2005: /* FOUR-PER-EM SPACE */
4185: case 0x2006: /* SIX-PER-EM SPACE */
4186: case 0x2007: /* FIGURE SPACE */
4187: case 0x2008: /* PUNCTUATION SPACE */
4188: case 0x2009: /* THIN SPACE */
4189: case 0x200A: /* HAIR SPACE */
4190: case 0x202f: /* NARROW NO-BREAK SPACE */
4191: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4192: case 0x3000: /* IDEOGRAPHIC SPACE */
4193: break;
4194: }
4195: }
4196: break;
4197:
4198: case OP_NOT_VSPACE:
4199: for (i = 1; i <= min; i++)
4200: {
4201: if (eptr >= md->end_subject)
4202: {
4203: SCHECK_PARTIAL();
4204: RRETURN(MATCH_NOMATCH);
4205: }
4206: GETCHARINC(c, eptr);
4207: switch(c)
4208: {
4209: default: break;
4210: case 0x0a: /* LF */
4211: case 0x0b: /* VT */
4212: case 0x0c: /* FF */
4213: case 0x0d: /* CR */
4214: case 0x85: /* NEL */
4215: case 0x2028: /* LINE SEPARATOR */
4216: case 0x2029: /* PARAGRAPH SEPARATOR */
4217: RRETURN(MATCH_NOMATCH);
4218: }
4219: }
4220: break;
4221:
4222: case OP_VSPACE:
4223: for (i = 1; i <= min; i++)
4224: {
4225: if (eptr >= md->end_subject)
4226: {
4227: SCHECK_PARTIAL();
4228: RRETURN(MATCH_NOMATCH);
4229: }
4230: GETCHARINC(c, eptr);
4231: switch(c)
4232: {
4233: default: RRETURN(MATCH_NOMATCH);
4234: case 0x0a: /* LF */
4235: case 0x0b: /* VT */
4236: case 0x0c: /* FF */
4237: case 0x0d: /* CR */
4238: case 0x85: /* NEL */
4239: case 0x2028: /* LINE SEPARATOR */
4240: case 0x2029: /* PARAGRAPH SEPARATOR */
4241: break;
4242: }
4243: }
4244: break;
4245:
4246: case OP_NOT_DIGIT:
4247: for (i = 1; i <= min; i++)
4248: {
4249: if (eptr >= md->end_subject)
4250: {
4251: SCHECK_PARTIAL();
4252: RRETURN(MATCH_NOMATCH);
4253: }
4254: GETCHARINC(c, eptr);
4255: if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4256: RRETURN(MATCH_NOMATCH);
4257: }
4258: break;
4259:
4260: case OP_DIGIT:
4261: for (i = 1; i <= min; i++)
4262: {
4263: if (eptr >= md->end_subject)
4264: {
4265: SCHECK_PARTIAL();
4266: RRETURN(MATCH_NOMATCH);
4267: }
4268: if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4269: RRETURN(MATCH_NOMATCH);
4270: /* No need to skip more bytes - we know it's a 1-byte character */
4271: }
4272: break;
4273:
4274: case OP_NOT_WHITESPACE:
4275: for (i = 1; i <= min; i++)
4276: {
4277: if (eptr >= md->end_subject)
4278: {
4279: SCHECK_PARTIAL();
4280: RRETURN(MATCH_NOMATCH);
4281: }
4282: if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4283: RRETURN(MATCH_NOMATCH);
4284: while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4285: }
4286: break;
4287:
4288: case OP_WHITESPACE:
4289: for (i = 1; i <= min; i++)
4290: {
4291: if (eptr >= md->end_subject)
4292: {
4293: SCHECK_PARTIAL();
4294: RRETURN(MATCH_NOMATCH);
4295: }
4296: if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4297: RRETURN(MATCH_NOMATCH);
4298: /* No need to skip more bytes - we know it's a 1-byte character */
4299: }
4300: break;
4301:
4302: case OP_NOT_WORDCHAR:
4303: for (i = 1; i <= min; i++)
4304: {
4305: if (eptr >= md->end_subject)
4306: {
4307: SCHECK_PARTIAL();
4308: RRETURN(MATCH_NOMATCH);
4309: }
4310: if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4311: RRETURN(MATCH_NOMATCH);
4312: while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4313: }
4314: break;
4315:
4316: case OP_WORDCHAR:
4317: for (i = 1; i <= min; i++)
4318: {
4319: if (eptr >= md->end_subject)
4320: {
4321: SCHECK_PARTIAL();
4322: RRETURN(MATCH_NOMATCH);
4323: }
4324: if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4325: RRETURN(MATCH_NOMATCH);
4326: /* No need to skip more bytes - we know it's a 1-byte character */
4327: }
4328: break;
4329:
4330: default:
4331: RRETURN(PCRE_ERROR_INTERNAL);
4332: } /* End switch(ctype) */
4333:
4334: else
4335: #endif /* SUPPORT_UTF8 */
4336:
4337: /* Code for the non-UTF-8 case for minimum matching of operators other
4338: than OP_PROP and OP_NOTPROP. */
4339:
4340: switch(ctype)
4341: {
4342: case OP_ANY:
4343: for (i = 1; i <= min; i++)
4344: {
4345: if (eptr >= md->end_subject)
4346: {
4347: SCHECK_PARTIAL();
4348: RRETURN(MATCH_NOMATCH);
4349: }
4350: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4351: eptr++;
4352: }
4353: break;
4354:
4355: case OP_ALLANY:
4356: if (eptr > md->end_subject - min)
4357: {
4358: SCHECK_PARTIAL();
4359: RRETURN(MATCH_NOMATCH);
4360: }
4361: eptr += min;
4362: break;
4363:
4364: case OP_ANYBYTE:
4365: if (eptr > md->end_subject - min)
4366: {
4367: SCHECK_PARTIAL();
4368: RRETURN(MATCH_NOMATCH);
4369: }
4370: eptr += min;
4371: break;
4372:
4373: case OP_ANYNL:
4374: for (i = 1; i <= min; i++)
4375: {
4376: if (eptr >= md->end_subject)
4377: {
4378: SCHECK_PARTIAL();
4379: RRETURN(MATCH_NOMATCH);
4380: }
4381: switch(*eptr++)
4382: {
4383: default: RRETURN(MATCH_NOMATCH);
4384:
4385: case 0x000d:
4386: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4387: break;
4388:
4389: case 0x000a:
4390: break;
4391:
4392: case 0x000b:
4393: case 0x000c:
4394: case 0x0085:
4395: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4396: break;
4397: }
4398: }
4399: break;
4400:
4401: case OP_NOT_HSPACE:
4402: for (i = 1; i <= min; i++)
4403: {
4404: if (eptr >= md->end_subject)
4405: {
4406: SCHECK_PARTIAL();
4407: RRETURN(MATCH_NOMATCH);
4408: }
4409: switch(*eptr++)
4410: {
4411: default: break;
4412: case 0x09: /* HT */
4413: case 0x20: /* SPACE */
4414: case 0xa0: /* NBSP */
4415: RRETURN(MATCH_NOMATCH);
4416: }
4417: }
4418: break;
4419:
4420: case OP_HSPACE:
4421: for (i = 1; i <= min; i++)
4422: {
4423: if (eptr >= md->end_subject)
4424: {
4425: SCHECK_PARTIAL();
4426: RRETURN(MATCH_NOMATCH);
4427: }
4428: switch(*eptr++)
4429: {
4430: default: RRETURN(MATCH_NOMATCH);
4431: case 0x09: /* HT */
4432: case 0x20: /* SPACE */
4433: case 0xa0: /* NBSP */
4434: break;
4435: }
4436: }
4437: break;
4438:
4439: case OP_NOT_VSPACE:
4440: for (i = 1; i <= min; i++)
4441: {
4442: if (eptr >= md->end_subject)
4443: {
4444: SCHECK_PARTIAL();
4445: RRETURN(MATCH_NOMATCH);
4446: }
4447: switch(*eptr++)
4448: {
4449: default: break;
4450: case 0x0a: /* LF */
4451: case 0x0b: /* VT */
4452: case 0x0c: /* FF */
4453: case 0x0d: /* CR */
4454: case 0x85: /* NEL */
4455: RRETURN(MATCH_NOMATCH);
4456: }
4457: }
4458: break;
4459:
4460: case OP_VSPACE:
4461: for (i = 1; i <= min; i++)
4462: {
4463: if (eptr >= md->end_subject)
4464: {
4465: SCHECK_PARTIAL();
4466: RRETURN(MATCH_NOMATCH);
4467: }
4468: switch(*eptr++)
4469: {
4470: default: RRETURN(MATCH_NOMATCH);
4471: case 0x0a: /* LF */
4472: case 0x0b: /* VT */
4473: case 0x0c: /* FF */
4474: case 0x0d: /* CR */
4475: case 0x85: /* NEL */
4476: break;
4477: }
4478: }
4479: break;
4480:
4481: case OP_NOT_DIGIT:
4482: for (i = 1; i <= min; i++)
4483: {
4484: if (eptr >= md->end_subject)
4485: {
4486: SCHECK_PARTIAL();
4487: RRETURN(MATCH_NOMATCH);
4488: }
4489: if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
4490: }
4491: break;
4492:
4493: case OP_DIGIT:
4494: for (i = 1; i <= min; i++)
4495: {
4496: if (eptr >= md->end_subject)
4497: {
4498: SCHECK_PARTIAL();
4499: RRETURN(MATCH_NOMATCH);
4500: }
4501: if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
4502: }
4503: break;
4504:
4505: case OP_NOT_WHITESPACE:
4506: for (i = 1; i <= min; i++)
4507: {
4508: if (eptr >= md->end_subject)
4509: {
4510: SCHECK_PARTIAL();
4511: RRETURN(MATCH_NOMATCH);
4512: }
4513: if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
4514: }
4515: break;
4516:
4517: case OP_WHITESPACE:
4518: for (i = 1; i <= min; i++)
4519: {
4520: if (eptr >= md->end_subject)
4521: {
4522: SCHECK_PARTIAL();
4523: RRETURN(MATCH_NOMATCH);
4524: }
4525: if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
4526: }
4527: break;
4528:
4529: case OP_NOT_WORDCHAR:
4530: for (i = 1; i <= min; i++)
4531: {
4532: if (eptr >= md->end_subject)
4533: {
4534: SCHECK_PARTIAL();
4535: RRETURN(MATCH_NOMATCH);
4536: }
4537: if ((md->ctypes[*eptr++] & ctype_word) != 0)
4538: RRETURN(MATCH_NOMATCH);
4539: }
4540: break;
4541:
4542: case OP_WORDCHAR:
4543: for (i = 1; i <= min; i++)
4544: {
4545: if (eptr >= md->end_subject)
4546: {
4547: SCHECK_PARTIAL();
4548: RRETURN(MATCH_NOMATCH);
4549: }
4550: if ((md->ctypes[*eptr++] & ctype_word) == 0)
4551: RRETURN(MATCH_NOMATCH);
4552: }
4553: break;
4554:
4555: default:
4556: RRETURN(PCRE_ERROR_INTERNAL);
4557: }
4558: }
4559:
4560: /* If min = max, continue at the same level without recursing */
4561:
4562: if (min == max) continue;
4563:
4564: /* If minimizing, we have to test the rest of the pattern before each
4565: subsequent match. Again, separate the UTF-8 case for speed, and also
4566: separate the UCP cases. */
4567:
4568: if (minimize)
4569: {
4570: #ifdef SUPPORT_UCP
4571: if (prop_type >= 0)
4572: {
4573: switch(prop_type)
4574: {
4575: case PT_ANY:
4576: for (fi = min;; fi++)
4577: {
4578: RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4579: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4580: if (fi >= max) RRETURN(MATCH_NOMATCH);
4581: if (eptr >= md->end_subject)
4582: {
4583: SCHECK_PARTIAL();
4584: RRETURN(MATCH_NOMATCH);
4585: }
4586: GETCHARINCTEST(c, eptr);
4587: if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4588: }
4589: /* Control never gets here */
4590:
4591: case PT_LAMP:
4592: for (fi = min;; fi++)
4593: {
4594: int chartype;
4595: RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4596: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4597: if (fi >= max) RRETURN(MATCH_NOMATCH);
4598: if (eptr >= md->end_subject)
4599: {
4600: SCHECK_PARTIAL();
4601: RRETURN(MATCH_NOMATCH);
4602: }
4603: GETCHARINCTEST(c, eptr);
4604: chartype = UCD_CHARTYPE(c);
4605: if ((chartype == ucp_Lu ||
4606: chartype == ucp_Ll ||
4607: chartype == ucp_Lt) == prop_fail_result)
4608: RRETURN(MATCH_NOMATCH);
4609: }
4610: /* Control never gets here */
4611:
4612: case PT_GC:
4613: for (fi = min;; fi++)
4614: {
4615: RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4616: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4617: if (fi >= max) RRETURN(MATCH_NOMATCH);
4618: if (eptr >= md->end_subject)
4619: {
4620: SCHECK_PARTIAL();
4621: RRETURN(MATCH_NOMATCH);
4622: }
4623: GETCHARINCTEST(c, eptr);
4624: if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4625: RRETURN(MATCH_NOMATCH);
4626: }
4627: /* Control never gets here */
4628:
4629: case PT_PC:
4630: for (fi = min;; fi++)
4631: {
4632: RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4633: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4634: if (fi >= max) RRETURN(MATCH_NOMATCH);
4635: if (eptr >= md->end_subject)
4636: {
4637: SCHECK_PARTIAL();
4638: RRETURN(MATCH_NOMATCH);
4639: }
4640: GETCHARINCTEST(c, eptr);
4641: if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4642: RRETURN(MATCH_NOMATCH);
4643: }
4644: /* Control never gets here */
4645:
4646: case PT_SC:
4647: for (fi = min;; fi++)
4648: {
4649: RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4650: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4651: if (fi >= max) RRETURN(MATCH_NOMATCH);
4652: if (eptr >= md->end_subject)
4653: {
4654: SCHECK_PARTIAL();
4655: RRETURN(MATCH_NOMATCH);
4656: }
4657: GETCHARINCTEST(c, eptr);
4658: if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4659: RRETURN(MATCH_NOMATCH);
4660: }
4661: /* Control never gets here */
4662:
4663: case PT_ALNUM:
4664: for (fi = min;; fi++)
4665: {
4666: int category;
4667: RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4668: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4669: if (fi >= max) RRETURN(MATCH_NOMATCH);
4670: if (eptr >= md->end_subject)
4671: {
4672: SCHECK_PARTIAL();
4673: RRETURN(MATCH_NOMATCH);
4674: }
4675: GETCHARINCTEST(c, eptr);
4676: category = UCD_CATEGORY(c);
4677: if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4678: RRETURN(MATCH_NOMATCH);
4679: }
4680: /* Control never gets here */
4681:
4682: case PT_SPACE: /* Perl space */
4683: for (fi = min;; fi++)
4684: {
4685: RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4686: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4687: if (fi >= max) RRETURN(MATCH_NOMATCH);
4688: if (eptr >= md->end_subject)
4689: {
4690: SCHECK_PARTIAL();
4691: RRETURN(MATCH_NOMATCH);
4692: }
4693: GETCHARINCTEST(c, eptr);
4694: if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4695: c == CHAR_FF || c == CHAR_CR)
4696: == prop_fail_result)
4697: RRETURN(MATCH_NOMATCH);
4698: }
4699: /* Control never gets here */
4700:
4701: case PT_PXSPACE: /* POSIX space */
4702: for (fi = min;; fi++)
4703: {
4704: RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4705: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4706: if (fi >= max) RRETURN(MATCH_NOMATCH);
4707: if (eptr >= md->end_subject)
4708: {
4709: SCHECK_PARTIAL();
4710: RRETURN(MATCH_NOMATCH);
4711: }
4712: GETCHARINCTEST(c, eptr);
4713: if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4714: c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4715: == prop_fail_result)
4716: RRETURN(MATCH_NOMATCH);
4717: }
4718: /* Control never gets here */
4719:
4720: case PT_WORD:
4721: for (fi = min;; fi++)
4722: {
4723: int category;
4724: RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4725: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4726: if (fi >= max) RRETURN(MATCH_NOMATCH);
4727: if (eptr >= md->end_subject)
4728: {
4729: SCHECK_PARTIAL();
4730: RRETURN(MATCH_NOMATCH);
4731: }
4732: GETCHARINCTEST(c, eptr);
4733: category = UCD_CATEGORY(c);
4734: if ((category == ucp_L ||
4735: category == ucp_N ||
4736: c == CHAR_UNDERSCORE)
4737: == prop_fail_result)
4738: RRETURN(MATCH_NOMATCH);
4739: }
4740: /* Control never gets here */
4741:
4742: /* This should never occur */
4743:
4744: default:
4745: RRETURN(PCRE_ERROR_INTERNAL);
4746: }
4747: }
4748:
4749: /* Match extended Unicode sequences. We will get here only if the
4750: support is in the binary; otherwise a compile-time error occurs. */
4751:
4752: else if (ctype == OP_EXTUNI)
4753: {
4754: for (fi = min;; fi++)
4755: {
4756: RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4757: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4758: if (fi >= max) RRETURN(MATCH_NOMATCH);
4759: if (eptr >= md->end_subject)
4760: {
4761: SCHECK_PARTIAL();
4762: RRETURN(MATCH_NOMATCH);
4763: }
4764: GETCHARINCTEST(c, eptr);
4765: if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4766: while (eptr < md->end_subject)
4767: {
4768: int len = 1;
4769: if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4770: if (UCD_CATEGORY(c) != ucp_M) break;
4771: eptr += len;
4772: }
4773: }
4774: }
4775: else
4776: #endif /* SUPPORT_UCP */
4777:
4778: #ifdef SUPPORT_UTF8
4779: /* UTF-8 mode */
4780: if (utf8)
4781: {
4782: for (fi = min;; fi++)
4783: {
4784: RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4785: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4786: if (fi >= max) RRETURN(MATCH_NOMATCH);
4787: if (eptr >= md->end_subject)
4788: {
4789: SCHECK_PARTIAL();
4790: RRETURN(MATCH_NOMATCH);
4791: }
4792: if (ctype == OP_ANY && IS_NEWLINE(eptr))
4793: RRETURN(MATCH_NOMATCH);
4794: GETCHARINC(c, eptr);
4795: switch(ctype)
4796: {
4797: case OP_ANY: /* This is the non-NL case */
4798: case OP_ALLANY:
4799: case OP_ANYBYTE:
4800: break;
4801:
4802: case OP_ANYNL:
4803: switch(c)
4804: {
4805: default: RRETURN(MATCH_NOMATCH);
4806: case 0x000d:
4807: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4808: break;
4809: case 0x000a:
4810: break;
4811:
4812: case 0x000b:
4813: case 0x000c:
4814: case 0x0085:
4815: case 0x2028:
4816: case 0x2029:
4817: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4818: break;
4819: }
4820: break;
4821:
4822: case OP_NOT_HSPACE:
4823: switch(c)
4824: {
4825: default: break;
4826: case 0x09: /* HT */
4827: case 0x20: /* SPACE */
4828: case 0xa0: /* NBSP */
4829: case 0x1680: /* OGHAM SPACE MARK */
4830: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4831: case 0x2000: /* EN QUAD */
4832: case 0x2001: /* EM QUAD */
4833: case 0x2002: /* EN SPACE */
4834: case 0x2003: /* EM SPACE */
4835: case 0x2004: /* THREE-PER-EM SPACE */
4836: case 0x2005: /* FOUR-PER-EM SPACE */
4837: case 0x2006: /* SIX-PER-EM SPACE */
4838: case 0x2007: /* FIGURE SPACE */
4839: case 0x2008: /* PUNCTUATION SPACE */
4840: case 0x2009: /* THIN SPACE */
4841: case 0x200A: /* HAIR SPACE */
4842: case 0x202f: /* NARROW NO-BREAK SPACE */
4843: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4844: case 0x3000: /* IDEOGRAPHIC SPACE */
4845: RRETURN(MATCH_NOMATCH);
4846: }
4847: break;
4848:
4849: case OP_HSPACE:
4850: switch(c)
4851: {
4852: default: RRETURN(MATCH_NOMATCH);
4853: case 0x09: /* HT */
4854: case 0x20: /* SPACE */
4855: case 0xa0: /* NBSP */
4856: case 0x1680: /* OGHAM SPACE MARK */
4857: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4858: case 0x2000: /* EN QUAD */
4859: case 0x2001: /* EM QUAD */
4860: case 0x2002: /* EN SPACE */
4861: case 0x2003: /* EM SPACE */
4862: case 0x2004: /* THREE-PER-EM SPACE */
4863: case 0x2005: /* FOUR-PER-EM SPACE */
4864: case 0x2006: /* SIX-PER-EM SPACE */
4865: case 0x2007: /* FIGURE SPACE */
4866: case 0x2008: /* PUNCTUATION SPACE */
4867: case 0x2009: /* THIN SPACE */
4868: case 0x200A: /* HAIR SPACE */
4869: case 0x202f: /* NARROW NO-BREAK SPACE */
4870: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4871: case 0x3000: /* IDEOGRAPHIC SPACE */
4872: break;
4873: }
4874: break;
4875:
4876: case OP_NOT_VSPACE:
4877: switch(c)
4878: {
4879: default: break;
4880: case 0x0a: /* LF */
4881: case 0x0b: /* VT */
4882: case 0x0c: /* FF */
4883: case 0x0d: /* CR */
4884: case 0x85: /* NEL */
4885: case 0x2028: /* LINE SEPARATOR */
4886: case 0x2029: /* PARAGRAPH SEPARATOR */
4887: RRETURN(MATCH_NOMATCH);
4888: }
4889: break;
4890:
4891: case OP_VSPACE:
4892: switch(c)
4893: {
4894: default: RRETURN(MATCH_NOMATCH);
4895: case 0x0a: /* LF */
4896: case 0x0b: /* VT */
4897: case 0x0c: /* FF */
4898: case 0x0d: /* CR */
4899: case 0x85: /* NEL */
4900: case 0x2028: /* LINE SEPARATOR */
4901: case 0x2029: /* PARAGRAPH SEPARATOR */
4902: break;
4903: }
4904: break;
4905:
4906: case OP_NOT_DIGIT:
4907: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4908: RRETURN(MATCH_NOMATCH);
4909: break;
4910:
4911: case OP_DIGIT:
4912: if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4913: RRETURN(MATCH_NOMATCH);
4914: break;
4915:
4916: case OP_NOT_WHITESPACE:
4917: if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4918: RRETURN(MATCH_NOMATCH);
4919: break;
4920:
4921: case OP_WHITESPACE:
4922: if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4923: RRETURN(MATCH_NOMATCH);
4924: break;
4925:
4926: case OP_NOT_WORDCHAR:
4927: if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4928: RRETURN(MATCH_NOMATCH);
4929: break;
4930:
4931: case OP_WORDCHAR:
4932: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4933: RRETURN(MATCH_NOMATCH);
4934: break;
4935:
4936: default:
4937: RRETURN(PCRE_ERROR_INTERNAL);
4938: }
4939: }
4940: }
4941: else
4942: #endif
4943: /* Not UTF-8 mode */
4944: {
4945: for (fi = min;; fi++)
4946: {
4947: RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4948: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4949: if (fi >= max) RRETURN(MATCH_NOMATCH);
4950: if (eptr >= md->end_subject)
4951: {
4952: SCHECK_PARTIAL();
4953: RRETURN(MATCH_NOMATCH);
4954: }
4955: if (ctype == OP_ANY && IS_NEWLINE(eptr))
4956: RRETURN(MATCH_NOMATCH);
4957: c = *eptr++;
4958: switch(ctype)
4959: {
4960: case OP_ANY: /* This is the non-NL case */
4961: case OP_ALLANY:
4962: case OP_ANYBYTE:
4963: break;
4964:
4965: case OP_ANYNL:
4966: switch(c)
4967: {
4968: default: RRETURN(MATCH_NOMATCH);
4969: case 0x000d:
4970: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4971: break;
4972:
4973: case 0x000a:
4974: break;
4975:
4976: case 0x000b:
4977: case 0x000c:
4978: case 0x0085:
4979: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4980: break;
4981: }
4982: break;
4983:
4984: case OP_NOT_HSPACE:
4985: switch(c)
4986: {
4987: default: break;
4988: case 0x09: /* HT */
4989: case 0x20: /* SPACE */
4990: case 0xa0: /* NBSP */
4991: RRETURN(MATCH_NOMATCH);
4992: }
4993: break;
4994:
4995: case OP_HSPACE:
4996: switch(c)
4997: {
4998: default: RRETURN(MATCH_NOMATCH);
4999: case 0x09: /* HT */
5000: case 0x20: /* SPACE */
5001: case 0xa0: /* NBSP */
5002: break;
5003: }
5004: break;
5005:
5006: case OP_NOT_VSPACE:
5007: switch(c)
5008: {
5009: default: break;
5010: case 0x0a: /* LF */
5011: case 0x0b: /* VT */
5012: case 0x0c: /* FF */
5013: case 0x0d: /* CR */
5014: case 0x85: /* NEL */
5015: RRETURN(MATCH_NOMATCH);
5016: }
5017: break;
5018:
5019: case OP_VSPACE:
5020: switch(c)
5021: {
5022: default: RRETURN(MATCH_NOMATCH);
5023: case 0x0a: /* LF */
5024: case 0x0b: /* VT */
5025: case 0x0c: /* FF */
5026: case 0x0d: /* CR */
5027: case 0x85: /* NEL */
5028: break;
5029: }
5030: break;
5031:
5032: case OP_NOT_DIGIT:
5033: if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5034: break;
5035:
5036: case OP_DIGIT:
5037: if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5038: break;
5039:
5040: case OP_NOT_WHITESPACE:
5041: if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5042: break;
5043:
5044: case OP_WHITESPACE:
5045: if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5046: break;
5047:
5048: case OP_NOT_WORDCHAR:
5049: if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5050: break;
5051:
5052: case OP_WORDCHAR:
5053: if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5054: break;
5055:
5056: default:
5057: RRETURN(PCRE_ERROR_INTERNAL);
5058: }
5059: }
5060: }
5061: /* Control never gets here */
5062: }
5063:
5064: /* If maximizing, it is worth using inline code for speed, doing the type
5065: test once at the start (i.e. keep it out of the loop). Again, keep the
5066: UTF-8 and UCP stuff separate. */
5067:
5068: else
5069: {
5070: pp = eptr; /* Remember where we started */
5071:
5072: #ifdef SUPPORT_UCP
5073: if (prop_type >= 0)
5074: {
5075: switch(prop_type)
5076: {
5077: case PT_ANY:
5078: for (i = min; i < max; i++)
5079: {
5080: int len = 1;
5081: if (eptr >= md->end_subject)
5082: {
5083: SCHECK_PARTIAL();
5084: break;
5085: }
5086: GETCHARLENTEST(c, eptr, len);
5087: if (prop_fail_result) break;
5088: eptr+= len;
5089: }
5090: break;
5091:
5092: case PT_LAMP:
5093: for (i = min; i < max; i++)
5094: {
5095: int chartype;
5096: int len = 1;
5097: if (eptr >= md->end_subject)
5098: {
5099: SCHECK_PARTIAL();
5100: break;
5101: }
5102: GETCHARLENTEST(c, eptr, len);
5103: chartype = UCD_CHARTYPE(c);
5104: if ((chartype == ucp_Lu ||
5105: chartype == ucp_Ll ||
5106: chartype == ucp_Lt) == prop_fail_result)
5107: break;
5108: eptr+= len;
5109: }
5110: break;
5111:
5112: case PT_GC:
5113: for (i = min; i < max; i++)
5114: {
5115: int len = 1;
5116: if (eptr >= md->end_subject)
5117: {
5118: SCHECK_PARTIAL();
5119: break;
5120: }
5121: GETCHARLENTEST(c, eptr, len);
5122: if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5123: eptr+= len;
5124: }
5125: break;
5126:
5127: case PT_PC:
5128: for (i = min; i < max; i++)
5129: {
5130: int len = 1;
5131: if (eptr >= md->end_subject)
5132: {
5133: SCHECK_PARTIAL();
5134: break;
5135: }
5136: GETCHARLENTEST(c, eptr, len);
5137: if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5138: eptr+= len;
5139: }
5140: break;
5141:
5142: case PT_SC:
5143: for (i = min; i < max; i++)
5144: {
5145: int len = 1;
5146: if (eptr >= md->end_subject)
5147: {
5148: SCHECK_PARTIAL();
5149: break;
5150: }
5151: GETCHARLENTEST(c, eptr, len);
5152: if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5153: eptr+= len;
5154: }
5155: break;
5156:
5157: case PT_ALNUM:
5158: for (i = min; i < max; i++)
5159: {
5160: int category;
5161: int len = 1;
5162: if (eptr >= md->end_subject)
5163: {
5164: SCHECK_PARTIAL();
5165: break;
5166: }
5167: GETCHARLENTEST(c, eptr, len);
5168: category = UCD_CATEGORY(c);
5169: if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5170: break;
5171: eptr+= len;
5172: }
5173: break;
5174:
5175: case PT_SPACE: /* Perl space */
5176: for (i = min; i < max; i++)
5177: {
5178: int len = 1;
5179: if (eptr >= md->end_subject)
5180: {
5181: SCHECK_PARTIAL();
5182: break;
5183: }
5184: GETCHARLENTEST(c, eptr, len);
5185: if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5186: c == CHAR_FF || c == CHAR_CR)
5187: == prop_fail_result)
5188: break;
5189: eptr+= len;
5190: }
5191: break;
5192:
5193: case PT_PXSPACE: /* POSIX space */
5194: for (i = min; i < max; i++)
5195: {
5196: int len = 1;
5197: if (eptr >= md->end_subject)
5198: {
5199: SCHECK_PARTIAL();
5200: break;
5201: }
5202: GETCHARLENTEST(c, eptr, len);
5203: if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5204: c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5205: == prop_fail_result)
5206: break;
5207: eptr+= len;
5208: }
5209: break;
5210:
5211: case PT_WORD:
5212: for (i = min; i < max; i++)
5213: {
5214: int category;
5215: int len = 1;
5216: if (eptr >= md->end_subject)
5217: {
5218: SCHECK_PARTIAL();
5219: break;
5220: }
5221: GETCHARLENTEST(c, eptr, len);
5222: category = UCD_CATEGORY(c);
5223: if ((category == ucp_L || category == ucp_N ||
5224: c == CHAR_UNDERSCORE) == prop_fail_result)
5225: break;
5226: eptr+= len;
5227: }
5228: break;
5229:
5230: default:
5231: RRETURN(PCRE_ERROR_INTERNAL);
5232: }
5233:
5234: /* eptr is now past the end of the maximum run */
5235:
5236: if (possessive) continue;
5237: for(;;)
5238: {
5239: RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5240: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5241: if (eptr-- == pp) break; /* Stop if tried at original pos */
5242: if (utf8) BACKCHAR(eptr);
5243: }
5244: }
5245:
5246: /* Match extended Unicode sequences. We will get here only if the
5247: support is in the binary; otherwise a compile-time error occurs. */
5248:
5249: else if (ctype == OP_EXTUNI)
5250: {
5251: for (i = min; i < max; i++)
5252: {
5253: int len = 1;
5254: if (eptr >= md->end_subject)
5255: {
5256: SCHECK_PARTIAL();
5257: break;
5258: }
5259: if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5260: if (UCD_CATEGORY(c) == ucp_M) break;
5261: eptr += len;
5262: while (eptr < md->end_subject)
5263: {
5264: len = 1;
5265: if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5266: if (UCD_CATEGORY(c) != ucp_M) break;
5267: eptr += len;
5268: }
5269: }
5270:
5271: /* eptr is now past the end of the maximum run */
5272:
5273: if (possessive) continue;
5274:
5275: for(;;)
5276: {
5277: RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5278: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5279: if (eptr-- == pp) break; /* Stop if tried at original pos */
5280: for (;;) /* Move back over one extended */
5281: {
5282: if (!utf8) c = *eptr; else
5283: {
5284: BACKCHAR(eptr);
5285: GETCHAR(c, eptr);
5286: }
5287: if (UCD_CATEGORY(c) != ucp_M) break;
5288: eptr--;
5289: }
5290: }
5291: }
5292:
5293: else
5294: #endif /* SUPPORT_UCP */
5295:
5296: #ifdef SUPPORT_UTF8
5297: /* UTF-8 mode */
5298:
5299: if (utf8)
5300: {
5301: switch(ctype)
5302: {
5303: case OP_ANY:
5304: if (max < INT_MAX)
5305: {
5306: for (i = min; i < max; i++)
5307: {
5308: if (eptr >= md->end_subject)
5309: {
5310: SCHECK_PARTIAL();
5311: break;
5312: }
5313: if (IS_NEWLINE(eptr)) break;
5314: eptr++;
5315: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5316: }
5317: }
5318:
5319: /* Handle unlimited UTF-8 repeat */
5320:
5321: else
5322: {
5323: for (i = min; i < max; i++)
5324: {
5325: if (eptr >= md->end_subject)
5326: {
5327: SCHECK_PARTIAL();
5328: break;
5329: }
5330: if (IS_NEWLINE(eptr)) break;
5331: eptr++;
5332: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5333: }
5334: }
5335: break;
5336:
5337: case OP_ALLANY:
5338: if (max < INT_MAX)
5339: {
5340: for (i = min; i < max; i++)
5341: {
5342: if (eptr >= md->end_subject)
5343: {
5344: SCHECK_PARTIAL();
5345: break;
5346: }
5347: eptr++;
5348: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5349: }
5350: }
5351: else
5352: {
5353: eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5354: SCHECK_PARTIAL();
5355: }
5356: break;
5357:
5358: /* The byte case is the same as non-UTF8 */
5359:
5360: case OP_ANYBYTE:
5361: c = max - min;
5362: if (c > (unsigned int)(md->end_subject - eptr))
5363: {
5364: eptr = md->end_subject;
5365: SCHECK_PARTIAL();
5366: }
5367: else eptr += c;
5368: break;
5369:
5370: case OP_ANYNL:
5371: for (i = min; i < max; i++)
5372: {
5373: int len = 1;
5374: if (eptr >= md->end_subject)
5375: {
5376: SCHECK_PARTIAL();
5377: break;
5378: }
5379: GETCHARLEN(c, eptr, len);
5380: if (c == 0x000d)
5381: {
5382: if (++eptr >= md->end_subject) break;
5383: if (*eptr == 0x000a) eptr++;
5384: }
5385: else
5386: {
5387: if (c != 0x000a &&
5388: (md->bsr_anycrlf ||
5389: (c != 0x000b && c != 0x000c &&
5390: c != 0x0085 && c != 0x2028 && c != 0x2029)))
5391: break;
5392: eptr += len;
5393: }
5394: }
5395: break;
5396:
5397: case OP_NOT_HSPACE:
5398: case OP_HSPACE:
5399: for (i = min; i < max; i++)
5400: {
5401: BOOL gotspace;
5402: int len = 1;
5403: if (eptr >= md->end_subject)
5404: {
5405: SCHECK_PARTIAL();
5406: break;
5407: }
5408: GETCHARLEN(c, eptr, len);
5409: switch(c)
5410: {
5411: default: gotspace = FALSE; break;
5412: case 0x09: /* HT */
5413: case 0x20: /* SPACE */
5414: case 0xa0: /* NBSP */
5415: case 0x1680: /* OGHAM SPACE MARK */
5416: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5417: case 0x2000: /* EN QUAD */
5418: case 0x2001: /* EM QUAD */
5419: case 0x2002: /* EN SPACE */
5420: case 0x2003: /* EM SPACE */
5421: case 0x2004: /* THREE-PER-EM SPACE */
5422: case 0x2005: /* FOUR-PER-EM SPACE */
5423: case 0x2006: /* SIX-PER-EM SPACE */
5424: case 0x2007: /* FIGURE SPACE */
5425: case 0x2008: /* PUNCTUATION SPACE */
5426: case 0x2009: /* THIN SPACE */
5427: case 0x200A: /* HAIR SPACE */
5428: case 0x202f: /* NARROW NO-BREAK SPACE */
5429: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5430: case 0x3000: /* IDEOGRAPHIC SPACE */
5431: gotspace = TRUE;
5432: break;
5433: }
5434: if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5435: eptr += len;
5436: }
5437: break;
5438:
5439: case OP_NOT_VSPACE:
5440: case OP_VSPACE:
5441: for (i = min; i < max; i++)
5442: {
5443: BOOL gotspace;
5444: int len = 1;
5445: if (eptr >= md->end_subject)
5446: {
5447: SCHECK_PARTIAL();
5448: break;
5449: }
5450: GETCHARLEN(c, eptr, len);
5451: switch(c)
5452: {
5453: default: gotspace = FALSE; break;
5454: case 0x0a: /* LF */
5455: case 0x0b: /* VT */
5456: case 0x0c: /* FF */
5457: case 0x0d: /* CR */
5458: case 0x85: /* NEL */
5459: case 0x2028: /* LINE SEPARATOR */
5460: case 0x2029: /* PARAGRAPH SEPARATOR */
5461: gotspace = TRUE;
5462: break;
5463: }
5464: if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5465: eptr += len;
5466: }
5467: break;
5468:
5469: case OP_NOT_DIGIT:
5470: for (i = min; i < max; i++)
5471: {
5472: int len = 1;
5473: if (eptr >= md->end_subject)
5474: {
5475: SCHECK_PARTIAL();
5476: break;
5477: }
5478: GETCHARLEN(c, eptr, len);
5479: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5480: eptr+= len;
5481: }
5482: break;
5483:
5484: case OP_DIGIT:
5485: for (i = min; i < max; i++)
5486: {
5487: int len = 1;
5488: if (eptr >= md->end_subject)
5489: {
5490: SCHECK_PARTIAL();
5491: break;
5492: }
5493: GETCHARLEN(c, eptr, len);
5494: if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5495: eptr+= len;
5496: }
5497: break;
5498:
5499: case OP_NOT_WHITESPACE:
5500: for (i = min; i < max; i++)
5501: {
5502: int len = 1;
5503: if (eptr >= md->end_subject)
5504: {
5505: SCHECK_PARTIAL();
5506: break;
5507: }
5508: GETCHARLEN(c, eptr, len);
5509: if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5510: eptr+= len;
5511: }
5512: break;
5513:
5514: case OP_WHITESPACE:
5515: for (i = min; i < max; i++)
5516: {
5517: int len = 1;
5518: if (eptr >= md->end_subject)
5519: {
5520: SCHECK_PARTIAL();
5521: break;
5522: }
5523: GETCHARLEN(c, eptr, len);
5524: if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5525: eptr+= len;
5526: }
5527: break;
5528:
5529: case OP_NOT_WORDCHAR:
5530: for (i = min; i < max; i++)
5531: {
5532: int len = 1;
5533: if (eptr >= md->end_subject)
5534: {
5535: SCHECK_PARTIAL();
5536: break;
5537: }
5538: GETCHARLEN(c, eptr, len);
5539: if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5540: eptr+= len;
5541: }
5542: break;
5543:
5544: case OP_WORDCHAR:
5545: for (i = min; i < max; i++)
5546: {
5547: int len = 1;
5548: if (eptr >= md->end_subject)
5549: {
5550: SCHECK_PARTIAL();
5551: break;
5552: }
5553: GETCHARLEN(c, eptr, len);
5554: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5555: eptr+= len;
5556: }
5557: break;
5558:
5559: default:
5560: RRETURN(PCRE_ERROR_INTERNAL);
5561: }
5562:
5563: /* eptr is now past the end of the maximum run. If possessive, we are
5564: done (no backing up). Otherwise, match at this position; anything other
5565: than no match is immediately returned. For nomatch, back up one
5566: character, unless we are matching \R and the last thing matched was
5567: \r\n, in which case, back up two bytes. */
5568:
5569: if (possessive) continue;
5570: for(;;)
5571: {
5572: RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5573: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5574: if (eptr-- == pp) break; /* Stop if tried at original pos */
5575: BACKCHAR(eptr);
5576: if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5577: eptr[-1] == '\r') eptr--;
5578: }
5579: }
5580: else
5581: #endif /* SUPPORT_UTF8 */
5582:
5583: /* Not UTF-8 mode */
5584: {
5585: switch(ctype)
5586: {
5587: case OP_ANY:
5588: for (i = min; i < max; i++)
5589: {
5590: if (eptr >= md->end_subject)
5591: {
5592: SCHECK_PARTIAL();
5593: break;
5594: }
5595: if (IS_NEWLINE(eptr)) break;
5596: eptr++;
5597: }
5598: break;
5599:
5600: case OP_ALLANY:
5601: case OP_ANYBYTE:
5602: c = max - min;
5603: if (c > (unsigned int)(md->end_subject - eptr))
5604: {
5605: eptr = md->end_subject;
5606: SCHECK_PARTIAL();
5607: }
5608: else eptr += c;
5609: break;
5610:
5611: case OP_ANYNL:
5612: for (i = min; i < max; i++)
5613: {
5614: if (eptr >= md->end_subject)
5615: {
5616: SCHECK_PARTIAL();
5617: break;
5618: }
5619: c = *eptr;
5620: if (c == 0x000d)
5621: {
5622: if (++eptr >= md->end_subject) break;
5623: if (*eptr == 0x000a) eptr++;
5624: }
5625: else
5626: {
5627: if (c != 0x000a &&
5628: (md->bsr_anycrlf ||
5629: (c != 0x000b && c != 0x000c && c != 0x0085)))
5630: break;
5631: eptr++;
5632: }
5633: }
5634: break;
5635:
5636: case OP_NOT_HSPACE:
5637: for (i = min; i < max; i++)
5638: {
5639: if (eptr >= md->end_subject)
5640: {
5641: SCHECK_PARTIAL();
5642: break;
5643: }
5644: c = *eptr;
5645: if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5646: eptr++;
5647: }
5648: break;
5649:
5650: case OP_HSPACE:
5651: for (i = min; i < max; i++)
5652: {
5653: if (eptr >= md->end_subject)
5654: {
5655: SCHECK_PARTIAL();
5656: break;
5657: }
5658: c = *eptr;
5659: if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5660: eptr++;
5661: }
5662: break;
5663:
5664: case OP_NOT_VSPACE:
5665: for (i = min; i < max; i++)
5666: {
5667: if (eptr >= md->end_subject)
5668: {
5669: SCHECK_PARTIAL();
5670: break;
5671: }
5672: c = *eptr;
5673: if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5674: break;
5675: eptr++;
5676: }
5677: break;
5678:
5679: case OP_VSPACE:
5680: for (i = min; i < max; i++)
5681: {
5682: if (eptr >= md->end_subject)
5683: {
5684: SCHECK_PARTIAL();
5685: break;
5686: }
5687: c = *eptr;
5688: if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5689: break;
5690: eptr++;
5691: }
5692: break;
5693:
5694: case OP_NOT_DIGIT:
5695: for (i = min; i < max; i++)
5696: {
5697: if (eptr >= md->end_subject)
5698: {
5699: SCHECK_PARTIAL();
5700: break;
5701: }
5702: if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5703: eptr++;
5704: }
5705: break;
5706:
5707: case OP_DIGIT:
5708: for (i = min; i < max; i++)
5709: {
5710: if (eptr >= md->end_subject)
5711: {
5712: SCHECK_PARTIAL();
5713: break;
5714: }
5715: if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5716: eptr++;
5717: }
5718: break;
5719:
5720: case OP_NOT_WHITESPACE:
5721: for (i = min; i < max; i++)
5722: {
5723: if (eptr >= md->end_subject)
5724: {
5725: SCHECK_PARTIAL();
5726: break;
5727: }
5728: if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5729: eptr++;
5730: }
5731: break;
5732:
5733: case OP_WHITESPACE:
5734: for (i = min; i < max; i++)
5735: {
5736: if (eptr >= md->end_subject)
5737: {
5738: SCHECK_PARTIAL();
5739: break;
5740: }
5741: if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5742: eptr++;
5743: }
5744: break;
5745:
5746: case OP_NOT_WORDCHAR:
5747: for (i = min; i < max; i++)
5748: {
5749: if (eptr >= md->end_subject)
5750: {
5751: SCHECK_PARTIAL();
5752: break;
5753: }
5754: if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5755: eptr++;
5756: }
5757: break;
5758:
5759: case OP_WORDCHAR:
5760: for (i = min; i < max; i++)
5761: {
5762: if (eptr >= md->end_subject)
5763: {
5764: SCHECK_PARTIAL();
5765: break;
5766: }
5767: if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5768: eptr++;
5769: }
5770: break;
5771:
5772: default:
5773: RRETURN(PCRE_ERROR_INTERNAL);
5774: }
5775:
5776: /* eptr is now past the end of the maximum run. If possessive, we are
5777: done (no backing up). Otherwise, match at this position; anything other
5778: than no match is immediately returned. For nomatch, back up one
5779: character (byte), unless we are matching \R and the last thing matched
5780: was \r\n, in which case, back up two bytes. */
5781:
5782: if (possessive) continue;
5783: while (eptr >= pp)
5784: {
5785: RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5786: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5787: eptr--;
5788: if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5789: eptr[-1] == '\r') eptr--;
5790: }
5791: }
5792:
5793: /* Get here if we can't make it match with any permitted repetitions */
5794:
5795: RRETURN(MATCH_NOMATCH);
5796: }
5797: /* Control never gets here */
5798:
5799: /* There's been some horrible disaster. Arrival here can only mean there is
5800: something seriously wrong in the code above or the OP_xxx definitions. */
5801:
5802: default:
5803: DPRINTF(("Unknown opcode %d\n", *ecode));
5804: RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5805: }
5806:
5807: /* Do not stick any code in here without much thought; it is assumed
5808: that "continue" in the code above comes out to here to repeat the main
5809: loop. */
5810:
5811: } /* End of main loop */
5812: /* Control never reaches here */
5813:
5814:
5815: /* When compiling to use the heap rather than the stack for recursive calls to
5816: match(), the RRETURN() macro jumps here. The number that is saved in
5817: frame->Xwhere indicates which label we actually want to return to. */
5818:
5819: #ifdef NO_RECURSE
5820: #define LBL(val) case val: goto L_RM##val;
5821: HEAP_RETURN:
5822: switch (frame->Xwhere)
5823: {
5824: LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5825: LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5826: LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5827: LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5828: LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
5829: LBL(65) LBL(66)
5830: #ifdef SUPPORT_UTF8
5831: LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5832: LBL(32) LBL(34) LBL(42) LBL(46)
5833: #ifdef SUPPORT_UCP
5834: LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5835: LBL(59) LBL(60) LBL(61) LBL(62)
5836: #endif /* SUPPORT_UCP */
5837: #endif /* SUPPORT_UTF8 */
5838: default:
5839: DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5840: return PCRE_ERROR_INTERNAL;
5841: }
5842: #undef LBL
5843: #endif /* NO_RECURSE */
5844: }
5845:
5846:
5847: /***************************************************************************
5848: ****************************************************************************
5849: RECURSION IN THE match() FUNCTION
5850:
5851: Undefine all the macros that were defined above to handle this. */
5852:
5853: #ifdef NO_RECURSE
/* In the NO_RECURSE (heap-frame) build the names below are macros while
match() is being compiled. Remove them here so that the rest of the file sees
them as ordinary identifiers again; pcre_exec(), for instance, has a parameter
called "length" and local variables called "flags" and "pp". */
5854: #undef eptr
5855: #undef ecode
5856: #undef mstart
5857: #undef offset_top
5858: #undef eptrb
5859: #undef flags
5860:
5861: #undef callpat
5862: #undef charptr
5863: #undef data
5864: #undef next
5865: #undef pp
5866: #undef prev
5867: #undef saved_eptr
5868:
5869: #undef new_recursive
5870:
5871: #undef cur_is_word
5872: #undef condition
5873: #undef prev_is_word
5874:
5875: #undef ctype
5876: #undef length
5877: #undef max
5878: #undef min
5879: #undef number
5880: #undef offset
5881: #undef op
5882: #undef save_capture_last
5883: #undef save_offset1
5884: #undef save_offset2
5885: #undef save_offset3
5886: #undef stacksave
5887:
5888: #undef newptrb
5889:
5890: #endif /* NO_RECURSE */
5891:
5892: /* fc and fi are defined as macros whether or not NO_RECURSE is set, so they
are always undefined here, outside the conditional above. */
5893:
5894: #undef fc
5895: #undef fi
5896:
5897: /***************************************************************************
5898: ***************************************************************************/
5899:
5900:
5901:
5902: /*************************************************
5903: * Execute a Regular Expression *
5904: *************************************************/
5905:
5906: /* This function applies a compiled re to a subject string and picks out
5907: portions of the string if it matches. Two elements in the vector are set for
5908: each substring: the offsets to the start and end of the substring.
5909:
5910: Arguments:
5911: argument_re points to the compiled expression
5912: extra_data points to extra data or is NULL
5913: subject points to the subject string
5914: length length of subject string (may contain binary zeros)
5915: start_offset where to start in the subject string
5916: options option bits
5917: offsets points to a vector of ints to be filled in with offsets
5918: offsetcount the number of elements in the vector
5919:
5920: Returns: > 0 => success; value is the number of elements filled in
5921: = 0 => success, but offsets is not big enough
5922: -1 => failed to match
5923: < -1 => some kind of unexpected problem
5924: */
5925:
5926: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5927: pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5928: PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5929: int offsetcount)
5930: {
5931: int rc, ocount, arg_offset_max;
5932: int first_byte = -1;
5933: int req_byte = -1;
5934: int req_byte2 = -1;
5935: int newline;
5936: BOOL using_temporary_offsets = FALSE;
5937: BOOL anchored;
5938: BOOL startline;
5939: BOOL firstline;
5940: BOOL first_byte_caseless = FALSE;
5941: BOOL req_byte_caseless = FALSE;
5942: BOOL utf8;
5943: match_data match_block;
5944: match_data *md = &match_block;
5945: const uschar *tables;
5946: const uschar *start_bits = NULL;
5947: USPTR start_match = (USPTR)subject + start_offset;
5948: USPTR end_subject;
5949: USPTR start_partial = NULL;
5950: USPTR req_byte_ptr = start_match - 1;
5951:
5952: pcre_study_data internal_study;
5953: const pcre_study_data *study;
5954:
5955: real_pcre internal_re;
5956: const real_pcre *external_re = (const real_pcre *)argument_re;
5957: const real_pcre *re = external_re;
5958:
5959: /* Plausibility checks */
5960:
5961: if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5962: if (re == NULL || subject == NULL ||
5963: (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5964: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5965: if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5966:
5967: /* These two settings are used in the code for checking a UTF-8 string that
5968: follows immediately afterwards. Other values in the md block are used only
5969: during "normal" pcre_exec() processing, not when the JIT support is in use,
5970: so they are set up later. */
5971:
5972: utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5973: md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5974: ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5975:
5976: /* Check a UTF-8 string if required. Pass back the character offset and error
5977: code for an invalid string if a results vector is available. */
5978:
5979: #ifdef SUPPORT_UTF8
5980: if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5981: {
5982: int erroroffset;
5983: int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
5984: if (errorcode != 0)
5985: {
5986: if (offsetcount >= 2)
5987: {
5988: offsets[0] = erroroffset;
5989: offsets[1] = errorcode;
5990: }
5991: return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
5992: PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
5993: }
5994:
5995: /* Check that a start_offset points to the start of a UTF-8 character. */
5996: if (start_offset > 0 && start_offset < length &&
5997: (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
5998: return PCRE_ERROR_BADUTF8_OFFSET;
5999: }
6000: #endif
6001:
6002: /* If the pattern was successfully studied with JIT support, run the JIT
6003: executable instead of the rest of this function. Most options must be set at
6004: compile time for the JIT code to be usable. Fall back to the normal code path if
6005: an unsupported flag is set. In particular, JIT does not support partial
6006: matching. */
6007:
6008: #ifdef SUPPORT_JIT
6009: if (extra_data != NULL
6010: && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
6011: && extra_data->executable_jit != NULL
6012: && (extra_data->flags & PCRE_EXTRA_TABLES) == 0
6013: && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6014: PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
6015: return _pcre_jit_exec(re, extra_data->executable_jit, subject, length,
6016: start_offset, options, ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6017: ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
6018: #endif
6019:
6020: /* Carry on with non-JIT matching. This information is for finding all the
6021: numbers associated with a given name, for condition testing. */
6022:
6023: md->name_table = (uschar *)re + re->name_table_offset;
6024: md->name_count = re->name_count;
6025: md->name_entry_size = re->name_entry_size;
6026:
6027: /* Fish out the optional data from the extra_data structure, first setting
6028: the default values. */
6029:
6030: study = NULL;
6031: md->match_limit = MATCH_LIMIT;
6032: md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6033: md->callout_data = NULL;
6034:
6035: /* The table pointer is always in native byte order. */
6036:
6037: tables = external_re->tables;
6038:
6039: if (extra_data != NULL)
6040: {
6041: register unsigned int flags = extra_data->flags;
6042: if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6043: study = (const pcre_study_data *)extra_data->study_data;
6044: if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6045: md->match_limit = extra_data->match_limit;
6046: if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6047: md->match_limit_recursion = extra_data->match_limit_recursion;
6048: if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6049: md->callout_data = extra_data->callout_data;
6050: if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6051: }
6052:
6053: /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6054: is a feature that makes it possible to save compiled regexes and re-use them
6055: in other programs later. */
6056:
6057: if (tables == NULL) tables = _pcre_default_tables;
6058:
6059: /* Check that the first field in the block is the magic number. If it is not,
6060: test for a regex that was compiled on a host of opposite endianness. If this is
6061: the case, flipped values are put in internal_re and internal_study if there was
6062: study data too. */
6063:
6064: if (re->magic_number != MAGIC_NUMBER)
6065: {
6066: re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
6067: if (re == NULL) return PCRE_ERROR_BADMAGIC;
6068: if (study != NULL) study = &internal_study;
6069: }
6070:
6071: /* Set up other data */
6072:
6073: anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6074: startline = (re->flags & PCRE_STARTLINE) != 0;
6075: firstline = (re->options & PCRE_FIRSTLINE) != 0;
6076:
6077: /* The code starts after the real_pcre block and the capture name table. */
6078:
6079: md->start_code = (const uschar *)external_re + re->name_table_offset +
6080: re->name_count * re->name_entry_size;
6081:
6082: md->start_subject = (USPTR)subject;
6083: md->start_offset = start_offset;
6084: md->end_subject = md->start_subject + length;
6085: end_subject = md->end_subject;
6086:
6087: md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6088: md->use_ucp = (re->options & PCRE_UCP) != 0;
6089: md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6090: md->ignore_skip_arg = FALSE;
6091:
6092: /* Some options are unpacked into BOOL variables in the hope that testing
6093: them will be faster than individual option bits. */
6094:
6095: md->notbol = (options & PCRE_NOTBOL) != 0;
6096: md->noteol = (options & PCRE_NOTEOL) != 0;
6097: md->notempty = (options & PCRE_NOTEMPTY) != 0;
6098: md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6099:
6100: md->hitend = FALSE;
6101: md->mark = md->nomatch_mark = NULL; /* In case never set */
6102:
6103: md->recursive = NULL; /* No recursion at top level */
6104: md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6105:
6106: md->lcc = tables + lcc_offset;
6107: md->ctypes = tables + ctypes_offset;
6108:
6109: /* Handle different \R options. */
6110:
6111: switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6112: {
6113: case 0:
6114: if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6115: md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6116: else
6117: #ifdef BSR_ANYCRLF
6118: md->bsr_anycrlf = TRUE;
6119: #else
6120: md->bsr_anycrlf = FALSE;
6121: #endif
6122: break;
6123:
6124: case PCRE_BSR_ANYCRLF:
6125: md->bsr_anycrlf = TRUE;
6126: break;
6127:
6128: case PCRE_BSR_UNICODE:
6129: md->bsr_anycrlf = FALSE;
6130: break;
6131:
6132: default: return PCRE_ERROR_BADNEWLINE;
6133: }
6134:
6135: /* Handle different types of newline. The three bits give eight cases. If
6136: nothing is set at run time, whatever was used at compile time applies. */
6137:
6138: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6139: (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6140: {
6141: case 0: newline = NEWLINE; break; /* Compile-time default */
6142: case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6143: case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6144: case PCRE_NEWLINE_CR+
6145: PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6146: case PCRE_NEWLINE_ANY: newline = -1; break;
6147: case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6148: default: return PCRE_ERROR_BADNEWLINE;
6149: }
6150:
6151: if (newline == -2)
6152: {
6153: md->nltype = NLTYPE_ANYCRLF;
6154: }
6155: else if (newline < 0)
6156: {
6157: md->nltype = NLTYPE_ANY;
6158: }
6159: else
6160: {
6161: md->nltype = NLTYPE_FIXED;
6162: if (newline > 255)
6163: {
6164: md->nllen = 2;
6165: md->nl[0] = (newline >> 8) & 255;
6166: md->nl[1] = newline & 255;
6167: }
6168: else
6169: {
6170: md->nllen = 1;
6171: md->nl[0] = newline;
6172: }
6173: }
6174:
6175: /* Partial matching was originally supported only for a restricted set of
6176: regexes; from release 8.00 there are no restrictions, but the bits are still
6177: defined (though never set). So there's no harm in leaving this code. */
6178:
6179: if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6180: return PCRE_ERROR_BADPARTIAL;
6181:
6182: /* If the expression has got more back references than the offsets supplied can
6183: hold, we get a temporary chunk of working store to use during the matching.
6184: Otherwise, we can use the vector supplied, rounding down its size to a multiple
6185: of 3. */
6186:
6187: ocount = offsetcount - (offsetcount % 3);
6188: arg_offset_max = (2*ocount)/3;
6189:
6190: if (re->top_backref > 0 && re->top_backref >= ocount/3)
6191: {
6192: ocount = re->top_backref * 3 + 3;
6193: md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6194: if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6195: using_temporary_offsets = TRUE;
6196: DPRINTF(("Got memory to hold back references\n"));
6197: }
6198: else md->offset_vector = offsets;
6199:
6200: md->offset_end = ocount;
6201: md->offset_max = (2*ocount)/3;
6202: md->offset_overflow = FALSE;
6203: md->capture_last = -1;
6204:
6205: /* Reset the working variable associated with each extraction. These should
6206: never be used unless previously set, but they get saved and restored, and so we
6207: initialize them to avoid reading uninitialized locations. Also, unset the
6208: offsets for the matched string. This is really just for tidiness with callouts,
6209: in case they inspect these fields. */
6210:
6211: if (md->offset_vector != NULL)
6212: {
6213: register int *iptr = md->offset_vector + ocount;
6214: register int *iend = iptr - re->top_bracket;
6215: if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6216: while (--iptr >= iend) *iptr = -1;
6217: md->offset_vector[0] = md->offset_vector[1] = -1;
6218: }
6219:
6220: /* Set up the first character to match, if available. The first_byte value is
6221: never set for an anchored regular expression, but the anchoring may be forced
6222: at run time, so we have to test for anchoring. The first char may be unset for
6223: an unanchored pattern, of course. If there's no first char and the pattern was
6224: studied, there may be a bitmap of possible first characters. */
6225:
6226: if (!anchored)
6227: {
6228: if ((re->flags & PCRE_FIRSTSET) != 0)
6229: {
6230: first_byte = re->first_byte & 255;
6231: if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
6232: first_byte = md->lcc[first_byte];
6233: }
6234: else
6235: if (!startline && study != NULL &&
6236: (study->flags & PCRE_STUDY_MAPPED) != 0)
6237: start_bits = study->start_bits;
6238: }
6239:
6240: /* For anchored or unanchored matches, there may be a "last known required
6241: character" set. */
6242:
6243: if ((re->flags & PCRE_REQCHSET) != 0)
6244: {
6245: req_byte = re->req_byte & 255;
6246: req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
6247: req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
6248: }
6249:
6250:
6251:
6252:
6253: /* ==========================================================================*/
6254:
6255: /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6256: the loop runs just once. */
6257:
6258: for(;;)
6259: {
6260: USPTR save_end_subject = end_subject;
6261: USPTR new_start_match;
6262:
6263: /* If firstline is TRUE, the start of the match is constrained to the first
6264: line of a multiline string. That is, the match must be before or at the first
6265: newline. Implement this by temporarily adjusting end_subject so that we stop
6266: scanning at a newline. If the match fails at the newline, later code breaks
6267: this loop. */
6268:
6269: if (firstline)
6270: {
6271: USPTR t = start_match;
6272: #ifdef SUPPORT_UTF8
6273: if (utf8)
6274: {
6275: while (t < md->end_subject && !IS_NEWLINE(t))
6276: {
6277: t++;
6278: while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6279: }
6280: }
6281: else
6282: #endif
6283: while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6284: end_subject = t;
6285: }
6286:
6287: /* There are some optimizations that avoid running the match if a known
6288: starting point is not found, or if a known later character is not present.
6289: However, there is an option that disables these, for testing and for ensuring
6290: that all callouts do actually occur. The option can be set in the regex by
6291: (*NO_START_OPT) or passed in match-time options. */
6292:
6293: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6294: {
6295: /* Advance to a unique first byte if there is one. */
6296:
6297: if (first_byte >= 0)
6298: {
6299: if (first_byte_caseless)
6300: while (start_match < end_subject && md->lcc[*start_match] != first_byte)
6301: start_match++;
6302: else
6303: while (start_match < end_subject && *start_match != first_byte)
6304: start_match++;
6305: }
6306:
6307: /* Or to just after a linebreak for a multiline match */
6308:
6309: else if (startline)
6310: {
6311: if (start_match > md->start_subject + start_offset)
6312: {
6313: #ifdef SUPPORT_UTF8
6314: if (utf8)
6315: {
6316: while (start_match < end_subject && !WAS_NEWLINE(start_match))
6317: {
6318: start_match++;
6319: while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6320: start_match++;
6321: }
6322: }
6323: else
6324: #endif
6325: while (start_match < end_subject && !WAS_NEWLINE(start_match))
6326: start_match++;
6327:
6328: /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6329: and we are now at a LF, advance the match position by one more character.
6330: */
6331:
6332: if (start_match[-1] == CHAR_CR &&
6333: (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6334: start_match < end_subject &&
6335: *start_match == CHAR_NL)
6336: start_match++;
6337: }
6338: }
6339:
6340: /* Or to a non-unique first byte after study */
6341:
6342: else if (start_bits != NULL)
6343: {
6344: while (start_match < end_subject)
6345: {
6346: register unsigned int c = *start_match;
6347: if ((start_bits[c/8] & (1 << (c&7))) == 0)
6348: {
6349: start_match++;
6350: #ifdef SUPPORT_UTF8
6351: if (utf8)
6352: while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6353: start_match++;
6354: #endif
6355: }
6356: else break;
6357: }
6358: }
6359: } /* Starting optimizations */
6360:
6361: /* Restore fudged end_subject */
6362:
6363: end_subject = save_end_subject;
6364:
6365: /* The following two optimizations are disabled for partial matching or if
6366: disabling is explicitly requested. */
6367:
6368: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6369: {
6370: /* If the pattern was studied, a minimum subject length may be set. This is
6371: a lower bound; there need not be any string of that length that actually
6372: matches the pattern. Although the value is, strictly, in characters, we treat
6373: it as bytes to avoid spending too much time in this optimization. */
6374:
6375: if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6376: (pcre_uint32)(end_subject - start_match) < study->minlength)
6377: {
6378: rc = MATCH_NOMATCH;
6379: break;
6380: }
6381:
6382: /* If req_byte is set, we know that that character must appear in the
6383: subject for the match to succeed. If the first character is set, req_byte
6384: must be later in the subject; otherwise the test starts at the match point.
6385: This optimization can save a huge amount of backtracking in patterns with
6386: nested unlimited repeats that aren't going to match. Writing separate code
6387: for cased/caseless versions makes it go faster, as does using an
6388: autoincrement and backing off on a match.
6389:
6390: HOWEVER: when the subject string is very, very long, searching to its end
6391: can take a long time, and give bad performance on quite ordinary patterns.
6392: This showed up when somebody was matching something like /^\d+C/ on a
6393: 32-megabyte string... so we don't do this when the string is sufficiently
6394: long. */
6395:
6396: if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6397: {
6398: register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6399:
6400: /* We don't need to repeat the search if we haven't yet reached the
6401: place we found it at last time. */
6402:
6403: if (p > req_byte_ptr)
6404: {
6405: if (req_byte_caseless)
6406: {
6407: while (p < end_subject)
6408: {
6409: register int pp = *p++;
6410: if (pp == req_byte || pp == req_byte2) { p--; break; }
6411: }
6412: }
6413: else
6414: {
6415: while (p < end_subject)
6416: {
6417: if (*p++ == req_byte) { p--; break; }
6418: }
6419: }
6420:
6421: /* If we can't find the required character, break the matching loop,
6422: forcing a match failure. */
6423:
6424: if (p >= end_subject)
6425: {
6426: rc = MATCH_NOMATCH;
6427: break;
6428: }
6429:
6430: /* If we have found the required character, save the point where we
6431: found it, so that we don't search again next time round the loop if
6432: the start hasn't passed this character yet. */
6433:
6434: req_byte_ptr = p;
6435: }
6436: }
6437: }
6438:
6439: #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6440: printf(">>>> Match against: ");
6441: pchars(start_match, end_subject - start_match, TRUE, md);
6442: printf("\n");
6443: #endif
6444:
6445: /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6446: first starting point for which a partial match was found. */
6447:
6448: md->start_match_ptr = start_match;
6449: md->start_used_ptr = start_match;
6450: md->match_call_count = 0;
6451: md->match_function_type = 0;
6452: md->end_offset_top = 0;
6453: rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
6454: if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6455:
6456: switch(rc)
6457: {
6458: /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6459: the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6460: entirely. The only way we can do that is to re-do the match at the same
6461: point, with a flag to force SKIP with an argument to be ignored. Just
6462: treating this case as NOMATCH does not work because it does not check other
6463: alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6464:
6465: case MATCH_SKIP_ARG:
6466: new_start_match = start_match;
6467: md->ignore_skip_arg = TRUE;
6468: break;
6469:
6470: /* SKIP passes back the next starting point explicitly, but if it is the
6471: same as the match we have just done, treat it as NOMATCH. */
6472:
6473: case MATCH_SKIP:
6474: if (md->start_match_ptr != start_match)
6475: {
6476: new_start_match = md->start_match_ptr;
6477: break;
6478: }
6479: /* Fall through */
6480:
6481: /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6482: exactly like PRUNE. Unset the ignore SKIP-with-argument flag. */
6483:
6484: case MATCH_NOMATCH:
6485: case MATCH_PRUNE:
6486: case MATCH_THEN:
6487: md->ignore_skip_arg = FALSE;
6488: new_start_match = start_match + 1;
6489: #ifdef SUPPORT_UTF8
6490: if (utf8)
6491: while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6492: new_start_match++;
6493: #endif
6494: break;
6495:
6496: /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6497:
6498: case MATCH_COMMIT:
6499: rc = MATCH_NOMATCH;
6500: goto ENDLOOP;
6501:
6502: /* Any other return is either a match, or some kind of error. */
6503:
6504: default:
6505: goto ENDLOOP;
6506: }
6507:
6508: /* Control reaches here for the various types of "no match at this point"
6509: result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6510:
6511: rc = MATCH_NOMATCH;
6512:
6513: /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6514: newline in the subject (though it may continue over the newline). Therefore,
6515: if we have just failed to match, starting at a newline, do not continue. */
6516:
6517: if (firstline && IS_NEWLINE(start_match)) break;
6518:
6519: /* Advance to new matching position */
6520:
6521: start_match = new_start_match;
6522:
6523: /* Break the loop if the pattern is anchored or if we have passed the end of
6524: the subject. */
6525:
6526: if (anchored || start_match > end_subject) break;
6527:
6528: /* If we have just passed a CR and we are now at a LF, and the pattern does
6529: not contain any explicit matches for \r or \n, and the newline option is CRLF
6530: or ANY or ANYCRLF, advance the match position by one more character. */
6531:
6532: if (start_match[-1] == CHAR_CR &&
6533: start_match < end_subject &&
6534: *start_match == CHAR_NL &&
6535: (re->flags & PCRE_HASCRORLF) == 0 &&
6536: (md->nltype == NLTYPE_ANY ||
6537: md->nltype == NLTYPE_ANYCRLF ||
6538: md->nllen == 2))
6539: start_match++;
6540:
6541: md->mark = NULL; /* Reset for start of next match attempt */
6542: } /* End of for(;;) "bumpalong" loop */
6543:
6544: /* ==========================================================================*/
6545:
6546: /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6547: conditions is true:
6548:
6549: (1) The pattern is anchored or the match was failed by (*COMMIT);
6550:
6551: (2) We are past the end of the subject;
6552:
6553: (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6554: this option requests that a match occur at or before the first newline in
6555: the subject.
6556:
6557: When we have a match and the offset vector is big enough to deal with any
6558: backreferences, captured substring offsets will already be set up. In the case
6559: where we had to get some local store to hold offsets for backreference
6560: processing, copy those that we can. In this case there need not be overflow if
6561: certain parts of the pattern were not used, even though there are more
6562: capturing parentheses than vector slots. */
6563:
6564: ENDLOOP:
6565:
6566: if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6567: {
6568: if (using_temporary_offsets)
6569: {
6570: if (arg_offset_max >= 4)
6571: {
6572: memcpy(offsets + 2, md->offset_vector + 2,
6573: (arg_offset_max - 2) * sizeof(int));
6574: DPRINTF(("Copied offsets from temporary memory\n"));
6575: }
6576: if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
6577: DPRINTF(("Freeing temporary memory\n"));
6578: (pcre_free)(md->offset_vector);
6579: }
6580:
6581: /* Set the return code to the number of captured strings, or 0 if there were
6582: too many to fit into the vector. */
6583:
6584: rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
6585: 0 : md->end_offset_top/2;
6586:
6587: /* If there is space in the offset vector, set any unused pairs at the end of
6588: the pattern to -1 for backwards compatibility. It is documented that this
6589: happens. In earlier versions, the whole set of potential capturing offsets
6590: was set to -1 each time round the loop, but this is handled differently now.
6591: "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6592: those at the end that need unsetting here. We can't just unset them all at
6593: the start of the whole thing because they may get set in one branch that is
6594: not the final matching branch. */
6595:
6596: if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
6597: {
6598: register int *iptr, *iend;
6599: int resetcount = 2 + re->top_bracket * 2;
6600: if (resetcount > offsetcount) resetcount = ocount;
6601: iptr = offsets + md->end_offset_top;
6602: iend = offsets + resetcount;
6603: while (iptr < iend) *iptr++ = -1;
6604: }
6605:
6606: /* If there is space, set up the whole thing as substring 0. The value of
6607: md->start_match_ptr might be modified if \K was encountered on the
6608: successful matching path. */
6609:
6610: if (offsetcount < 2) rc = 0; else
6611: {
6612: offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6613: offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6614: }
6615:
6616: /* Return MARK data if requested */
6617:
6618: if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6619: *(extra_data->mark) = (unsigned char *)(md->mark);
6620: DPRINTF((">>>> returning %d\n", rc));
6621: return rc;
6622: }
6623:
6624: /* Control gets here if there has been an error, or if the overall match
6625: attempt has failed at all permitted starting positions. */
6626:
6627: if (using_temporary_offsets)
6628: {
6629: DPRINTF(("Freeing temporary memory\n"));
6630: (pcre_free)(md->offset_vector);
6631: }
6632:
6633: /* For anything other than nomatch or partial match, just return the code. */
6634:
6635: if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6636: {
6637: DPRINTF((">>>> error: returning %d\n", rc));
6638: return rc;
6639: }
6640:
6641: /* Handle partial matches - disable any mark data */
6642:
6643: if (start_partial != NULL)
6644: {
6645: DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6646: md->mark = NULL;
6647: if (offsetcount > 1)
6648: {
6649: offsets[0] = (int)(start_partial - (USPTR)subject);
6650: offsets[1] = (int)(end_subject - (USPTR)subject);
6651: }
6652: rc = PCRE_ERROR_PARTIAL;
6653: }
6654:
6655: /* This is the classic nomatch case */
6656:
6657: else
6658: {
6659: DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6660: rc = PCRE_ERROR_NOMATCH;
6661: }
6662:
6663: /* Return the MARK data if it has been requested. */
6664:
6665: if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6666: *(extra_data->mark) = (unsigned char *)(md->nomatch_mark);
6667: return rc;
6668: }
6669:
6670: /* End of pcre_exec.c */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>