Annotation of embedaddon/php/ext/pcre/pcrelib/pcre_exec.c, revision 1.1.1.1
1.1 misho 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /* PCRE is a library of functions to support regular expressions whose syntax
6: and semantics are as close as possible to those of the Perl 5 language.
7:
8: Written by Philip Hazel
9: Copyright (c) 1997-2010 University of Cambridge
10:
11: -----------------------------------------------------------------------------
12: Redistribution and use in source and binary forms, with or without
13: modification, are permitted provided that the following conditions are met:
14:
15: * Redistributions of source code must retain the above copyright notice,
16: this list of conditions and the following disclaimer.
17:
18: * Redistributions in binary form must reproduce the above copyright
19: notice, this list of conditions and the following disclaimer in the
20: documentation and/or other materials provided with the distribution.
21:
22: * Neither the name of the University of Cambridge nor the names of its
23: contributors may be used to endorse or promote products derived from
24: this software without specific prior written permission.
25:
26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36: POSSIBILITY OF SUCH DAMAGE.
37: -----------------------------------------------------------------------------
38: */
39:
40:
41: /* This module contains pcre_exec(), the externally visible function that does
42: pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43: possible. There are also some static supporting functions. */
44:
45: #include "config.h"
46:
47: #define NLBLOCK md /* Block containing newline information */
48: #define PSSTART start_subject /* Field containing processed string start */
49: #define PSEND end_subject /* Field containing processed string end */
50:
51: #include "pcre_internal.h"
52:
53: /* Undefine some potentially clashing cpp symbols */
54:
55: #undef min
56: #undef max
57:
58: /* Flag bits for the match() function */
59:
60: #define match_condassert 0x01 /* Called to check a condition assertion */
61: #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
62:
63: /* Non-error returns from the match() function. Error returns are externally
64: defined PCRE_ERROR_xxx codes, which are all negative. */
65:
66: #define MATCH_MATCH 1
67: #define MATCH_NOMATCH 0
68:
69: /* Special internal returns from the match() function. Make them sufficiently
70: negative to avoid the external error codes. */
71:
72: #define MATCH_ACCEPT (-999)
73: #define MATCH_COMMIT (-998)
74: #define MATCH_PRUNE (-997)
75: #define MATCH_SKIP (-996)
76: #define MATCH_SKIP_ARG (-995)
77: #define MATCH_THEN (-994)
78:
79: /* This is a convenience macro for code that occurs many times: it stores the current mark pointer (markptr) in md->mark and then returns ra by way of RRETURN. It relies on md and markptr being in scope where it is used. */
80:
81: #define MRRETURN(ra) \
82: { \
83: md->mark = markptr; \
84: RRETURN(ra); \
85: }
86:
87: /* Maximum number of ints of offset to save on the stack for recursive calls.
88: If the offset vector is bigger, malloc is used. This should be a multiple of 3,
89: because the offset vector is always a multiple of 3 long. */
90:
91: #define REC_STACK_SAVE_MAX 30
92:
93: /* Min and max values for the common repeats; for the maxima, 0 => infinity */
94:
95: static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
96: static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
97:
98:
99:
100: #ifdef PCRE_DEBUG
101: /*************************************************
102: * Debugging function to print chars *
103: *************************************************/
104:
105: /* Print a sequence of chars in printable format, stopping at the end of the
106: subject if the requested.
107:
108: Arguments:
109: p points to characters
110: length number to print
111: is_subject TRUE if printing from within md->start_subject
112: md pointer to matching data block, if is_subject is TRUE
113:
114: Returns: nothing
115: */
116:
117: static void
118: pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
119: {
120: unsigned int c;
121: if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
122: while (length-- > 0)
123: if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
124: }
125: #endif
126:
127:
128:
129: /*************************************************
130: * Match a back-reference *
131: *************************************************/
132:
133: /* If a back reference hasn't been set, the length that is passed is greater
134: than the number of characters left in the string, so the match fails.
135:
136: Arguments:
137: offset index into the offset vector
138: eptr points into the subject
139: length length to be matched
140: md points to match data block
141: ims the ims flags
142:
143: Returns: TRUE if matched
144: */
145:
146: static BOOL
147: match_ref(int offset, register USPTR eptr, int length, match_data *md,
148: unsigned long int ims)
149: {
150: USPTR p = md->start_subject + md->offset_vector[offset];
151:
152: #ifdef PCRE_DEBUG
153: if (eptr >= md->end_subject)
154: printf("matching subject <null>");
155: else
156: {
157: printf("matching subject ");
158: pchars(eptr, length, TRUE, md);
159: }
160: printf(" against backref ");
161: pchars(p, length, FALSE, md);
162: printf("\n");
163: #endif
164:
165: /* Always fail if not enough characters left */
166:
167: if (length > md->end_subject - eptr) return FALSE;
168:
169: /* Separate the caseless case for speed. In UTF-8 mode we can only do this
170: properly if Unicode properties are supported. Otherwise, we can check only
171: ASCII characters. */
172:
173: if ((ims & PCRE_CASELESS) != 0)
174: {
175: #ifdef SUPPORT_UTF8
176: #ifdef SUPPORT_UCP
177: if (md->utf8)
178: {
179: USPTR endptr = eptr + length;
180: while (eptr < endptr)
181: {
182: int c, d;
183: GETCHARINC(c, eptr);
184: GETCHARINC(d, p);
185: if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
186: }
187: }
188: else
189: #endif
190: #endif
191:
192: /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
193: is no UCP support. */
194:
195: while (length-- > 0)
196: { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
197: }
198:
199: /* In the caseful case, we can just compare the bytes, whether or not we
200: are in UTF-8 mode. */
201:
202: else
203: { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
204:
205: return TRUE;
206: }
207:
208:
209:
210: /***************************************************************************
211: ****************************************************************************
212: RECURSION IN THE match() FUNCTION
213:
214: The match() function is highly recursive, though not every recursive call
215: increases the recursive depth. Nevertheless, some regular expressions can cause
216: it to recurse to a great depth. I was writing for Unix, so I just let it call
217: itself recursively. This uses the stack for saving everything that has to be
218: saved for a recursive call. On Unix, the stack can be large, and this works
219: fine.
220:
221: It turns out that on some non-Unix-like systems there are problems with
222: programs that use a lot of stack. (This despite the fact that every last chip
223: has oodles of memory these days, and techniques for extending the stack have
224: been known for decades.) So....
225:
226: There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
227: calls by keeping local variables that need to be preserved in blocks of memory
228: obtained from malloc() instead of on the stack. Macros are used to
229: achieve this so that the actual code doesn't look very different to what it
230: always used to.
231:
232: The original heap-recursive code used longjmp(). However, it seems that this
233: can be very slow on some operating systems. Following a suggestion from Stan
234: Switzer, the use of longjmp() has been abolished, at the cost of having to
235: provide a unique number for each call to RMATCH. There is no way of generating
236: a sequence of numbers at compile time in C. I have given them names, to make
237: them stand out more clearly.
238:
239: Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
240: FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
241: tests. Furthermore, not using longjmp() means that local dynamic variables
242: don't have indeterminate values; this has meant that the frame size can be
243: reduced because the result can be "passed back" by straight setting of the
244: variable instead of being passed in the frame.
245: ****************************************************************************
246: ***************************************************************************/
247:
248: /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
249: below must be updated in sync. */
250:
251: enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
252: RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
253: RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
254: RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
255: RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
256: RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
257: RM61, RM62 };
258:
259: /* These versions of the macros use the stack, as normal. There are debugging
260: versions and production versions. Note that the "rw" argument of RMATCH isn't
261: actually used in this definition. */
262:
263: #ifndef NO_RECURSE
264: #define REGISTER register
265:
266: #ifdef PCRE_DEBUG
267: #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
268: { \
269: printf("match() called in line %d\n", __LINE__); \
270: rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1); \
271: printf("to line %d\n", __LINE__); \
272: }
273: #define RRETURN(ra) \
274: { \
275: printf("match() returned %d from line %d ", ra, __LINE__); \
276: return ra; \
277: }
278: #else
279: #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
280: rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1)
281: #define RRETURN(ra) return ra
282: #endif
283:
284: #else
285:
286:
287: /* These versions of the macros manage a private stack on the heap. Note that
288: the "rd" argument of RMATCH isn't actually used in this definition. It's the md
289: argument of match(), which never changes. */
290:
291: #define REGISTER
292:
293: #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
294: {\
295: heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
296: if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
297: frame->Xwhere = rw; \
298: newframe->Xeptr = ra;\
299: newframe->Xecode = rb;\
300: newframe->Xmstart = mstart;\
301: newframe->Xmarkptr = markptr;\
302: newframe->Xoffset_top = rc;\
303: newframe->Xims = re;\
304: newframe->Xeptrb = rf;\
305: newframe->Xflags = rg;\
306: newframe->Xrdepth = frame->Xrdepth + 1;\
307: newframe->Xprevframe = frame;\
308: frame = newframe;\
309: DPRINTF(("restarting from line %d\n", __LINE__));\
310: goto HEAP_RECURSE;\
311: L_##rw:\
312: DPRINTF(("jumped back to line %d\n", __LINE__));\
313: }
314:
315: #define RRETURN(ra)\
316: {\
317: heapframe *oldframe = frame;\
318: frame = oldframe->Xprevframe;\
319: (pcre_stack_free)(oldframe);\
320: if (frame != NULL)\
321: {\
322: rrc = ra;\
323: goto HEAP_RETURN;\
324: }\
325: return ra;\
326: }
327:
328:
329: /* Structure for remembering the local variables in a private frame */
330:
331: typedef struct heapframe {
332: struct heapframe *Xprevframe;
333:
334: /* Function arguments that may change */
335:
336: USPTR Xeptr;
337: const uschar *Xecode;
338: USPTR Xmstart;
339: USPTR Xmarkptr;
340: int Xoffset_top;
341: long int Xims;
342: eptrblock *Xeptrb;
343: int Xflags;
344: unsigned int Xrdepth;
345:
346: /* Function local variables */
347:
348: USPTR Xcallpat;
349: #ifdef SUPPORT_UTF8
350: USPTR Xcharptr;
351: #endif
352: USPTR Xdata;
353: USPTR Xnext;
354: USPTR Xpp;
355: USPTR Xprev;
356: USPTR Xsaved_eptr;
357:
358: recursion_info Xnew_recursive;
359:
360: BOOL Xcur_is_word;
361: BOOL Xcondition;
362: BOOL Xprev_is_word;
363:
364: unsigned long int Xoriginal_ims;
365:
366: #ifdef SUPPORT_UCP
367: int Xprop_type;
368: int Xprop_value;
369: int Xprop_fail_result;
370: int Xprop_category;
371: int Xprop_chartype;
372: int Xprop_script;
373: int Xoclength;
374: uschar Xocchars[8];
375: #endif
376:
377: int Xcodelink;
378: int Xctype;
379: unsigned int Xfc;
380: int Xfi;
381: int Xlength;
382: int Xmax;
383: int Xmin;
384: int Xnumber;
385: int Xoffset;
386: int Xop;
387: int Xsave_capture_last;
388: int Xsave_offset1, Xsave_offset2, Xsave_offset3;
389: int Xstacksave[REC_STACK_SAVE_MAX];
390:
391: eptrblock Xnewptrb;
392:
393: /* Where to jump back to */
394:
395: int Xwhere;
396:
397: } heapframe;
398:
399: #endif
400:
401:
402: /***************************************************************************
403: ***************************************************************************/
404:
405:
406:
407: /*************************************************
408: * Match from current position *
409: *************************************************/
410:
411: /* This function is called recursively in many circumstances. Whenever it
412: returns a negative (error) response, the outer incarnation must also return the
413: same response. */
414:
415: /* These macros pack up tests that are used for partial matching, and which
416: appear several times in the code. CHECK_PARTIAL() sets the "hit end" flag (md->hitend) if partial matching
417: is enabled, the pointer is at or beyond the end of the subject, and it is also past md->start_used_ptr (i.e.
418: something has been matched). For hard partial matching (md->partial > 1), we then return
419: PCRE_ERROR_PARTIAL immediately. SCHECK_PARTIAL() omits the end-of-subject test; it is used when we already
420: know we are past the end of the subject. */
421:
422: #define CHECK_PARTIAL()\
423: if (md->partial != 0 && eptr >= md->end_subject && \
424: eptr > md->start_used_ptr) \
425: { \
426: md->hitend = TRUE; \
427: if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
428: }
429:
430: #define SCHECK_PARTIAL()\
431: if (md->partial != 0 && eptr > md->start_used_ptr) \
432: { \
433: md->hitend = TRUE; \
434: if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
435: }
436:
437:
438: /* Performance note: It might be tempting to extract commonly used fields from
439: the md structure (e.g. utf8, end_subject) into individual variables to improve
440: performance. Tests using gcc on a SPARC disproved this; in the first case, it
441: made performance worse.
442:
443: Arguments:
444: eptr pointer to current character in subject
445: ecode pointer to current position in compiled code
446: mstart pointer to the current match start position (can be modified
447: by encountering \K)
448: markptr pointer to the most recent MARK name, or NULL
449: offset_top current top pointer
450: md pointer to "static" info for the match
451: ims current /i, /m, and /s options
452: eptrb pointer to chain of blocks containing eptr at start of
453: brackets - for testing for empty matches
454: flags can contain
455: match_condassert - this is an assertion condition
456: match_cbegroup - this is the start of an unlimited repeat
457: group that can match an empty string
458: rdepth the recursion depth
459:
460: Returns: MATCH_MATCH if matched ) these values are >= 0
461: MATCH_NOMATCH if failed to match )
462: a negative MATCH_xxx value for PRUNE, SKIP, etc
463: a negative PCRE_ERROR_xxx value if aborted by an error condition
464: (e.g. stopped by repeated call or recursion limit)
465: */
466:
467: static int
468: match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
469: const uschar *markptr, int offset_top, match_data *md, unsigned long int ims,
470: eptrblock *eptrb, int flags, unsigned int rdepth)
471: {
472: /* These variables do not need to be preserved over recursion in this function,
473: so they can be ordinary variables in all cases. Mark some of them with
474: "register" because they are used a lot in loops. */
475:
476: register int rrc; /* Returns from recursive calls */
477: register int i; /* Used for loops not involving calls to RMATCH() */
478: register unsigned int c; /* Character values not kept over RMATCH() calls */
479: register BOOL utf8; /* Local copy of UTF-8 flag for speed */
480:
481: BOOL minimize, possessive; /* Quantifier options */
482: int condcode;
483:
484: /* When recursion is not being used, all "local" variables that have to be
485: preserved over calls to RMATCH() are part of a "frame" which is obtained from
486: heap storage. Set up the top-level frame here; others are obtained from the
487: heap whenever RMATCH() does a "recursion". See the macro definitions above. */
488:
489: #ifdef NO_RECURSE
490: heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
491: if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
492: frame->Xprevframe = NULL; /* Marks the top level */
493:
494: /* Copy in the original argument variables */
495:
496: frame->Xeptr = eptr;
497: frame->Xecode = ecode;
498: frame->Xmstart = mstart;
499: frame->Xmarkptr = markptr;
500: frame->Xoffset_top = offset_top;
501: frame->Xims = ims;
502: frame->Xeptrb = eptrb;
503: frame->Xflags = flags;
504: frame->Xrdepth = rdepth;
505:
506: /* This is where control jumps back to to effect "recursion" */
507:
508: HEAP_RECURSE:
509:
510: /* Macros make the argument variables come from the current frame */
511:
512: #define eptr frame->Xeptr
513: #define ecode frame->Xecode
514: #define mstart frame->Xmstart
515: #define markptr frame->Xmarkptr
516: #define offset_top frame->Xoffset_top
517: #define ims frame->Xims
518: #define eptrb frame->Xeptrb
519: #define flags frame->Xflags
520: #define rdepth frame->Xrdepth
521:
522: /* Ditto for the local variables */
523:
524: #ifdef SUPPORT_UTF8
525: #define charptr frame->Xcharptr
526: #endif
527: #define callpat frame->Xcallpat
528: #define codelink frame->Xcodelink
529: #define data frame->Xdata
530: #define next frame->Xnext
531: #define pp frame->Xpp
532: #define prev frame->Xprev
533: #define saved_eptr frame->Xsaved_eptr
534:
535: #define new_recursive frame->Xnew_recursive
536:
537: #define cur_is_word frame->Xcur_is_word
538: #define condition frame->Xcondition
539: #define prev_is_word frame->Xprev_is_word
540:
541: #define original_ims frame->Xoriginal_ims
542:
543: #ifdef SUPPORT_UCP
544: #define prop_type frame->Xprop_type
545: #define prop_value frame->Xprop_value
546: #define prop_fail_result frame->Xprop_fail_result
547: #define prop_category frame->Xprop_category
548: #define prop_chartype frame->Xprop_chartype
549: #define prop_script frame->Xprop_script
550: #define oclength frame->Xoclength
551: #define occhars frame->Xocchars
552: #endif
553:
554: #define ctype frame->Xctype
555: #define fc frame->Xfc
556: #define fi frame->Xfi
557: #define length frame->Xlength
558: #define max frame->Xmax
559: #define min frame->Xmin
560: #define number frame->Xnumber
561: #define offset frame->Xoffset
562: #define op frame->Xop
563: #define save_capture_last frame->Xsave_capture_last
564: #define save_offset1 frame->Xsave_offset1
565: #define save_offset2 frame->Xsave_offset2
566: #define save_offset3 frame->Xsave_offset3
567: #define stacksave frame->Xstacksave
568:
569: #define newptrb frame->Xnewptrb
570:
571: /* When recursion is being used, local variables are allocated on the stack and
572: get preserved during recursion in the normal way. In this environment, fi and
573: i, and fc and c, can be the same variables. */
574:
575: #else /* NO_RECURSE not defined */
576: #define fi i
577: #define fc c
578:
579:
580: #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
581: const uschar *charptr; /* in small blocks of the code. My normal */
582: #endif /* style of coding would have declared */
583: const uschar *callpat; /* them within each of those blocks. */
584: const uschar *data; /* However, in order to accommodate the */
585: const uschar *next; /* version of this code that uses an */
586: USPTR pp; /* external "stack" implemented on the */
587: const uschar *prev; /* heap, it is easier to declare them all */
588: USPTR saved_eptr; /* here, so the declarations can be cut */
589: /* out in a block. The only declarations */
590: recursion_info new_recursive; /* within blocks below are for variables */
591: /* that do not have to be preserved over */
592: BOOL cur_is_word; /* a recursive call to RMATCH(). */
593: BOOL condition;
594: BOOL prev_is_word;
595:
596: unsigned long int original_ims;
597:
598: #ifdef SUPPORT_UCP
599: int prop_type;
600: int prop_value;
601: int prop_fail_result;
602: int prop_category;
603: int prop_chartype;
604: int prop_script;
605: int oclength;
606: uschar occhars[8];
607: #endif
608:
609: int codelink;
610: int ctype;
611: int length;
612: int max;
613: int min;
614: int number;
615: int offset;
616: int op;
617: int save_capture_last;
618: int save_offset1, save_offset2, save_offset3;
619: int stacksave[REC_STACK_SAVE_MAX];
620:
621: eptrblock newptrb;
622: #endif /* NO_RECURSE */
623:
624: /* These statements are here to stop the compiler complaining about uninitialized
625: variables. */
626:
627: #ifdef SUPPORT_UCP
628: prop_value = 0;
629: prop_fail_result = 0;
630: #endif
631:
632:
633: /* This label is used for tail recursion, which is used in a few cases even
634: when NO_RECURSE is not defined, in order to reduce the amount of stack that is
635: used. Thanks to Ian Taylor for noticing this possibility and sending the
636: original patch. */
637:
638: TAIL_RECURSE:
639:
640: /* OK, now we can get on with the real code of the function. Recursive calls
641: are specified by the macro RMATCH and RRETURN is used to return. When
642: NO_RECURSE is *not* defined, these just turn into a recursive call to match()
643: and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
644: defined). However, RMATCH isn't like a function call because it's quite a
645: complicated macro. It has to be used in one particular way. This shouldn't,
646: however, impact performance when true recursion is being used. */
647:
648: #ifdef SUPPORT_UTF8
649: utf8 = md->utf8; /* Local copy of the flag */
650: #else
651: utf8 = FALSE;
652: #endif
653:
654: /* First check that we haven't called match() too many times, or that we
655: haven't exceeded the recursive call limit. */
656:
657: if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
658: if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
659:
660: original_ims = ims; /* Save for resetting on ')' */
661:
662: /* At the start of a group with an unlimited repeat that may match an empty
663: string, the match_cbegroup flag is set. When this is the case, add the current
664: subject pointer to the chain of such remembered pointers, to be checked when we
665: hit the closing ket, in order to break infinite loops that match no characters.
666: When match() is called in other circumstances, don't add to the chain. The
667: match_cbegroup flag must NOT be used with tail recursion, because the memory
668: block that is used is on the stack, so a new one may be required for each
669: match(). */
670:
671: if ((flags & match_cbegroup) != 0)
672: {
673: newptrb.epb_saved_eptr = eptr;
674: newptrb.epb_prev = eptrb;
675: eptrb = &newptrb;
676: }
677:
678: /* Now start processing the opcodes. */
679:
680: for (;;)
681: {
682: minimize = possessive = FALSE;
683: op = *ecode;
684:
685: switch(op)
686: {
687: case OP_MARK:
688: markptr = ecode + 2;
689: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
690: ims, eptrb, flags, RM55);
691:
692: /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
693: argument, and we must check whether that argument matches this MARK's
694: argument. It is passed back in md->start_match_ptr (an overloading of that
695: variable). If it does match, we reset that variable to the current subject
696: position and return MATCH_SKIP. Otherwise, pass back the return code
697: unaltered. */
698:
699: if (rrc == MATCH_SKIP_ARG &&
700: strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
701: {
702: md->start_match_ptr = eptr;
703: RRETURN(MATCH_SKIP);
704: }
705:
706: if (md->mark == NULL) md->mark = markptr;
707: RRETURN(rrc);
708:
709: case OP_FAIL:
710: MRRETURN(MATCH_NOMATCH);
711:
712: /* COMMIT overrides PRUNE, SKIP, and THEN */
713:
714: case OP_COMMIT:
715: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
716: ims, eptrb, flags, RM52);
717: if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
718: rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
719: rrc != MATCH_THEN)
720: RRETURN(rrc);
721: MRRETURN(MATCH_COMMIT);
722:
723: /* PRUNE overrides THEN */
724:
725: case OP_PRUNE:
726: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
727: ims, eptrb, flags, RM51);
728: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
729: MRRETURN(MATCH_PRUNE);
730:
731: case OP_PRUNE_ARG:
732: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
733: ims, eptrb, flags, RM56);
734: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
735: md->mark = ecode + 2;
736: RRETURN(MATCH_PRUNE);
737:
738: /* SKIP overrides PRUNE and THEN */
739:
740: case OP_SKIP:
741: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
742: ims, eptrb, flags, RM53);
743: if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
744: RRETURN(rrc);
745: md->start_match_ptr = eptr; /* Pass back current position */
746: MRRETURN(MATCH_SKIP);
747:
748: case OP_SKIP_ARG:
749: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
750: ims, eptrb, flags, RM57);
751: if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
752: RRETURN(rrc);
753:
754: /* Pass back the current skip name by overloading md->start_match_ptr and
755: returning the special MATCH_SKIP_ARG return code. This will either be
756: caught by a matching MARK, or get to the top, where it is treated the same
757: as PRUNE. */
758:
759: md->start_match_ptr = ecode + 2;
760: RRETURN(MATCH_SKIP_ARG);
761:
762: /* For THEN (and THEN_ARG) we pass back the address of the bracket or
763: the alt that is at the start of the current branch. This makes it possible
764: to skip back past alternatives that precede the THEN within the current
765: branch. */
766:
767: case OP_THEN:
768: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
769: ims, eptrb, flags, RM54);
770: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
771: md->start_match_ptr = ecode - GET(ecode, 1);
772: MRRETURN(MATCH_THEN);
773:
774: case OP_THEN_ARG:
775: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
776: offset_top, md, ims, eptrb, flags, RM58);
777: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
778: md->start_match_ptr = ecode - GET(ecode, 1);
779: md->mark = ecode + LINK_SIZE + 2;
780: RRETURN(MATCH_THEN);
781:
782: /* Handle a capturing bracket. If there is space in the offset vector, save
783: the current subject position in the working slot at the top of the vector.
784: We mustn't change the current values of the data slot, because they may be
785: set from a previous iteration of this group, and be referred to by a
786: reference inside the group.
787:
788: If the bracket fails to match, we need to restore this value and also the
789: values of the final offsets, in case they were set by a previous iteration
790: of the same bracket.
791:
792: If there isn't enough space in the offset vector, treat this as if it were
793: a non-capturing bracket. Don't worry about setting the flag for the error
794: case here; that is handled in the code for KET. */
795:
796: case OP_CBRA:
797: case OP_SCBRA:
798: number = GET2(ecode, 1+LINK_SIZE);
799: offset = number << 1;
800:
801: #ifdef PCRE_DEBUG
802: printf("start bracket %d\n", number);
803: printf("subject=");
804: pchars(eptr, 16, TRUE, md);
805: printf("\n");
806: #endif
807:
808: if (offset < md->offset_max)
809: {
810: save_offset1 = md->offset_vector[offset];
811: save_offset2 = md->offset_vector[offset+1];
812: save_offset3 = md->offset_vector[md->offset_end - number];
813: save_capture_last = md->capture_last;
814:
815: DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
816: md->offset_vector[md->offset_end - number] =
817: (int)(eptr - md->start_subject);
818:
819: flags = (op == OP_SCBRA)? match_cbegroup : 0;
820: do
821: {
822: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
823: ims, eptrb, flags, RM1);
824: if (rrc != MATCH_NOMATCH &&
825: (rrc != MATCH_THEN || md->start_match_ptr != ecode))
826: RRETURN(rrc);
827: md->capture_last = save_capture_last;
828: ecode += GET(ecode, 1);
829: }
830: while (*ecode == OP_ALT);
831:
832: DPRINTF(("bracket %d failed\n", number));
833:
834: md->offset_vector[offset] = save_offset1;
835: md->offset_vector[offset+1] = save_offset2;
836: md->offset_vector[md->offset_end - number] = save_offset3;
837:
838: if (rrc != MATCH_THEN) md->mark = markptr;
839: RRETURN(MATCH_NOMATCH);
840: }
841:
842: /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
843: as a non-capturing bracket. */
844:
845: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
846: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
847:
848: DPRINTF(("insufficient capture room: treat as non-capturing\n"));
849:
850: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
851: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
852:
853: /* Non-capturing bracket. Loop for all the alternatives. When we get to the
854: final alternative within the brackets, we would return the result of a
855: recursive call to match() whatever happened. We can reduce stack usage by
856: turning this into a tail recursion, except in the case when match_cbegroup
857: is set.*/
858:
859: case OP_BRA:
860: case OP_SBRA:
861: DPRINTF(("start non-capturing bracket\n"));
862: flags = (op >= OP_SBRA)? match_cbegroup : 0;
863: for (;;)
864: {
865: if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
866: {
867: if (flags == 0) /* Not a possibly empty group */
868: {
869: ecode += _pcre_OP_lengths[*ecode];
870: DPRINTF(("bracket 0 tail recursion\n"));
871: goto TAIL_RECURSE;
872: }
873:
874: /* Possibly empty group; can't use tail recursion. */
875:
876: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
877: eptrb, flags, RM48);
878: if (rrc == MATCH_NOMATCH) md->mark = markptr;
879: RRETURN(rrc);
880: }
881:
882: /* For non-final alternatives, continue the loop for a NOMATCH result;
883: otherwise return. */
884:
885: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
886: eptrb, flags, RM2);
887: if (rrc != MATCH_NOMATCH &&
888: (rrc != MATCH_THEN || md->start_match_ptr != ecode))
889: RRETURN(rrc);
890: ecode += GET(ecode, 1);
891: }
892: /* Control never reaches here. */
893:
894: /* Conditional group: compilation checked that there are no more than
895: two branches. If the condition is false, skipping the first branch takes us
896: past the end if there is only one branch, but that's OK because that is
897: exactly what going to the ket would do. As there is only one branch to be
898: obeyed, we can use tail recursion to avoid using another stack frame. */
899:
900: case OP_COND:
901: case OP_SCOND:
902: codelink= GET(ecode, 1);
903:
904: /* Because of the way auto-callout works during compile, a callout item is
905: inserted between OP_COND and an assertion condition. */
906:
907: if (ecode[LINK_SIZE+1] == OP_CALLOUT)
908: {
909: if (pcre_callout != NULL)
910: {
911: pcre_callout_block cb;
912: cb.version = 1; /* Version 1 of the callout block */
913: cb.callout_number = ecode[LINK_SIZE+2];
914: cb.offset_vector = md->offset_vector;
915: cb.subject = (PCRE_SPTR)md->start_subject;
916: cb.subject_length = (int)(md->end_subject - md->start_subject);
917: cb.start_match = (int)(mstart - md->start_subject);
918: cb.current_position = (int)(eptr - md->start_subject);
919: cb.pattern_position = GET(ecode, LINK_SIZE + 3);
920: cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
921: cb.capture_top = offset_top/2;
922: cb.capture_last = md->capture_last;
923: cb.callout_data = md->callout_data;
924: if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
925: if (rrc < 0) RRETURN(rrc);
926: }
927: ecode += _pcre_OP_lengths[OP_CALLOUT];
928: }
929:
930: condcode = ecode[LINK_SIZE+1];
931:
932: /* Now see what the actual condition is */
933:
934: if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
935: {
936: if (md->recursive == NULL) /* Not recursing => FALSE */
937: {
938: condition = FALSE;
939: ecode += GET(ecode, 1);
940: }
941: else
942: {
943: int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
944: condition = (recno == RREF_ANY || recno == md->recursive->group_num);
945:
946: /* If the test is for recursion into a specific subpattern, and it is
947: false, but the test was set up by name, scan the table to see if the
948: name refers to any other numbers, and test them. The condition is true
949: if any one is set. */
950:
951: if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
952: {
953: uschar *slotA = md->name_table;
954: for (i = 0; i < md->name_count; i++)
955: {
956: if (GET2(slotA, 0) == recno) break;
957: slotA += md->name_entry_size;
958: }
959:
960: /* Found a name for the number - there can be only one; duplicate
961: names for different numbers are allowed, but not vice versa. First
962: scan down for duplicates. */
963:
964: if (i < md->name_count)
965: {
966: uschar *slotB = slotA;
967: while (slotB > md->name_table)
968: {
969: slotB -= md->name_entry_size;
970: if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
971: {
972: condition = GET2(slotB, 0) == md->recursive->group_num;
973: if (condition) break;
974: }
975: else break;
976: }
977:
978: /* Scan up for duplicates */
979:
980: if (!condition)
981: {
982: slotB = slotA;
983: for (i++; i < md->name_count; i++)
984: {
985: slotB += md->name_entry_size;
986: if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
987: {
988: condition = GET2(slotB, 0) == md->recursive->group_num;
989: if (condition) break;
990: }
991: else break;
992: }
993: }
994: }
995: }
996:
997: /* Choose branch according to the condition */
998:
999: ecode += condition? 3 : GET(ecode, 1);
1000: }
1001: }
1002:
1003: else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1004: {
1005: offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1006: condition = offset < offset_top && md->offset_vector[offset] >= 0;
1007:
1008: /* If the numbered capture is unset, but the reference was by name,
1009: scan the table to see if the name refers to any other numbers, and test
1010: them. The condition is true if any one is set. This is tediously similar
1011: to the code above, but not close enough to try to amalgamate. */
1012:
1013: if (!condition && condcode == OP_NCREF)
1014: {
1015: int refno = offset >> 1;
1016: uschar *slotA = md->name_table;
1017:
1018: for (i = 0; i < md->name_count; i++)
1019: {
1020: if (GET2(slotA, 0) == refno) break;
1021: slotA += md->name_entry_size;
1022: }
1023:
1024: /* Found a name for the number - there can be only one; duplicate names
1025: for different numbers are allowed, but not vice versa. First scan down
1026: for duplicates. */
1027:
1028: if (i < md->name_count)
1029: {
1030: uschar *slotB = slotA;
1031: while (slotB > md->name_table)
1032: {
1033: slotB -= md->name_entry_size;
1034: if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1035: {
1036: offset = GET2(slotB, 0) << 1;
1037: condition = offset < offset_top &&
1038: md->offset_vector[offset] >= 0;
1039: if (condition) break;
1040: }
1041: else break;
1042: }
1043:
1044: /* Scan up for duplicates */
1045:
1046: if (!condition)
1047: {
1048: slotB = slotA;
1049: for (i++; i < md->name_count; i++)
1050: {
1051: slotB += md->name_entry_size;
1052: if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1053: {
1054: offset = GET2(slotB, 0) << 1;
1055: condition = offset < offset_top &&
1056: md->offset_vector[offset] >= 0;
1057: if (condition) break;
1058: }
1059: else break;
1060: }
1061: }
1062: }
1063: }
1064:
1065: /* Choose branch according to the condition */
1066:
1067: ecode += condition? 3 : GET(ecode, 1);
1068: }
1069:
1070: else if (condcode == OP_DEF) /* DEFINE - always false */
1071: {
1072: condition = FALSE;
1073: ecode += GET(ecode, 1);
1074: }
1075:
1076: /* The condition is an assertion. Call match() to evaluate it - setting
1077: the final argument match_condassert causes it to stop at the end of an
1078: assertion. */
1079:
1080: else
1081: {
1082: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
1083: match_condassert, RM3);
1084: if (rrc == MATCH_MATCH)
1085: {
1086: condition = TRUE;
1087: ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1088: while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1089: }
1090: else if (rrc != MATCH_NOMATCH &&
1091: (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1092: {
1093: RRETURN(rrc); /* Need braces because of following else */
1094: }
1095: else
1096: {
1097: condition = FALSE;
1098: ecode += codelink;
1099: }
1100: }
1101:
1102: /* We are now at the branch that is to be obeyed. As there is only one,
1103: we can use tail recursion to avoid using another stack frame, except when
1104: match_cbegroup is required for an unlimited repeat of a possibly empty
1105: group. If the second alternative doesn't exist, we can just plough on. */
1106:
1107: if (condition || *ecode == OP_ALT)
1108: {
1109: ecode += 1 + LINK_SIZE;
1110: if (op == OP_SCOND) /* Possibly empty group */
1111: {
1112: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
1113: RRETURN(rrc);
1114: }
1115: else /* Group must match something */
1116: {
1117: flags = 0;
1118: goto TAIL_RECURSE;
1119: }
1120: }
1121: else /* Condition false & no alternative */
1122: {
1123: ecode += 1 + LINK_SIZE;
1124: }
1125: break;
1126:
1127:
1128: /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1129: to close any currently open capturing brackets. */
1130:
1131: case OP_CLOSE:
1132: number = GET2(ecode, 1);
1133: offset = number << 1;
1134:
1135: #ifdef PCRE_DEBUG
1136: printf("end bracket %d at *ACCEPT", number);
1137: printf("\n");
1138: #endif
1139:
1140: md->capture_last = number;
1141: if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1142: {
1143: md->offset_vector[offset] =
1144: md->offset_vector[md->offset_end - number];
1145: md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1146: if (offset_top <= offset) offset_top = offset + 2;
1147: }
1148: ecode += 3;
1149: break;
1150:
1151:
1152: /* End of the pattern, either real or forced. If we are in a top-level
1153: recursion, we should restore the offsets appropriately and continue from
1154: after the call. */
1155:
1156: case OP_ACCEPT:
1157: case OP_END:
1158: if (md->recursive != NULL && md->recursive->group_num == 0)
1159: {
1160: recursion_info *rec = md->recursive;
1161: DPRINTF(("End of pattern in a (?0) recursion\n"));
1162: md->recursive = rec->prevrec;
1163: memmove(md->offset_vector, rec->offset_save,
1164: rec->saved_max * sizeof(int));
1165: offset_top = rec->save_offset_top;
1166: ims = original_ims;
1167: ecode = rec->after_call;
1168: break;
1169: }
1170:
1171: /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1172: set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1173: the subject. In both cases, backtracking will then try other alternatives,
1174: if any. */
1175:
1176: if (eptr == mstart &&
1177: (md->notempty ||
1178: (md->notempty_atstart &&
1179: mstart == md->start_subject + md->start_offset)))
1180: MRRETURN(MATCH_NOMATCH);
1181:
1182: /* Otherwise, we have a match. */
1183:
1184: md->end_match_ptr = eptr; /* Record where we ended */
1185: md->end_offset_top = offset_top; /* and how many extracts were taken */
1186: md->start_match_ptr = mstart; /* and the start (\K can modify) */
1187:
1188: /* For some reason, the macros don't work properly if an expression is
1189: given as the argument to MRRETURN when the heap is in use. */
1190:
1191: rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1192: MRRETURN(rrc);
1193:
1194: /* Change option settings */
1195:
1196: case OP_OPT:
1197: ims = ecode[1];
1198: ecode += 2;
1199: DPRINTF(("ims set to %02lx\n", ims));
1200: break;
1201:
1202: /* Assertion brackets. Check the alternative branches in turn - the
1203: matching won't pass the KET for an assertion. If any one branch matches,
1204: the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1205: start of each branch to move the current point backwards, so the code at
1206: this level is identical to the lookahead case. */
1207:
1208: case OP_ASSERT:
1209: case OP_ASSERTBACK:
1210: do
1211: {
1212: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1213: RM4);
1214: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1215: {
1216: mstart = md->start_match_ptr; /* In case \K reset it */
1217: break;
1218: }
1219: if (rrc != MATCH_NOMATCH &&
1220: (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1221: RRETURN(rrc);
1222: ecode += GET(ecode, 1);
1223: }
1224: while (*ecode == OP_ALT);
1225: if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1226:
1227: /* If checking an assertion for a condition, return MATCH_MATCH. */
1228:
1229: if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1230:
1231: /* Continue from after the assertion, updating the offsets high water
1232: mark, since extracts may have been taken during the assertion. */
1233:
1234: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1235: ecode += 1 + LINK_SIZE;
1236: offset_top = md->end_offset_top;
1237: continue;
1238:
1239: /* Negative assertion: all branches must fail to match. Encountering SKIP,
1240: PRUNE, or COMMIT means we must assume failure without checking subsequent
1241: branches. */
1242:
1243: case OP_ASSERT_NOT:
1244: case OP_ASSERTBACK_NOT:
1245: do
1246: {
1247: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1248: RM5);
1249: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1250: if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1251: {
1252: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1253: break;
1254: }
1255: if (rrc != MATCH_NOMATCH &&
1256: (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1257: RRETURN(rrc);
1258: ecode += GET(ecode,1);
1259: }
1260: while (*ecode == OP_ALT);
1261:
1262: if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1263:
1264: ecode += 1 + LINK_SIZE;
1265: continue;
1266:
1267: /* Move the subject pointer back. This occurs only at the start of
1268: each branch of a lookbehind assertion. If we are too close to the start to
1269: move back, this match function fails. When working with UTF-8 we move
1270: back a number of characters, not bytes. */
1271:
1272: case OP_REVERSE:
1273: #ifdef SUPPORT_UTF8
1274: if (utf8)
1275: {
1276: i = GET(ecode, 1);
1277: while (i-- > 0)
1278: {
1279: eptr--;
1280: if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1281: BACKCHAR(eptr);
1282: }
1283: }
1284: else
1285: #endif
1286:
1287: /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1288:
1289: {
1290: eptr -= GET(ecode, 1);
1291: if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1292: }
1293:
1294: /* Save the earliest consulted character, then skip to next op code */
1295:
1296: if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1297: ecode += 1 + LINK_SIZE;
1298: break;
1299:
1300: /* The callout item calls an external function, if one is provided, passing
1301: details of the match so far. This is mainly for debugging, though the
1302: function is able to force a failure. */
1303:
1304: case OP_CALLOUT:
1305: if (pcre_callout != NULL)
1306: {
1307: pcre_callout_block cb;
1308: cb.version = 1; /* Version 1 of the callout block */
1309: cb.callout_number = ecode[1];
1310: cb.offset_vector = md->offset_vector;
1311: cb.subject = (PCRE_SPTR)md->start_subject;
1312: cb.subject_length = (int)(md->end_subject - md->start_subject);
1313: cb.start_match = (int)(mstart - md->start_subject);
1314: cb.current_position = (int)(eptr - md->start_subject);
1315: cb.pattern_position = GET(ecode, 2);
1316: cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1317: cb.capture_top = offset_top/2;
1318: cb.capture_last = md->capture_last;
1319: cb.callout_data = md->callout_data;
1320: if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1321: if (rrc < 0) RRETURN(rrc);
1322: }
1323: ecode += 2 + 2*LINK_SIZE;
1324: break;
1325:
1326: /* Recursion either matches the current regex, or some subexpression. The
1327: offset data is the offset to the starting bracket from the start of the
1328: whole pattern. (This is so that it works from duplicated subpatterns.)
1329:
1330: If there are any capturing brackets started but not finished, we have to
1331: save their starting points and reinstate them after the recursion. However,
1332: we don't know how many such there are (offset_top records the completed
1333: total) so we just have to save all the potential data. There may be up to
1334: 65535 such values, which is too large to put on the stack, but using malloc
1335: for small numbers seems expensive. As a compromise, the stack is used when
1336: there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1337: is used. A problem is what to do if the malloc fails ... there is no way of
1338: returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
1339: values on the stack, and accept that the rest may be wrong.
1340:
1341: There are also other values that have to be saved. We use a chained
1342: sequence of blocks that actually live on the stack. Thanks to Robin Houston
1343: for the original version of this logic. */
1344:
1345: case OP_RECURSE:
1346: {
1347: callpat = md->start_code + GET(ecode, 1);
1348: new_recursive.group_num = (callpat == md->start_code)? 0 :
1349: GET2(callpat, 1 + LINK_SIZE);
1350:
1351: /* Add to "recursing stack" */
1352:
1353: new_recursive.prevrec = md->recursive;
1354: md->recursive = &new_recursive;
1355:
1356: /* Find where to continue from afterwards */
1357:
1358: ecode += 1 + LINK_SIZE;
1359: new_recursive.after_call = ecode;
1360:
1361: /* Now save the offset data. */
1362:
1363: new_recursive.saved_max = md->offset_end;
1364: if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1365: new_recursive.offset_save = stacksave;
1366: else
1367: {
1368: new_recursive.offset_save =
1369: (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1370: if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1371: }
1372:
1373: memcpy(new_recursive.offset_save, md->offset_vector,
1374: new_recursive.saved_max * sizeof(int));
1375: new_recursive.save_offset_top = offset_top;
1376:
1377: /* OK, now we can do the recursion. For each top-level alternative we
1378: restore the offset and recursion data. */
1379:
1380: DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1381: flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1382: do
1383: {
1384: RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1385: md, ims, eptrb, flags, RM6);
1386: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1387: {
1388: DPRINTF(("Recursion matched\n"));
1389: md->recursive = new_recursive.prevrec;
1390: if (new_recursive.offset_save != stacksave)
1391: (pcre_free)(new_recursive.offset_save);
1392: MRRETURN(MATCH_MATCH);
1393: }
1394: else if (rrc != MATCH_NOMATCH &&
1395: (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1396: {
1397: DPRINTF(("Recursion gave error %d\n", rrc));
1398: if (new_recursive.offset_save != stacksave)
1399: (pcre_free)(new_recursive.offset_save);
1400: RRETURN(rrc);
1401: }
1402:
1403: md->recursive = &new_recursive;
1404: memcpy(md->offset_vector, new_recursive.offset_save,
1405: new_recursive.saved_max * sizeof(int));
1406: callpat += GET(callpat, 1);
1407: }
1408: while (*callpat == OP_ALT);
1409:
1410: DPRINTF(("Recursion didn't match\n"));
1411: md->recursive = new_recursive.prevrec;
1412: if (new_recursive.offset_save != stacksave)
1413: (pcre_free)(new_recursive.offset_save);
1414: MRRETURN(MATCH_NOMATCH);
1415: }
1416: /* Control never reaches here */
1417:
1418: /* "Once" brackets are like assertion brackets except that after a match,
1419: the point in the subject string is not moved back. Thus there can never be
1420: a move back into the brackets. Friedl calls these "atomic" subpatterns.
1421: Check the alternative branches in turn - the matching won't pass the KET
1422: for this kind of subpattern. If any one branch matches, we carry on as at
1423: the end of a normal bracket, leaving the subject pointer, but resetting
1424: the start-of-match value in case it was changed by \K. */
1425:
1426: case OP_ONCE:
1427: prev = ecode;
1428: saved_eptr = eptr;
1429:
1430: do
1431: {
1432: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1433: if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1434: {
1435: mstart = md->start_match_ptr;
1436: break;
1437: }
1438: if (rrc != MATCH_NOMATCH &&
1439: (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1440: RRETURN(rrc);
1441: ecode += GET(ecode,1);
1442: }
1443: while (*ecode == OP_ALT);
1444:
1445: /* If we hit the end of the group (which could be repeated), fail */
1446:
1447: if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1448:
1449: /* Continue as from after the assertion, updating the offsets high water
1450: mark, since extracts may have been taken. */
1451:
1452: do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1453:
1454: offset_top = md->end_offset_top;
1455: eptr = md->end_match_ptr;
1456:
1457: /* For a non-repeating ket, just continue at this level. This also
1458: happens for a repeating ket if no characters were matched in the group.
1459: This is the forcible breaking of infinite loops as implemented in Perl
1460: 5.005. If there is an options reset, it will get obeyed in the normal
1461: course of events. */
1462:
1463: if (*ecode == OP_KET || eptr == saved_eptr)
1464: {
1465: ecode += 1+LINK_SIZE;
1466: break;
1467: }
1468:
1469: /* The repeating kets try the rest of the pattern or restart from the
1470: preceding bracket, in the appropriate order. The second "call" of match()
1471: uses tail recursion, to avoid using another stack frame. We need to reset
1472: any options that changed within the bracket before re-running it, so
1473: check the next opcode. */
1474:
1475: if (ecode[1+LINK_SIZE] == OP_OPT)
1476: {
1477: ims = (ims & ~PCRE_IMS) | ecode[4];
1478: DPRINTF(("ims set to %02lx at group repeat\n", ims));
1479: }
1480:
1481: if (*ecode == OP_KETRMIN)
1482: {
1483: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1484: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1485: ecode = prev;
1486: flags = 0;
1487: goto TAIL_RECURSE;
1488: }
1489: else /* OP_KETRMAX */
1490: {
1491: RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1492: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1493: ecode += 1 + LINK_SIZE;
1494: flags = 0;
1495: goto TAIL_RECURSE;
1496: }
1497: /* Control never gets here */
1498:
1499: /* An alternation is the end of a branch; scan along to find the end of the
1500: bracketed group and go to there. */
1501:
1502: case OP_ALT:
1503: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1504: break;
1505:
1506: /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1507: indicating that it may occur zero times. It may repeat infinitely, or not
1508: at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1509: with fixed upper repeat limits are compiled as a number of copies, with the
1510: optional ones preceded by BRAZERO or BRAMINZERO. */
1511:
1512: case OP_BRAZERO:
1513: {
1514: next = ecode+1;
1515: RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1516: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1517: do next += GET(next,1); while (*next == OP_ALT);
1518: ecode = next + 1 + LINK_SIZE;
1519: }
1520: break;
1521:
1522: case OP_BRAMINZERO:
1523: {
1524: next = ecode+1;
1525: do next += GET(next, 1); while (*next == OP_ALT);
1526: RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1527: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1528: ecode++;
1529: }
1530: break;
1531:
1532: case OP_SKIPZERO:
1533: {
1534: next = ecode+1;
1535: do next += GET(next,1); while (*next == OP_ALT);
1536: ecode = next + 1 + LINK_SIZE;
1537: }
1538: break;
1539:
1540: /* End of a group, repeated or non-repeating. */
1541:
1542: case OP_KET:
1543: case OP_KETRMIN:
1544: case OP_KETRMAX:
1545: prev = ecode - GET(ecode, 1);
1546:
1547: /* If this was a group that remembered the subject start, in order to break
1548: infinite repeats of empty string matches, retrieve the subject start from
1549: the chain. Otherwise, set it NULL. */
1550:
1551: if (*prev >= OP_SBRA)
1552: {
1553: saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1554: eptrb = eptrb->epb_prev; /* Backup to previous group */
1555: }
1556: else saved_eptr = NULL;
1557:
1558: /* If we are at the end of an assertion group or an atomic group, stop
1559: matching and return MATCH_MATCH, but record the current high water mark for
1560: use by positive assertions. We also need to record the match start in case
1561: it was changed by \K. */
1562:
1563: if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1564: *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1565: *prev == OP_ONCE)
1566: {
1567: md->end_match_ptr = eptr; /* For ONCE */
1568: md->end_offset_top = offset_top;
1569: md->start_match_ptr = mstart;
1570: MRRETURN(MATCH_MATCH);
1571: }
1572:
1573: /* For capturing groups we have to check the group number back at the start
1574: and if necessary complete handling an extraction by setting the offsets and
1575: bumping the high water mark. Note that whole-pattern recursion is coded as
1576: a recurse into group 0, so it won't be picked up here. Instead, we catch it
1577: when the OP_END is reached. Other recursion is handled here. */
1578:
1579: if (*prev == OP_CBRA || *prev == OP_SCBRA)
1580: {
1581: number = GET2(prev, 1+LINK_SIZE);
1582: offset = number << 1;
1583:
1584: #ifdef PCRE_DEBUG
1585: printf("end bracket %d", number);
1586: printf("\n");
1587: #endif
1588:
1589: md->capture_last = number;
1590: if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1591: {
1592: md->offset_vector[offset] =
1593: md->offset_vector[md->offset_end - number];
1594: md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1595: if (offset_top <= offset) offset_top = offset + 2;
1596: }
1597:
1598: /* Handle a recursively called group. Restore the offsets
1599: appropriately and continue from after the call. */
1600:
1601: if (md->recursive != NULL && md->recursive->group_num == number)
1602: {
1603: recursion_info *rec = md->recursive;
1604: DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1605: md->recursive = rec->prevrec;
1606: memcpy(md->offset_vector, rec->offset_save,
1607: rec->saved_max * sizeof(int));
1608: offset_top = rec->save_offset_top;
1609: ecode = rec->after_call;
1610: ims = original_ims;
1611: break;
1612: }
1613: }
1614:
1615: /* For both capturing and non-capturing groups, reset the value of the ims
1616: flags, in case they got changed during the group. */
1617:
1618: ims = original_ims;
1619: DPRINTF(("ims reset to %02lx\n", ims));
1620:
1621: /* For a non-repeating ket, just continue at this level. This also
1622: happens for a repeating ket if no characters were matched in the group.
1623: This is the forcible breaking of infinite loops as implemented in Perl
1624: 5.005. If there is an options reset, it will get obeyed in the normal
1625: course of events. */
1626:
1627: if (*ecode == OP_KET || eptr == saved_eptr)
1628: {
1629: ecode += 1 + LINK_SIZE;
1630: break;
1631: }
1632:
1633: /* The repeating kets try the rest of the pattern or restart from the
1634: preceding bracket, in the appropriate order. In the second case, we can use
1635: tail recursion to avoid using another stack frame, unless we have an
1636: unlimited repeat of a group that can match an empty string. */
1637:
1638: flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1639:
1640: if (*ecode == OP_KETRMIN)
1641: {
1642: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1643: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1644: if (flags != 0) /* Could match an empty string */
1645: {
1646: RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1647: RRETURN(rrc);
1648: }
1649: ecode = prev;
1650: goto TAIL_RECURSE;
1651: }
1652: else /* OP_KETRMAX */
1653: {
1654: RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1655: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1656: ecode += 1 + LINK_SIZE;
1657: flags = 0;
1658: goto TAIL_RECURSE;
1659: }
1660: /* Control never gets here */
1661:
1662: /* Start of subject unless notbol, or after internal newline if multiline */
1663:
1664: case OP_CIRC:
1665: if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1666: if ((ims & PCRE_MULTILINE) != 0)
1667: {
1668: if (eptr != md->start_subject &&
1669: (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1670: MRRETURN(MATCH_NOMATCH);
1671: ecode++;
1672: break;
1673: }
1674: /* ... else fall through */
1675:
1676: /* Start of subject assertion */
1677:
1678: case OP_SOD:
1679: if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1680: ecode++;
1681: break;
1682:
1683: /* Start of match assertion */
1684:
1685: case OP_SOM:
1686: if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1687: ecode++;
1688: break;
1689:
1690: /* Reset the start of match point */
1691:
1692: case OP_SET_SOM:
1693: mstart = eptr;
1694: ecode++;
1695: break;
1696:
1697: /* Assert before internal newline if multiline, or before a terminating
1698: newline unless endonly is set, else end of subject unless noteol is set. */
1699:
1700: case OP_DOLL:
1701: if ((ims & PCRE_MULTILINE) != 0)
1702: {
1703: if (eptr < md->end_subject)
1704: { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1705: else
1706: {
1707: if (md->noteol) MRRETURN(MATCH_NOMATCH);
1708: SCHECK_PARTIAL();
1709: }
1710: ecode++;
1711: break;
1712: }
1713: else /* Not multiline */
1714: {
1715: if (md->noteol) MRRETURN(MATCH_NOMATCH);
1716: if (!md->endonly) goto ASSERT_NL_OR_EOS;
1717: }
1718:
1719: /* ... else fall through for endonly */
1720:
1721: /* End of subject assertion (\z) */
1722:
1723: case OP_EOD:
1724: if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1725: SCHECK_PARTIAL();
1726: ecode++;
1727: break;
1728:
1729: /* End of subject or ending \n assertion (\Z) */
1730:
1731: case OP_EODN:
1732: ASSERT_NL_OR_EOS:
1733: if (eptr < md->end_subject &&
1734: (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1735: MRRETURN(MATCH_NOMATCH);
1736:
1737: /* Either at end of string or \n before end. */
1738:
1739: SCHECK_PARTIAL();
1740: ecode++;
1741: break;
1742:
1743: /* Word boundary assertions */
1744:
1745: case OP_NOT_WORD_BOUNDARY:
1746: case OP_WORD_BOUNDARY:
1747: {
1748:
1749: /* Find out if the previous and current characters are "word" characters.
1750: It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1751: be "non-word" characters. Remember the earliest consulted character for
1752: partial matching. */
1753:
1754: #ifdef SUPPORT_UTF8
1755: if (utf8)
1756: {
1757: /* Get status of previous character */
1758:
1759: if (eptr == md->start_subject) prev_is_word = FALSE; else
1760: {
1761: USPTR lastptr = eptr - 1;
1762: while((*lastptr & 0xc0) == 0x80) lastptr--;
1763: if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1764: GETCHAR(c, lastptr);
1765: #ifdef SUPPORT_UCP
1766: if (md->use_ucp)
1767: {
1768: if (c == '_') prev_is_word = TRUE; else
1769: {
1770: int cat = UCD_CATEGORY(c);
1771: prev_is_word = (cat == ucp_L || cat == ucp_N);
1772: }
1773: }
1774: else
1775: #endif
1776: prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1777: }
1778:
1779: /* Get status of next character */
1780:
1781: if (eptr >= md->end_subject)
1782: {
1783: SCHECK_PARTIAL();
1784: cur_is_word = FALSE;
1785: }
1786: else
1787: {
1788: GETCHAR(c, eptr);
1789: #ifdef SUPPORT_UCP
1790: if (md->use_ucp)
1791: {
1792: if (c == '_') cur_is_word = TRUE; else
1793: {
1794: int cat = UCD_CATEGORY(c);
1795: cur_is_word = (cat == ucp_L || cat == ucp_N);
1796: }
1797: }
1798: else
1799: #endif
1800: cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1801: }
1802: }
1803: else
1804: #endif
1805:
1806: /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1807: consistency with the behaviour of \w we do use it in this case. */
1808:
1809: {
1810: /* Get status of previous character */
1811:
1812: if (eptr == md->start_subject) prev_is_word = FALSE; else
1813: {
1814: if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1815: #ifdef SUPPORT_UCP
1816: if (md->use_ucp)
1817: {
1818: c = eptr[-1];
1819: if (c == '_') prev_is_word = TRUE; else
1820: {
1821: int cat = UCD_CATEGORY(c);
1822: prev_is_word = (cat == ucp_L || cat == ucp_N);
1823: }
1824: }
1825: else
1826: #endif
1827: prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1828: }
1829:
1830: /* Get status of next character */
1831:
1832: if (eptr >= md->end_subject)
1833: {
1834: SCHECK_PARTIAL();
1835: cur_is_word = FALSE;
1836: }
1837: else
1838: #ifdef SUPPORT_UCP
1839: if (md->use_ucp)
1840: {
1841: c = *eptr;
1842: if (c == '_') cur_is_word = TRUE; else
1843: {
1844: int cat = UCD_CATEGORY(c);
1845: cur_is_word = (cat == ucp_L || cat == ucp_N);
1846: }
1847: }
1848: else
1849: #endif
1850: cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1851: }
1852:
1853: /* Now see if the situation is what we want */
1854:
1855: if ((*ecode++ == OP_WORD_BOUNDARY)?
1856: cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1857: MRRETURN(MATCH_NOMATCH);
1858: }
1859: break;
1860:
1861: /* Match a single character type; inline for speed */
1862:
1863: case OP_ANY:
1864: if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
1865: /* Fall through */
1866:
1867: case OP_ALLANY:
1868: if (eptr++ >= md->end_subject)
1869: {
1870: SCHECK_PARTIAL();
1871: MRRETURN(MATCH_NOMATCH);
1872: }
1873: if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1874: ecode++;
1875: break;
1876:
1877: /* Match a single byte, even in UTF-8 mode. This opcode really does match
1878: any byte, even newline, independent of the setting of PCRE_DOTALL. */
1879:
1880: case OP_ANYBYTE:
1881: if (eptr++ >= md->end_subject)
1882: {
1883: SCHECK_PARTIAL();
1884: MRRETURN(MATCH_NOMATCH);
1885: }
1886: ecode++;
1887: break;
1888:
1889: case OP_NOT_DIGIT:
1890: if (eptr >= md->end_subject)
1891: {
1892: SCHECK_PARTIAL();
1893: MRRETURN(MATCH_NOMATCH);
1894: }
1895: GETCHARINCTEST(c, eptr);
1896: if (
1897: #ifdef SUPPORT_UTF8
1898: c < 256 &&
1899: #endif
1900: (md->ctypes[c] & ctype_digit) != 0
1901: )
1902: MRRETURN(MATCH_NOMATCH);
1903: ecode++;
1904: break;
1905:
1906: case OP_DIGIT:
1907: if (eptr >= md->end_subject)
1908: {
1909: SCHECK_PARTIAL();
1910: MRRETURN(MATCH_NOMATCH);
1911: }
1912: GETCHARINCTEST(c, eptr);
1913: if (
1914: #ifdef SUPPORT_UTF8
1915: c >= 256 ||
1916: #endif
1917: (md->ctypes[c] & ctype_digit) == 0
1918: )
1919: MRRETURN(MATCH_NOMATCH);
1920: ecode++;
1921: break;
1922:
1923: case OP_NOT_WHITESPACE:
1924: if (eptr >= md->end_subject)
1925: {
1926: SCHECK_PARTIAL();
1927: MRRETURN(MATCH_NOMATCH);
1928: }
1929: GETCHARINCTEST(c, eptr);
1930: if (
1931: #ifdef SUPPORT_UTF8
1932: c < 256 &&
1933: #endif
1934: (md->ctypes[c] & ctype_space) != 0
1935: )
1936: MRRETURN(MATCH_NOMATCH);
1937: ecode++;
1938: break;
1939:
1940: case OP_WHITESPACE:
1941: if (eptr >= md->end_subject)
1942: {
1943: SCHECK_PARTIAL();
1944: MRRETURN(MATCH_NOMATCH);
1945: }
1946: GETCHARINCTEST(c, eptr);
1947: if (
1948: #ifdef SUPPORT_UTF8
1949: c >= 256 ||
1950: #endif
1951: (md->ctypes[c] & ctype_space) == 0
1952: )
1953: MRRETURN(MATCH_NOMATCH);
1954: ecode++;
1955: break;
1956:
1957: case OP_NOT_WORDCHAR:
1958: if (eptr >= md->end_subject)
1959: {
1960: SCHECK_PARTIAL();
1961: MRRETURN(MATCH_NOMATCH);
1962: }
1963: GETCHARINCTEST(c, eptr);
1964: if (
1965: #ifdef SUPPORT_UTF8
1966: c < 256 &&
1967: #endif
1968: (md->ctypes[c] & ctype_word) != 0
1969: )
1970: MRRETURN(MATCH_NOMATCH);
1971: ecode++;
1972: break;
1973:
1974: case OP_WORDCHAR:
1975: if (eptr >= md->end_subject)
1976: {
1977: SCHECK_PARTIAL();
1978: MRRETURN(MATCH_NOMATCH);
1979: }
1980: GETCHARINCTEST(c, eptr);
1981: if (
1982: #ifdef SUPPORT_UTF8
1983: c >= 256 ||
1984: #endif
1985: (md->ctypes[c] & ctype_word) == 0
1986: )
1987: MRRETURN(MATCH_NOMATCH);
1988: ecode++;
1989: break;
1990:
1991: case OP_ANYNL:
1992: if (eptr >= md->end_subject)
1993: {
1994: SCHECK_PARTIAL();
1995: MRRETURN(MATCH_NOMATCH);
1996: }
1997: GETCHARINCTEST(c, eptr);
1998: switch(c)
1999: {
2000: default: MRRETURN(MATCH_NOMATCH);
2001: case 0x000d:
2002: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2003: break;
2004:
2005: case 0x000a:
2006: break;
2007:
2008: case 0x000b:
2009: case 0x000c:
2010: case 0x0085:
2011: case 0x2028:
2012: case 0x2029:
2013: if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2014: break;
2015: }
2016: ecode++;
2017: break;
2018:
2019: case OP_NOT_HSPACE:
2020: if (eptr >= md->end_subject)
2021: {
2022: SCHECK_PARTIAL();
2023: MRRETURN(MATCH_NOMATCH);
2024: }
2025: GETCHARINCTEST(c, eptr);
2026: switch(c)
2027: {
2028: default: break;
2029: case 0x09: /* HT */
2030: case 0x20: /* SPACE */
2031: case 0xa0: /* NBSP */
2032: case 0x1680: /* OGHAM SPACE MARK */
2033: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2034: case 0x2000: /* EN QUAD */
2035: case 0x2001: /* EM QUAD */
2036: case 0x2002: /* EN SPACE */
2037: case 0x2003: /* EM SPACE */
2038: case 0x2004: /* THREE-PER-EM SPACE */
2039: case 0x2005: /* FOUR-PER-EM SPACE */
2040: case 0x2006: /* SIX-PER-EM SPACE */
2041: case 0x2007: /* FIGURE SPACE */
2042: case 0x2008: /* PUNCTUATION SPACE */
2043: case 0x2009: /* THIN SPACE */
2044: case 0x200A: /* HAIR SPACE */
2045: case 0x202f: /* NARROW NO-BREAK SPACE */
2046: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2047: case 0x3000: /* IDEOGRAPHIC SPACE */
2048: MRRETURN(MATCH_NOMATCH);
2049: }
2050: ecode++;
2051: break;
2052:
2053: case OP_HSPACE:
2054: if (eptr >= md->end_subject)
2055: {
2056: SCHECK_PARTIAL();
2057: MRRETURN(MATCH_NOMATCH);
2058: }
2059: GETCHARINCTEST(c, eptr);
2060: switch(c)
2061: {
2062: default: MRRETURN(MATCH_NOMATCH);
2063: case 0x09: /* HT */
2064: case 0x20: /* SPACE */
2065: case 0xa0: /* NBSP */
2066: case 0x1680: /* OGHAM SPACE MARK */
2067: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2068: case 0x2000: /* EN QUAD */
2069: case 0x2001: /* EM QUAD */
2070: case 0x2002: /* EN SPACE */
2071: case 0x2003: /* EM SPACE */
2072: case 0x2004: /* THREE-PER-EM SPACE */
2073: case 0x2005: /* FOUR-PER-EM SPACE */
2074: case 0x2006: /* SIX-PER-EM SPACE */
2075: case 0x2007: /* FIGURE SPACE */
2076: case 0x2008: /* PUNCTUATION SPACE */
2077: case 0x2009: /* THIN SPACE */
2078: case 0x200A: /* HAIR SPACE */
2079: case 0x202f: /* NARROW NO-BREAK SPACE */
2080: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2081: case 0x3000: /* IDEOGRAPHIC SPACE */
2082: break;
2083: }
2084: ecode++;
2085: break;
2086:
2087: case OP_NOT_VSPACE:
2088: if (eptr >= md->end_subject)
2089: {
2090: SCHECK_PARTIAL();
2091: MRRETURN(MATCH_NOMATCH);
2092: }
2093: GETCHARINCTEST(c, eptr);
2094: switch(c)
2095: {
2096: default: break;
2097: case 0x0a: /* LF */
2098: case 0x0b: /* VT */
2099: case 0x0c: /* FF */
2100: case 0x0d: /* CR */
2101: case 0x85: /* NEL */
2102: case 0x2028: /* LINE SEPARATOR */
2103: case 0x2029: /* PARAGRAPH SEPARATOR */
2104: MRRETURN(MATCH_NOMATCH);
2105: }
2106: ecode++;
2107: break;
2108:
2109: case OP_VSPACE:
2110: if (eptr >= md->end_subject)
2111: {
2112: SCHECK_PARTIAL();
2113: MRRETURN(MATCH_NOMATCH);
2114: }
2115: GETCHARINCTEST(c, eptr);
2116: switch(c)
2117: {
2118: default: MRRETURN(MATCH_NOMATCH);
2119: case 0x0a: /* LF */
2120: case 0x0b: /* VT */
2121: case 0x0c: /* FF */
2122: case 0x0d: /* CR */
2123: case 0x85: /* NEL */
2124: case 0x2028: /* LINE SEPARATOR */
2125: case 0x2029: /* PARAGRAPH SEPARATOR */
2126: break;
2127: }
2128: ecode++;
2129: break;
2130:
2131: #ifdef SUPPORT_UCP
2132: /* Check the next character by Unicode property. We will get here only
2133: if the support is in the binary; otherwise a compile-time error occurs. */
2134:
2135: case OP_PROP:
2136: case OP_NOTPROP:
2137: if (eptr >= md->end_subject)
2138: {
2139: SCHECK_PARTIAL();
2140: MRRETURN(MATCH_NOMATCH);
2141: }
2142: GETCHARINCTEST(c, eptr);
2143: {
2144: const ucd_record *prop = GET_UCD(c);
2145:
2146: switch(ecode[1])
2147: {
2148: case PT_ANY:
2149: if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2150: break;
2151:
2152: case PT_LAMP:
2153: if ((prop->chartype == ucp_Lu ||
2154: prop->chartype == ucp_Ll ||
2155: prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2156: MRRETURN(MATCH_NOMATCH);
2157: break;
2158:
2159: case PT_GC:
2160: if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2161: MRRETURN(MATCH_NOMATCH);
2162: break;
2163:
2164: case PT_PC:
2165: if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2166: MRRETURN(MATCH_NOMATCH);
2167: break;
2168:
2169: case PT_SC:
2170: if ((ecode[2] != prop->script) == (op == OP_PROP))
2171: MRRETURN(MATCH_NOMATCH);
2172: break;
2173:
2174: /* These are specials */
2175:
2176: case PT_ALNUM:
2177: if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2178: _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2179: MRRETURN(MATCH_NOMATCH);
2180: break;
2181:
2182: case PT_SPACE: /* Perl space */
2183: if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2184: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2185: == (op == OP_NOTPROP))
2186: MRRETURN(MATCH_NOMATCH);
2187: break;
2188:
2189: case PT_PXSPACE: /* POSIX space */
2190: if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2191: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2192: c == CHAR_FF || c == CHAR_CR)
2193: == (op == OP_NOTPROP))
2194: MRRETURN(MATCH_NOMATCH);
2195: break;
2196:
2197: case PT_WORD:
2198: if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2199: _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2200: c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2201: MRRETURN(MATCH_NOMATCH);
2202: break;
2203:
2204: /* This should never occur */
2205:
2206: default:
2207: RRETURN(PCRE_ERROR_INTERNAL);
2208: }
2209:
2210: ecode += 3;
2211: }
2212: break;
2213:
2214: /* Match an extended Unicode sequence. We will get here only if the support
2215: is in the binary; otherwise a compile-time error occurs. */
2216:
2217: case OP_EXTUNI:
2218: if (eptr >= md->end_subject)
2219: {
2220: SCHECK_PARTIAL();
2221: MRRETURN(MATCH_NOMATCH);
2222: }
2223: GETCHARINCTEST(c, eptr);
2224: {
2225: int category = UCD_CATEGORY(c);
2226: if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2227: while (eptr < md->end_subject)
2228: {
2229: int len = 1;
2230: if (!utf8) c = *eptr; else
2231: {
2232: GETCHARLEN(c, eptr, len);
2233: }
2234: category = UCD_CATEGORY(c);
2235: if (category != ucp_M) break;
2236: eptr += len;
2237: }
2238: }
2239: ecode++;
2240: break;
2241: #endif
2242:
2243:
2244: /* Match a back reference, possibly repeatedly. Look past the end of the
2245: item to see if there is repeat information following. The code is similar
2246: to that for character classes, but repeated for efficiency. Then obey
2247: similar code to character type repeats - written out again for speed.
2248: However, if the referenced string is the empty string, always treat
2249: it as matched, any number of times (otherwise there could be infinite
2250: loops). */
2251:
2252: case OP_REF:
2253: {
2254: offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2255: ecode += 3;
2256:
2257: /* If the reference is unset, there are two possibilities:
2258:
2259: (a) In the default, Perl-compatible state, set the length to be longer
2260: than the amount of subject left; this ensures that every attempt at a
2261: match fails. We can't just fail here, because of the possibility of
2262: quantifiers with zero minima.
2263:
2264: (b) If the JavaScript compatibility flag is set, set the length to zero
2265: so that the back reference matches an empty string.
2266:
2267: Otherwise, set the length to the length of what was matched by the
2268: referenced subpattern. */
2269:
2270: if (offset >= offset_top || md->offset_vector[offset] < 0)
2271: length = (md->jscript_compat)? 0 : (int)(md->end_subject - eptr + 1);
2272: else
2273: length = md->offset_vector[offset+1] - md->offset_vector[offset];
2274:
2275: /* Set up for repetition, or handle the non-repeated case */
2276:
2277: switch (*ecode)
2278: {
2279: case OP_CRSTAR:
2280: case OP_CRMINSTAR:
2281: case OP_CRPLUS:
2282: case OP_CRMINPLUS:
2283: case OP_CRQUERY:
2284: case OP_CRMINQUERY:
2285: c = *ecode++ - OP_CRSTAR;
2286: minimize = (c & 1) != 0;
2287: min = rep_min[c]; /* Pick up values from tables; */
2288: max = rep_max[c]; /* zero for max => infinity */
2289: if (max == 0) max = INT_MAX;
2290: break;
2291:
2292: case OP_CRRANGE:
2293: case OP_CRMINRANGE:
2294: minimize = (*ecode == OP_CRMINRANGE);
2295: min = GET2(ecode, 1);
2296: max = GET2(ecode, 3);
2297: if (max == 0) max = INT_MAX;
2298: ecode += 5;
2299: break;
2300:
2301: default: /* No repeat follows */
2302: if (!match_ref(offset, eptr, length, md, ims))
2303: {
2304: CHECK_PARTIAL();
2305: MRRETURN(MATCH_NOMATCH);
2306: }
2307: eptr += length;
2308: continue; /* With the main loop */
2309: }
2310:
2311: /* If the length of the reference is zero, just continue with the
2312: main loop. */
2313:
2314: if (length == 0) continue;
2315:
2316: /* First, ensure the minimum number of matches are present. We get back
2317: the length of the reference string explicitly rather than passing the
2318: address of eptr, so that eptr can be a register variable. */
2319:
2320: for (i = 1; i <= min; i++)
2321: {
2322: if (!match_ref(offset, eptr, length, md, ims))
2323: {
2324: CHECK_PARTIAL();
2325: MRRETURN(MATCH_NOMATCH);
2326: }
2327: eptr += length;
2328: }
2329:
2330: /* If min = max, continue at the same level without recursion.
2331: They are not both allowed to be zero. */
2332:
2333: if (min == max) continue;
2334:
2335: /* If minimizing, keep trying and advancing the pointer */
2336:
2337: if (minimize)
2338: {
2339: for (fi = min;; fi++)
2340: {
2341: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
2342: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2343: if (fi >= max) MRRETURN(MATCH_NOMATCH);
2344: if (!match_ref(offset, eptr, length, md, ims))
2345: {
2346: CHECK_PARTIAL();
2347: MRRETURN(MATCH_NOMATCH);
2348: }
2349: eptr += length;
2350: }
2351: /* Control never gets here */
2352: }
2353:
2354: /* If maximizing, find the longest string and work backwards */
2355:
2356: else
2357: {
2358: pp = eptr;
2359: for (i = min; i < max; i++)
2360: {
2361: if (!match_ref(offset, eptr, length, md, ims))
2362: {
2363: CHECK_PARTIAL();
2364: break;
2365: }
2366: eptr += length;
2367: }
2368: while (eptr >= pp)
2369: {
2370: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2371: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2372: eptr -= length;
2373: }
2374: MRRETURN(MATCH_NOMATCH);
2375: }
2376: }
2377: /* Control never gets here */
2378:
2379: /* Match a bit-mapped character class, possibly repeatedly. This op code is
2380: used when all the characters in the class have values in the range 0-255,
2381: and either the matching is caseful, or the characters are in the range
2382: 0-127 when UTF-8 processing is enabled. The only difference between
2383: OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2384: encountered.
2385:
2386: First, look past the end of the item to see if there is repeat information
2387: following. Then obey similar code to character type repeats - written out
2388: again for speed. */
2389:
2390: case OP_NCLASS:
2391: case OP_CLASS:
2392: {
2393: data = ecode + 1; /* Save for matching */
2394: ecode += 33; /* Advance past the item */
2395:
2396: switch (*ecode)
2397: {
2398: case OP_CRSTAR:
2399: case OP_CRMINSTAR:
2400: case OP_CRPLUS:
2401: case OP_CRMINPLUS:
2402: case OP_CRQUERY:
2403: case OP_CRMINQUERY:
2404: c = *ecode++ - OP_CRSTAR;
2405: minimize = (c & 1) != 0;
2406: min = rep_min[c]; /* Pick up values from tables; */
2407: max = rep_max[c]; /* zero for max => infinity */
2408: if (max == 0) max = INT_MAX;
2409: break;
2410:
2411: case OP_CRRANGE:
2412: case OP_CRMINRANGE:
2413: minimize = (*ecode == OP_CRMINRANGE);
2414: min = GET2(ecode, 1);
2415: max = GET2(ecode, 3);
2416: if (max == 0) max = INT_MAX;
2417: ecode += 5;
2418: break;
2419:
2420: default: /* No repeat follows */
2421: min = max = 1;
2422: break;
2423: }
2424:
2425: /* First, ensure the minimum number of matches are present. */
2426:
2427: #ifdef SUPPORT_UTF8
2428: /* UTF-8 mode */
2429: if (utf8)
2430: {
2431: for (i = 1; i <= min; i++)
2432: {
2433: if (eptr >= md->end_subject)
2434: {
2435: SCHECK_PARTIAL();
2436: MRRETURN(MATCH_NOMATCH);
2437: }
2438: GETCHARINC(c, eptr);
2439: if (c > 255)
2440: {
2441: if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2442: }
2443: else
2444: {
2445: if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2446: }
2447: }
2448: }
2449: else
2450: #endif
2451: /* Not UTF-8 mode */
2452: {
2453: for (i = 1; i <= min; i++)
2454: {
2455: if (eptr >= md->end_subject)
2456: {
2457: SCHECK_PARTIAL();
2458: MRRETURN(MATCH_NOMATCH);
2459: }
2460: c = *eptr++;
2461: if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2462: }
2463: }
2464:
2465: /* If max == min we can continue with the main loop without the
2466: need to recurse. */
2467:
2468: if (min == max) continue;
2469:
2470: /* If minimizing, keep testing the rest of the expression and advancing
2471: the pointer while it matches the class. */
2472:
2473: if (minimize)
2474: {
2475: #ifdef SUPPORT_UTF8
2476: /* UTF-8 mode */
2477: if (utf8)
2478: {
2479: for (fi = min;; fi++)
2480: {
2481: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2482: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2483: if (fi >= max) MRRETURN(MATCH_NOMATCH);
2484: if (eptr >= md->end_subject)
2485: {
2486: SCHECK_PARTIAL();
2487: MRRETURN(MATCH_NOMATCH);
2488: }
2489: GETCHARINC(c, eptr);
2490: if (c > 255)
2491: {
2492: if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2493: }
2494: else
2495: {
2496: if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2497: }
2498: }
2499: }
2500: else
2501: #endif
2502: /* Not UTF-8 mode */
2503: {
2504: for (fi = min;; fi++)
2505: {
2506: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2507: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2508: if (fi >= max) MRRETURN(MATCH_NOMATCH);
2509: if (eptr >= md->end_subject)
2510: {
2511: SCHECK_PARTIAL();
2512: MRRETURN(MATCH_NOMATCH);
2513: }
2514: c = *eptr++;
2515: if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2516: }
2517: }
2518: /* Control never gets here */
2519: }
2520:
2521: /* If maximizing, find the longest possible run, then work backwards. */
2522:
2523: else
2524: {
2525: pp = eptr;
2526:
2527: #ifdef SUPPORT_UTF8
2528: /* UTF-8 mode */
2529: if (utf8)
2530: {
2531: for (i = min; i < max; i++)
2532: {
2533: int len = 1;
2534: if (eptr >= md->end_subject)
2535: {
2536: SCHECK_PARTIAL();
2537: break;
2538: }
2539: GETCHARLEN(c, eptr, len);
2540: if (c > 255)
2541: {
2542: if (op == OP_CLASS) break;
2543: }
2544: else
2545: {
2546: if ((data[c/8] & (1 << (c&7))) == 0) break;
2547: }
2548: eptr += len;
2549: }
2550: for (;;)
2551: {
2552: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2553: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2554: if (eptr-- == pp) break; /* Stop if tried at original pos */
2555: BACKCHAR(eptr);
2556: }
2557: }
2558: else
2559: #endif
2560: /* Not UTF-8 mode */
2561: {
2562: for (i = min; i < max; i++)
2563: {
2564: if (eptr >= md->end_subject)
2565: {
2566: SCHECK_PARTIAL();
2567: break;
2568: }
2569: c = *eptr;
2570: if ((data[c/8] & (1 << (c&7))) == 0) break;
2571: eptr++;
2572: }
2573: while (eptr >= pp)
2574: {
2575: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2576: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2577: eptr--;
2578: }
2579: }
2580:
2581: MRRETURN(MATCH_NOMATCH);
2582: }
2583: }
2584: /* Control never gets here */
2585:
2586:
2587: /* Match an extended character class. This opcode is encountered only
2588: when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2589: mode, because Unicode properties are supported in non-UTF-8 mode. */
2590:
2591: #ifdef SUPPORT_UTF8
2592: case OP_XCLASS:
2593: {
2594: data = ecode + 1 + LINK_SIZE; /* Save for matching */
2595: ecode += GET(ecode, 1); /* Advance past the item */
2596:
2597: switch (*ecode)
2598: {
2599: case OP_CRSTAR:
2600: case OP_CRMINSTAR:
2601: case OP_CRPLUS:
2602: case OP_CRMINPLUS:
2603: case OP_CRQUERY:
2604: case OP_CRMINQUERY:
2605: c = *ecode++ - OP_CRSTAR;
2606: minimize = (c & 1) != 0;
2607: min = rep_min[c]; /* Pick up values from tables; */
2608: max = rep_max[c]; /* zero for max => infinity */
2609: if (max == 0) max = INT_MAX;
2610: break;
2611:
2612: case OP_CRRANGE:
2613: case OP_CRMINRANGE:
2614: minimize = (*ecode == OP_CRMINRANGE);
2615: min = GET2(ecode, 1);
2616: max = GET2(ecode, 3);
2617: if (max == 0) max = INT_MAX;
2618: ecode += 5;
2619: break;
2620:
2621: default: /* No repeat follows */
2622: min = max = 1;
2623: break;
2624: }
2625:
2626: /* First, ensure the minimum number of matches are present. */
2627:
2628: for (i = 1; i <= min; i++)
2629: {
2630: if (eptr >= md->end_subject)
2631: {
2632: SCHECK_PARTIAL();
2633: MRRETURN(MATCH_NOMATCH);
2634: }
2635: GETCHARINCTEST(c, eptr);
2636: if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2637: }
2638:
2639: /* If max == min we can continue with the main loop without the
2640: need to recurse. */
2641:
2642: if (min == max) continue;
2643:
2644: /* If minimizing, keep testing the rest of the expression and advancing
2645: the pointer while it matches the class. */
2646:
2647: if (minimize)
2648: {
2649: for (fi = min;; fi++)
2650: {
2651: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2652: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2653: if (fi >= max) MRRETURN(MATCH_NOMATCH);
2654: if (eptr >= md->end_subject)
2655: {
2656: SCHECK_PARTIAL();
2657: MRRETURN(MATCH_NOMATCH);
2658: }
2659: GETCHARINCTEST(c, eptr);
2660: if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2661: }
2662: /* Control never gets here */
2663: }
2664:
2665: /* If maximizing, find the longest possible run, then work backwards. */
2666:
2667: else
2668: {
2669: pp = eptr;
2670: for (i = min; i < max; i++)
2671: {
2672: int len = 1;
2673: if (eptr >= md->end_subject)
2674: {
2675: SCHECK_PARTIAL();
2676: break;
2677: }
2678: GETCHARLENTEST(c, eptr, len);
2679: if (!_pcre_xclass(c, data)) break;
2680: eptr += len;
2681: }
2682: for(;;)
2683: {
2684: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2685: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2686: if (eptr-- == pp) break; /* Stop if tried at original pos */
2687: if (utf8) BACKCHAR(eptr);
2688: }
2689: MRRETURN(MATCH_NOMATCH);
2690: }
2691:
2692: /* Control never gets here */
2693: }
2694: #endif /* End of XCLASS */
2695:
2696: /* Match a single character, casefully */
2697:
2698: case OP_CHAR:
2699: #ifdef SUPPORT_UTF8
2700: if (utf8)
2701: {
2702: length = 1;
2703: ecode++;
2704: GETCHARLEN(fc, ecode, length);
2705: if (length > md->end_subject - eptr)
2706: {
2707: CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2708: MRRETURN(MATCH_NOMATCH);
2709: }
2710: while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2711: }
2712: else
2713: #endif
2714:
2715: /* Non-UTF-8 mode */
2716: {
2717: if (md->end_subject - eptr < 1)
2718: {
2719: SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2720: MRRETURN(MATCH_NOMATCH);
2721: }
2722: if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2723: ecode += 2;
2724: }
2725: break;
2726:
2727: /* Match a single character, caselessly */
2728:
2729: case OP_CHARNC:
2730: #ifdef SUPPORT_UTF8
2731: if (utf8)
2732: {
2733: length = 1;
2734: ecode++;
2735: GETCHARLEN(fc, ecode, length);
2736:
2737: if (length > md->end_subject - eptr)
2738: {
2739: CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2740: MRRETURN(MATCH_NOMATCH);
2741: }
2742:
2743: /* If the pattern character's value is < 128, we have only one byte, and
2744: can use the fast lookup table. */
2745:
2746: if (fc < 128)
2747: {
2748: if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2749: }
2750:
2751: /* Otherwise we must pick up the subject character */
2752:
2753: else
2754: {
2755: unsigned int dc;
2756: GETCHARINC(dc, eptr);
2757: ecode += length;
2758:
2759: /* If we have Unicode property support, we can use it to test the other
2760: case of the character, if there is one. */
2761:
2762: if (fc != dc)
2763: {
2764: #ifdef SUPPORT_UCP
2765: if (dc != UCD_OTHERCASE(fc))
2766: #endif
2767: MRRETURN(MATCH_NOMATCH);
2768: }
2769: }
2770: }
2771: else
2772: #endif /* SUPPORT_UTF8 */
2773:
2774: /* Non-UTF-8 mode */
2775: {
2776: if (md->end_subject - eptr < 1)
2777: {
2778: SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2779: MRRETURN(MATCH_NOMATCH);
2780: }
2781: if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2782: ecode += 2;
2783: }
2784: break;
2785:
2786: /* Match a single character repeatedly. */
2787:
2788: case OP_EXACT:
2789: min = max = GET2(ecode, 1);
2790: ecode += 3;
2791: goto REPEATCHAR;
2792:
2793: case OP_POSUPTO:
2794: possessive = TRUE;
2795: /* Fall through */
2796:
2797: case OP_UPTO:
2798: case OP_MINUPTO:
2799: min = 0;
2800: max = GET2(ecode, 1);
2801: minimize = *ecode == OP_MINUPTO;
2802: ecode += 3;
2803: goto REPEATCHAR;
2804:
2805: case OP_POSSTAR:
2806: possessive = TRUE;
2807: min = 0;
2808: max = INT_MAX;
2809: ecode++;
2810: goto REPEATCHAR;
2811:
2812: case OP_POSPLUS:
2813: possessive = TRUE;
2814: min = 1;
2815: max = INT_MAX;
2816: ecode++;
2817: goto REPEATCHAR;
2818:
2819: case OP_POSQUERY:
2820: possessive = TRUE;
2821: min = 0;
2822: max = 1;
2823: ecode++;
2824: goto REPEATCHAR;
2825:
2826: case OP_STAR:
2827: case OP_MINSTAR:
2828: case OP_PLUS:
2829: case OP_MINPLUS:
2830: case OP_QUERY:
2831: case OP_MINQUERY:
2832: c = *ecode++ - OP_STAR;
2833: minimize = (c & 1) != 0;
2834:
2835: min = rep_min[c]; /* Pick up values from tables; */
2836: max = rep_max[c]; /* zero for max => infinity */
2837: if (max == 0) max = INT_MAX;
2838:
2839: /* Common code for all repeated single-character matches. */
2840:
2841: REPEATCHAR:
2842: #ifdef SUPPORT_UTF8
2843: if (utf8)
2844: {
2845: length = 1;
2846: charptr = ecode;
2847: GETCHARLEN(fc, ecode, length);
2848: ecode += length;
2849:
2850: /* Handle multibyte character matching specially here. There is
2851: support for caseless matching if UCP support is present. */
2852:
2853: if (length > 1)
2854: {
2855: #ifdef SUPPORT_UCP
2856: unsigned int othercase;
2857: if ((ims & PCRE_CASELESS) != 0 &&
2858: (othercase = UCD_OTHERCASE(fc)) != fc)
2859: oclength = _pcre_ord2utf8(othercase, occhars);
2860: else oclength = 0;
2861: #endif /* SUPPORT_UCP */
2862:
2863: for (i = 1; i <= min; i++)
2864: {
2865: if (eptr <= md->end_subject - length &&
2866: memcmp(eptr, charptr, length) == 0) eptr += length;
2867: #ifdef SUPPORT_UCP
2868: else if (oclength > 0 &&
2869: eptr <= md->end_subject - oclength &&
2870: memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2871: #endif /* SUPPORT_UCP */
2872: else
2873: {
2874: CHECK_PARTIAL();
2875: MRRETURN(MATCH_NOMATCH);
2876: }
2877: }
2878:
2879: if (min == max) continue;
2880:
2881: if (minimize)
2882: {
2883: for (fi = min;; fi++)
2884: {
2885: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2886: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2887: if (fi >= max) MRRETURN(MATCH_NOMATCH);
2888: if (eptr <= md->end_subject - length &&
2889: memcmp(eptr, charptr, length) == 0) eptr += length;
2890: #ifdef SUPPORT_UCP
2891: else if (oclength > 0 &&
2892: eptr <= md->end_subject - oclength &&
2893: memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2894: #endif /* SUPPORT_UCP */
2895: else
2896: {
2897: CHECK_PARTIAL();
2898: MRRETURN(MATCH_NOMATCH);
2899: }
2900: }
2901: /* Control never gets here */
2902: }
2903:
2904: else /* Maximize */
2905: {
2906: pp = eptr;
2907: for (i = min; i < max; i++)
2908: {
2909: if (eptr <= md->end_subject - length &&
2910: memcmp(eptr, charptr, length) == 0) eptr += length;
2911: #ifdef SUPPORT_UCP
2912: else if (oclength > 0 &&
2913: eptr <= md->end_subject - oclength &&
2914: memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2915: #endif /* SUPPORT_UCP */
2916: else
2917: {
2918: CHECK_PARTIAL();
2919: break;
2920: }
2921: }
2922:
2923: if (possessive) continue;
2924:
2925: for(;;)
2926: {
2927: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2928: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2929: if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
2930: #ifdef SUPPORT_UCP
2931: eptr--;
2932: BACKCHAR(eptr);
2933: #else /* without SUPPORT_UCP */
2934: eptr -= length;
2935: #endif /* SUPPORT_UCP */
2936: }
2937: }
2938: /* Control never gets here */
2939: }
2940:
2941: /* If the length of a UTF-8 character is 1, we fall through here, and
2942: obey the code as for non-UTF-8 characters below, though in this case the
2943: value of fc will always be < 128. */
2944: }
2945: else
2946: #endif /* SUPPORT_UTF8 */
2947:
2948: /* When not in UTF-8 mode, load a single-byte character. */
2949:
2950: fc = *ecode++;
2951:
2952: /* The value of fc at this point is always less than 256, though we may or
2953: may not be in UTF-8 mode. The code is duplicated for the caseless and
2954: caseful cases, for speed, since matching characters is likely to be quite
2955: common. First, ensure the minimum number of matches are present. If min =
2956: max, continue at the same level without recursing. Otherwise, if
2957: minimizing, keep trying the rest of the expression and advancing one
2958: matching character if failing, up to the maximum. Alternatively, if
2959: maximizing, find the maximum number of characters and work backwards. */
2960:
2961: DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2962: max, eptr));
2963:
2964: if ((ims & PCRE_CASELESS) != 0)
2965: {
2966: fc = md->lcc[fc];
2967: for (i = 1; i <= min; i++)
2968: {
2969: if (eptr >= md->end_subject)
2970: {
2971: SCHECK_PARTIAL();
2972: MRRETURN(MATCH_NOMATCH);
2973: }
2974: if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2975: }
2976: if (min == max) continue;
2977: if (minimize)
2978: {
2979: for (fi = min;; fi++)
2980: {
2981: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2982: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2983: if (fi >= max) MRRETURN(MATCH_NOMATCH);
2984: if (eptr >= md->end_subject)
2985: {
2986: SCHECK_PARTIAL();
2987: MRRETURN(MATCH_NOMATCH);
2988: }
2989: if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2990: }
2991: /* Control never gets here */
2992: }
2993: else /* Maximize */
2994: {
2995: pp = eptr;
2996: for (i = min; i < max; i++)
2997: {
2998: if (eptr >= md->end_subject)
2999: {
3000: SCHECK_PARTIAL();
3001: break;
3002: }
3003: if (fc != md->lcc[*eptr]) break;
3004: eptr++;
3005: }
3006:
3007: if (possessive) continue;
3008:
3009: while (eptr >= pp)
3010: {
3011: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
3012: eptr--;
3013: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3014: }
3015: MRRETURN(MATCH_NOMATCH);
3016: }
3017: /* Control never gets here */
3018: }
3019:
3020: /* Caseful comparisons (includes all multi-byte characters) */
3021:
3022: else
3023: {
3024: for (i = 1; i <= min; i++)
3025: {
3026: if (eptr >= md->end_subject)
3027: {
3028: SCHECK_PARTIAL();
3029: MRRETURN(MATCH_NOMATCH);
3030: }
3031: if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3032: }
3033:
3034: if (min == max) continue;
3035:
3036: if (minimize)
3037: {
3038: for (fi = min;; fi++)
3039: {
3040: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
3041: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3042: if (fi >= max) MRRETURN(MATCH_NOMATCH);
3043: if (eptr >= md->end_subject)
3044: {
3045: SCHECK_PARTIAL();
3046: MRRETURN(MATCH_NOMATCH);
3047: }
3048: if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3049: }
3050: /* Control never gets here */
3051: }
3052: else /* Maximize */
3053: {
3054: pp = eptr;
3055: for (i = min; i < max; i++)
3056: {
3057: if (eptr >= md->end_subject)
3058: {
3059: SCHECK_PARTIAL();
3060: break;
3061: }
3062: if (fc != *eptr) break;
3063: eptr++;
3064: }
3065: if (possessive) continue;
3066:
3067: while (eptr >= pp)
3068: {
3069: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
3070: eptr--;
3071: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3072: }
3073: MRRETURN(MATCH_NOMATCH);
3074: }
3075: }
3076: /* Control never gets here */
3077:
3078: /* Match a negated single one-byte character. The character we are
3079: checking can be multibyte. */
3080:
3081: case OP_NOT:
3082: if (eptr >= md->end_subject)
3083: {
3084: SCHECK_PARTIAL();
3085: MRRETURN(MATCH_NOMATCH);
3086: }
3087: ecode++;
3088: GETCHARINCTEST(c, eptr);
3089: if ((ims & PCRE_CASELESS) != 0)
3090: {
3091: #ifdef SUPPORT_UTF8
3092: if (c < 256)
3093: #endif
3094: c = md->lcc[c];
3095: if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3096: }
3097: else
3098: {
3099: if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3100: }
3101: break;
3102:
3103: /* Match a negated single one-byte character repeatedly. This is almost a
3104: repeat of the code for a repeated single character, but I haven't found a
3105: nice way of commoning these up that doesn't require a test of the
3106: positive/negative option for each character match. Maybe that wouldn't add
3107: very much to the time taken, but character matching *is* what this is all
3108: about... */
3109:
3110: case OP_NOTEXACT:
3111: min = max = GET2(ecode, 1);
3112: ecode += 3;
3113: goto REPEATNOTCHAR;
3114:
3115: case OP_NOTUPTO:
3116: case OP_NOTMINUPTO:
3117: min = 0;
3118: max = GET2(ecode, 1);
3119: minimize = *ecode == OP_NOTMINUPTO;
3120: ecode += 3;
3121: goto REPEATNOTCHAR;
3122:
3123: case OP_NOTPOSSTAR:
3124: possessive = TRUE;
3125: min = 0;
3126: max = INT_MAX;
3127: ecode++;
3128: goto REPEATNOTCHAR;
3129:
3130: case OP_NOTPOSPLUS:
3131: possessive = TRUE;
3132: min = 1;
3133: max = INT_MAX;
3134: ecode++;
3135: goto REPEATNOTCHAR;
3136:
3137: case OP_NOTPOSQUERY:
3138: possessive = TRUE;
3139: min = 0;
3140: max = 1;
3141: ecode++;
3142: goto REPEATNOTCHAR;
3143:
3144: case OP_NOTPOSUPTO:
3145: possessive = TRUE;
3146: min = 0;
3147: max = GET2(ecode, 1);
3148: ecode += 3;
3149: goto REPEATNOTCHAR;
3150:
3151: case OP_NOTSTAR:
3152: case OP_NOTMINSTAR:
3153: case OP_NOTPLUS:
3154: case OP_NOTMINPLUS:
3155: case OP_NOTQUERY:
3156: case OP_NOTMINQUERY:
3157: c = *ecode++ - OP_NOTSTAR;
3158: minimize = (c & 1) != 0;
3159: min = rep_min[c]; /* Pick up values from tables; */
3160: max = rep_max[c]; /* zero for max => infinity */
3161: if (max == 0) max = INT_MAX;
3162:
3163: /* Common code for all repeated single-byte matches. */
3164:
3165: REPEATNOTCHAR:
3166: fc = *ecode++;
3167:
3168: /* The code is duplicated for the caseless and caseful cases, for speed,
3169: since matching characters is likely to be quite common. First, ensure the
3170: minimum number of matches are present. If min = max, continue at the same
3171: level without recursing. Otherwise, if minimizing, keep trying the rest of
3172: the expression and advancing one matching character if failing, up to the
3173: maximum. Alternatively, if maximizing, find the maximum number of
3174: characters and work backwards. */
3175:
3176: DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3177: max, eptr));
3178:
3179: if ((ims & PCRE_CASELESS) != 0)
3180: {
3181: fc = md->lcc[fc];
3182:
3183: #ifdef SUPPORT_UTF8
3184: /* UTF-8 mode */
3185: if (utf8)
3186: {
3187: register unsigned int d;
3188: for (i = 1; i <= min; i++)
3189: {
3190: if (eptr >= md->end_subject)
3191: {
3192: SCHECK_PARTIAL();
3193: MRRETURN(MATCH_NOMATCH);
3194: }
3195: GETCHARINC(d, eptr);
3196: if (d < 256) d = md->lcc[d];
3197: if (fc == d) MRRETURN(MATCH_NOMATCH);
3198: }
3199: }
3200: else
3201: #endif
3202:
3203: /* Not UTF-8 mode */
3204: {
3205: for (i = 1; i <= min; i++)
3206: {
3207: if (eptr >= md->end_subject)
3208: {
3209: SCHECK_PARTIAL();
3210: MRRETURN(MATCH_NOMATCH);
3211: }
3212: if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3213: }
3214: }
3215:
3216: if (min == max) continue;
3217:
3218: if (minimize)
3219: {
3220: #ifdef SUPPORT_UTF8
3221: /* UTF-8 mode */
3222: if (utf8)
3223: {
3224: register unsigned int d;
3225: for (fi = min;; fi++)
3226: {
3227: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
3228: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3229: if (fi >= max) MRRETURN(MATCH_NOMATCH);
3230: if (eptr >= md->end_subject)
3231: {
3232: SCHECK_PARTIAL();
3233: MRRETURN(MATCH_NOMATCH);
3234: }
3235: GETCHARINC(d, eptr);
3236: if (d < 256) d = md->lcc[d];
3237: if (fc == d) MRRETURN(MATCH_NOMATCH);
3238: }
3239: }
3240: else
3241: #endif
3242: /* Not UTF-8 mode */
3243: {
3244: for (fi = min;; fi++)
3245: {
3246: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
3247: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3248: if (fi >= max) MRRETURN(MATCH_NOMATCH);
3249: if (eptr >= md->end_subject)
3250: {
3251: SCHECK_PARTIAL();
3252: MRRETURN(MATCH_NOMATCH);
3253: }
3254: if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3255: }
3256: }
3257: /* Control never gets here */
3258: }
3259:
3260: /* Maximize case */
3261:
3262: else
3263: {
3264: pp = eptr;
3265:
3266: #ifdef SUPPORT_UTF8
3267: /* UTF-8 mode */
3268: if (utf8)
3269: {
3270: register unsigned int d;
3271: for (i = min; i < max; i++)
3272: {
3273: int len = 1;
3274: if (eptr >= md->end_subject)
3275: {
3276: SCHECK_PARTIAL();
3277: break;
3278: }
3279: GETCHARLEN(d, eptr, len);
3280: if (d < 256) d = md->lcc[d];
3281: if (fc == d) break;
3282: eptr += len;
3283: }
3284: if (possessive) continue;
3285: for(;;)
3286: {
3287: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
3288: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3289: if (eptr-- == pp) break; /* Stop if tried at original pos */
3290: BACKCHAR(eptr);
3291: }
3292: }
3293: else
3294: #endif
3295: /* Not UTF-8 mode */
3296: {
3297: for (i = min; i < max; i++)
3298: {
3299: if (eptr >= md->end_subject)
3300: {
3301: SCHECK_PARTIAL();
3302: break;
3303: }
3304: if (fc == md->lcc[*eptr]) break;
3305: eptr++;
3306: }
3307: if (possessive) continue;
3308: while (eptr >= pp)
3309: {
3310: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
3311: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3312: eptr--;
3313: }
3314: }
3315:
3316: MRRETURN(MATCH_NOMATCH);
3317: }
3318: /* Control never gets here */
3319: }
3320:
3321: /* Caseful comparisons */
3322:
3323: else
3324: {
3325: #ifdef SUPPORT_UTF8
3326: /* UTF-8 mode */
3327: if (utf8)
3328: {
3329: register unsigned int d;
3330: for (i = 1; i <= min; i++)
3331: {
3332: if (eptr >= md->end_subject)
3333: {
3334: SCHECK_PARTIAL();
3335: MRRETURN(MATCH_NOMATCH);
3336: }
3337: GETCHARINC(d, eptr);
3338: if (fc == d) MRRETURN(MATCH_NOMATCH);
3339: }
3340: }
3341: else
3342: #endif
3343: /* Not UTF-8 mode */
3344: {
3345: for (i = 1; i <= min; i++)
3346: {
3347: if (eptr >= md->end_subject)
3348: {
3349: SCHECK_PARTIAL();
3350: MRRETURN(MATCH_NOMATCH);
3351: }
3352: if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3353: }
3354: }
3355:
3356: if (min == max) continue;
3357:
3358: if (minimize)
3359: {
3360: #ifdef SUPPORT_UTF8
3361: /* UTF-8 mode */
3362: if (utf8)
3363: {
3364: register unsigned int d;
3365: for (fi = min;; fi++)
3366: {
3367: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
3368: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3369: if (fi >= max) MRRETURN(MATCH_NOMATCH);
3370: if (eptr >= md->end_subject)
3371: {
3372: SCHECK_PARTIAL();
3373: MRRETURN(MATCH_NOMATCH);
3374: }
3375: GETCHARINC(d, eptr);
3376: if (fc == d) MRRETURN(MATCH_NOMATCH);
3377: }
3378: }
3379: else
3380: #endif
3381: /* Not UTF-8 mode */
3382: {
3383: for (fi = min;; fi++)
3384: {
3385: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
3386: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3387: if (fi >= max) MRRETURN(MATCH_NOMATCH);
3388: if (eptr >= md->end_subject)
3389: {
3390: SCHECK_PARTIAL();
3391: MRRETURN(MATCH_NOMATCH);
3392: }
3393: if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3394: }
3395: }
3396: /* Control never gets here */
3397: }
3398:
3399: /* Maximize case */
3400:
3401: else
3402: {
3403: pp = eptr;
3404:
3405: #ifdef SUPPORT_UTF8
3406: /* UTF-8 mode */
3407: if (utf8)
3408: {
3409: register unsigned int d;
3410: for (i = min; i < max; i++)
3411: {
3412: int len = 1;
3413: if (eptr >= md->end_subject)
3414: {
3415: SCHECK_PARTIAL();
3416: break;
3417: }
3418: GETCHARLEN(d, eptr, len);
3419: if (fc == d) break;
3420: eptr += len;
3421: }
3422: if (possessive) continue;
3423: for(;;)
3424: {
3425: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3426: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3427: if (eptr-- == pp) break; /* Stop if tried at original pos */
3428: BACKCHAR(eptr);
3429: }
3430: }
3431: else
3432: #endif
3433: /* Not UTF-8 mode */
3434: {
3435: for (i = min; i < max; i++)
3436: {
3437: if (eptr >= md->end_subject)
3438: {
3439: SCHECK_PARTIAL();
3440: break;
3441: }
3442: if (fc == *eptr) break;
3443: eptr++;
3444: }
3445: if (possessive) continue;
3446: while (eptr >= pp)
3447: {
3448: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3449: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3450: eptr--;
3451: }
3452: }
3453:
3454: MRRETURN(MATCH_NOMATCH);
3455: }
3456: }
3457: /* Control never gets here */
3458:
3459: /* Match a single character type repeatedly; several different opcodes
3460: share code. This is very similar to the code for single characters, but we
3461: repeat it in the interests of efficiency. */
3462:
3463: case OP_TYPEEXACT:
3464: min = max = GET2(ecode, 1);
3465: minimize = TRUE;
3466: ecode += 3;
3467: goto REPEATTYPE;
3468:
3469: case OP_TYPEUPTO:
3470: case OP_TYPEMINUPTO:
3471: min = 0;
3472: max = GET2(ecode, 1);
3473: minimize = *ecode == OP_TYPEMINUPTO;
3474: ecode += 3;
3475: goto REPEATTYPE;
3476:
3477: case OP_TYPEPOSSTAR:
3478: possessive = TRUE;
3479: min = 0;
3480: max = INT_MAX;
3481: ecode++;
3482: goto REPEATTYPE;
3483:
3484: case OP_TYPEPOSPLUS:
3485: possessive = TRUE;
3486: min = 1;
3487: max = INT_MAX;
3488: ecode++;
3489: goto REPEATTYPE;
3490:
3491: case OP_TYPEPOSQUERY:
3492: possessive = TRUE;
3493: min = 0;
3494: max = 1;
3495: ecode++;
3496: goto REPEATTYPE;
3497:
3498: case OP_TYPEPOSUPTO:
3499: possessive = TRUE;
3500: min = 0;
3501: max = GET2(ecode, 1);
3502: ecode += 3;
3503: goto REPEATTYPE;
3504:
3505: case OP_TYPESTAR:
3506: case OP_TYPEMINSTAR:
3507: case OP_TYPEPLUS:
3508: case OP_TYPEMINPLUS:
3509: case OP_TYPEQUERY:
3510: case OP_TYPEMINQUERY:
3511: c = *ecode++ - OP_TYPESTAR;
3512: minimize = (c & 1) != 0;
3513: min = rep_min[c]; /* Pick up values from tables; */
3514: max = rep_max[c]; /* zero for max => infinity */
3515: if (max == 0) max = INT_MAX;
3516:
3517: /* Common code for all repeated single character type matches. Note that
3518: in UTF-8 mode, '.' matches a character of any length, but for the other
3519: character types, the valid characters are all one-byte long. */
3520:
3521: REPEATTYPE:
3522: ctype = *ecode++; /* Code for the character type */
3523:
3524: #ifdef SUPPORT_UCP
3525: if (ctype == OP_PROP || ctype == OP_NOTPROP)
3526: {
3527: prop_fail_result = ctype == OP_NOTPROP;
3528: prop_type = *ecode++;
3529: prop_value = *ecode++;
3530: }
3531: else prop_type = -1;
3532: #endif
3533:
3534: /* First, ensure the minimum number of matches are present. Use inline
3535: code for maximizing the speed, and do the type test once at the start
3536: (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3537: is tidier. Also separate the UCP code, which can be the same for both UTF-8
3538: and single-bytes. */
3539:
3540: if (min > 0)
3541: {
3542: #ifdef SUPPORT_UCP
3543: if (prop_type >= 0)
3544: {
3545: switch(prop_type)
3546: {
3547: case PT_ANY:
3548: if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3549: for (i = 1; i <= min; i++)
3550: {
3551: if (eptr >= md->end_subject)
3552: {
3553: SCHECK_PARTIAL();
3554: MRRETURN(MATCH_NOMATCH);
3555: }
3556: GETCHARINCTEST(c, eptr);
3557: }
3558: break;
3559:
3560: case PT_LAMP:
3561: for (i = 1; i <= min; i++)
3562: {
3563: if (eptr >= md->end_subject)
3564: {
3565: SCHECK_PARTIAL();
3566: MRRETURN(MATCH_NOMATCH);
3567: }
3568: GETCHARINCTEST(c, eptr);
3569: prop_chartype = UCD_CHARTYPE(c);
3570: if ((prop_chartype == ucp_Lu ||
3571: prop_chartype == ucp_Ll ||
3572: prop_chartype == ucp_Lt) == prop_fail_result)
3573: MRRETURN(MATCH_NOMATCH);
3574: }
3575: break;
3576:
3577: case PT_GC:
3578: for (i = 1; i <= min; i++)
3579: {
3580: if (eptr >= md->end_subject)
3581: {
3582: SCHECK_PARTIAL();
3583: MRRETURN(MATCH_NOMATCH);
3584: }
3585: GETCHARINCTEST(c, eptr);
3586: prop_category = UCD_CATEGORY(c);
3587: if ((prop_category == prop_value) == prop_fail_result)
3588: MRRETURN(MATCH_NOMATCH);
3589: }
3590: break;
3591:
3592: case PT_PC:
3593: for (i = 1; i <= min; i++)
3594: {
3595: if (eptr >= md->end_subject)
3596: {
3597: SCHECK_PARTIAL();
3598: MRRETURN(MATCH_NOMATCH);
3599: }
3600: GETCHARINCTEST(c, eptr);
3601: prop_chartype = UCD_CHARTYPE(c);
3602: if ((prop_chartype == prop_value) == prop_fail_result)
3603: MRRETURN(MATCH_NOMATCH);
3604: }
3605: break;
3606:
3607: case PT_SC:
3608: for (i = 1; i <= min; i++)
3609: {
3610: if (eptr >= md->end_subject)
3611: {
3612: SCHECK_PARTIAL();
3613: MRRETURN(MATCH_NOMATCH);
3614: }
3615: GETCHARINCTEST(c, eptr);
3616: prop_script = UCD_SCRIPT(c);
3617: if ((prop_script == prop_value) == prop_fail_result)
3618: MRRETURN(MATCH_NOMATCH);
3619: }
3620: break;
3621:
3622: case PT_ALNUM:
3623: for (i = 1; i <= min; i++)
3624: {
3625: if (eptr >= md->end_subject)
3626: {
3627: SCHECK_PARTIAL();
3628: MRRETURN(MATCH_NOMATCH);
3629: }
3630: GETCHARINCTEST(c, eptr);
3631: prop_category = UCD_CATEGORY(c);
3632: if ((prop_category == ucp_L || prop_category == ucp_N)
3633: == prop_fail_result)
3634: MRRETURN(MATCH_NOMATCH);
3635: }
3636: break;
3637:
3638: case PT_SPACE: /* Perl space */
3639: for (i = 1; i <= min; i++)
3640: {
3641: if (eptr >= md->end_subject)
3642: {
3643: SCHECK_PARTIAL();
3644: MRRETURN(MATCH_NOMATCH);
3645: }
3646: GETCHARINCTEST(c, eptr);
3647: prop_category = UCD_CATEGORY(c);
3648: if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3649: c == CHAR_FF || c == CHAR_CR)
3650: == prop_fail_result)
3651: MRRETURN(MATCH_NOMATCH);
3652: }
3653: break;
3654:
3655: case PT_PXSPACE: /* POSIX space */
3656: for (i = 1; i <= min; i++)
3657: {
3658: if (eptr >= md->end_subject)
3659: {
3660: SCHECK_PARTIAL();
3661: MRRETURN(MATCH_NOMATCH);
3662: }
3663: GETCHARINCTEST(c, eptr);
3664: prop_category = UCD_CATEGORY(c);
3665: if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3666: c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3667: == prop_fail_result)
3668: MRRETURN(MATCH_NOMATCH);
3669: }
3670: break;
3671:
3672: case PT_WORD:
3673: for (i = 1; i <= min; i++)
3674: {
3675: if (eptr >= md->end_subject)
3676: {
3677: SCHECK_PARTIAL();
3678: MRRETURN(MATCH_NOMATCH);
3679: }
3680: GETCHARINCTEST(c, eptr);
3681: prop_category = UCD_CATEGORY(c);
3682: if ((prop_category == ucp_L || prop_category == ucp_N ||
3683: c == CHAR_UNDERSCORE)
3684: == prop_fail_result)
3685: MRRETURN(MATCH_NOMATCH);
3686: }
3687: break;
3688:
3689: /* This should not occur */
3690:
3691: default:
3692: RRETURN(PCRE_ERROR_INTERNAL);
3693: }
3694: }
3695:
3696: /* Match extended Unicode sequences. We will get here only if the
3697: support is in the binary; otherwise a compile-time error occurs. */
3698:
3699: else if (ctype == OP_EXTUNI)
3700: {
3701: for (i = 1; i <= min; i++)
3702: {
3703: if (eptr >= md->end_subject)
3704: {
3705: SCHECK_PARTIAL();
3706: MRRETURN(MATCH_NOMATCH);
3707: }
3708: GETCHARINCTEST(c, eptr);
3709: prop_category = UCD_CATEGORY(c);
3710: if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3711: while (eptr < md->end_subject)
3712: {
3713: int len = 1;
3714: if (!utf8) c = *eptr;
3715: else { GETCHARLEN(c, eptr, len); }
3716: prop_category = UCD_CATEGORY(c);
3717: if (prop_category != ucp_M) break;
3718: eptr += len;
3719: }
3720: }
3721: }
3722:
3723: else
3724: #endif /* SUPPORT_UCP */
3725:
3726: /* Handle all other cases when the coding is UTF-8 */
3727:
3728: #ifdef SUPPORT_UTF8
3729: if (utf8) switch(ctype)
3730: {
3731: case OP_ANY:
3732: for (i = 1; i <= min; i++)
3733: {
3734: if (eptr >= md->end_subject)
3735: {
3736: SCHECK_PARTIAL();
3737: MRRETURN(MATCH_NOMATCH);
3738: }
3739: if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3740: eptr++;
3741: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3742: }
3743: break;
3744:
3745: case OP_ALLANY:
3746: for (i = 1; i <= min; i++)
3747: {
3748: if (eptr >= md->end_subject)
3749: {
3750: SCHECK_PARTIAL();
3751: MRRETURN(MATCH_NOMATCH);
3752: }
3753: eptr++;
3754: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3755: }
3756: break;
3757:
3758: case OP_ANYBYTE:
3759: if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3760: eptr += min;
3761: break;
3762:
3763: case OP_ANYNL:
3764: for (i = 1; i <= min; i++)
3765: {
3766: if (eptr >= md->end_subject)
3767: {
3768: SCHECK_PARTIAL();
3769: MRRETURN(MATCH_NOMATCH);
3770: }
3771: GETCHARINC(c, eptr);
3772: switch(c)
3773: {
3774: default: MRRETURN(MATCH_NOMATCH);
3775: case 0x000d:
3776: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3777: break;
3778:
3779: case 0x000a:
3780: break;
3781:
3782: case 0x000b:
3783: case 0x000c:
3784: case 0x0085:
3785: case 0x2028:
3786: case 0x2029:
3787: if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3788: break;
3789: }
3790: }
3791: break;
3792:
3793: case OP_NOT_HSPACE:
3794: for (i = 1; i <= min; i++)
3795: {
3796: if (eptr >= md->end_subject)
3797: {
3798: SCHECK_PARTIAL();
3799: MRRETURN(MATCH_NOMATCH);
3800: }
3801: GETCHARINC(c, eptr);
3802: switch(c)
3803: {
3804: default: break;
3805: case 0x09: /* HT */
3806: case 0x20: /* SPACE */
3807: case 0xa0: /* NBSP */
3808: case 0x1680: /* OGHAM SPACE MARK */
3809: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3810: case 0x2000: /* EN QUAD */
3811: case 0x2001: /* EM QUAD */
3812: case 0x2002: /* EN SPACE */
3813: case 0x2003: /* EM SPACE */
3814: case 0x2004: /* THREE-PER-EM SPACE */
3815: case 0x2005: /* FOUR-PER-EM SPACE */
3816: case 0x2006: /* SIX-PER-EM SPACE */
3817: case 0x2007: /* FIGURE SPACE */
3818: case 0x2008: /* PUNCTUATION SPACE */
3819: case 0x2009: /* THIN SPACE */
3820: case 0x200A: /* HAIR SPACE */
3821: case 0x202f: /* NARROW NO-BREAK SPACE */
3822: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3823: case 0x3000: /* IDEOGRAPHIC SPACE */
3824: MRRETURN(MATCH_NOMATCH);
3825: }
3826: }
3827: break;
3828:
3829: case OP_HSPACE:
3830: for (i = 1; i <= min; i++)
3831: {
3832: if (eptr >= md->end_subject)
3833: {
3834: SCHECK_PARTIAL();
3835: MRRETURN(MATCH_NOMATCH);
3836: }
3837: GETCHARINC(c, eptr);
3838: switch(c)
3839: {
3840: default: MRRETURN(MATCH_NOMATCH);
3841: case 0x09: /* HT */
3842: case 0x20: /* SPACE */
3843: case 0xa0: /* NBSP */
3844: case 0x1680: /* OGHAM SPACE MARK */
3845: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3846: case 0x2000: /* EN QUAD */
3847: case 0x2001: /* EM QUAD */
3848: case 0x2002: /* EN SPACE */
3849: case 0x2003: /* EM SPACE */
3850: case 0x2004: /* THREE-PER-EM SPACE */
3851: case 0x2005: /* FOUR-PER-EM SPACE */
3852: case 0x2006: /* SIX-PER-EM SPACE */
3853: case 0x2007: /* FIGURE SPACE */
3854: case 0x2008: /* PUNCTUATION SPACE */
3855: case 0x2009: /* THIN SPACE */
3856: case 0x200A: /* HAIR SPACE */
3857: case 0x202f: /* NARROW NO-BREAK SPACE */
3858: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3859: case 0x3000: /* IDEOGRAPHIC SPACE */
3860: break;
3861: }
3862: }
3863: break;
3864:
3865: case OP_NOT_VSPACE:
3866: for (i = 1; i <= min; i++)
3867: {
3868: if (eptr >= md->end_subject)
3869: {
3870: SCHECK_PARTIAL();
3871: MRRETURN(MATCH_NOMATCH);
3872: }
3873: GETCHARINC(c, eptr);
3874: switch(c)
3875: {
3876: default: break;
3877: case 0x0a: /* LF */
3878: case 0x0b: /* VT */
3879: case 0x0c: /* FF */
3880: case 0x0d: /* CR */
3881: case 0x85: /* NEL */
3882: case 0x2028: /* LINE SEPARATOR */
3883: case 0x2029: /* PARAGRAPH SEPARATOR */
3884: MRRETURN(MATCH_NOMATCH);
3885: }
3886: }
3887: break;
3888:
3889: case OP_VSPACE:
3890: for (i = 1; i <= min; i++)
3891: {
3892: if (eptr >= md->end_subject)
3893: {
3894: SCHECK_PARTIAL();
3895: MRRETURN(MATCH_NOMATCH);
3896: }
3897: GETCHARINC(c, eptr);
3898: switch(c)
3899: {
3900: default: MRRETURN(MATCH_NOMATCH);
3901: case 0x0a: /* LF */
3902: case 0x0b: /* VT */
3903: case 0x0c: /* FF */
3904: case 0x0d: /* CR */
3905: case 0x85: /* NEL */
3906: case 0x2028: /* LINE SEPARATOR */
3907: case 0x2029: /* PARAGRAPH SEPARATOR */
3908: break;
3909: }
3910: }
3911: break;
3912:
3913: case OP_NOT_DIGIT:
3914: for (i = 1; i <= min; i++)
3915: {
3916: if (eptr >= md->end_subject)
3917: {
3918: SCHECK_PARTIAL();
3919: MRRETURN(MATCH_NOMATCH);
3920: }
3921: GETCHARINC(c, eptr);
3922: if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3923: MRRETURN(MATCH_NOMATCH);
3924: }
3925: break;
3926:
3927: case OP_DIGIT:
3928: for (i = 1; i <= min; i++)
3929: {
3930: if (eptr >= md->end_subject)
3931: {
3932: SCHECK_PARTIAL();
3933: MRRETURN(MATCH_NOMATCH);
3934: }
3935: if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3936: MRRETURN(MATCH_NOMATCH);
3937: /* No need to skip more bytes - we know it's a 1-byte character */
3938: }
3939: break;
3940:
3941: case OP_NOT_WHITESPACE:
3942: for (i = 1; i <= min; i++)
3943: {
3944: if (eptr >= md->end_subject)
3945: {
3946: SCHECK_PARTIAL();
3947: MRRETURN(MATCH_NOMATCH);
3948: }
3949: if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3950: MRRETURN(MATCH_NOMATCH);
3951: while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3952: }
3953: break;
3954:
3955: case OP_WHITESPACE:
3956: for (i = 1; i <= min; i++)
3957: {
3958: if (eptr >= md->end_subject)
3959: {
3960: SCHECK_PARTIAL();
3961: MRRETURN(MATCH_NOMATCH);
3962: }
3963: if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3964: MRRETURN(MATCH_NOMATCH);
3965: /* No need to skip more bytes - we know it's a 1-byte character */
3966: }
3967: break;
3968:
3969: case OP_NOT_WORDCHAR:
3970: for (i = 1; i <= min; i++)
3971: {
3972: if (eptr >= md->end_subject)
3973: {
3974: SCHECK_PARTIAL();
3975: MRRETURN(MATCH_NOMATCH);
3976: }
3977: if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
3978: MRRETURN(MATCH_NOMATCH);
3979: while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3980: }
3981: break;
3982:
3983: case OP_WORDCHAR:
3984: for (i = 1; i <= min; i++)
3985: {
3986: if (eptr >= md->end_subject)
3987: {
3988: SCHECK_PARTIAL();
3989: MRRETURN(MATCH_NOMATCH);
3990: }
3991: if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3992: MRRETURN(MATCH_NOMATCH);
3993: /* No need to skip more bytes - we know it's a 1-byte character */
3994: }
3995: break;
3996:
3997: default:
3998: RRETURN(PCRE_ERROR_INTERNAL);
3999: } /* End switch(ctype) */
4000:
4001: else
4002: #endif /* SUPPORT_UTF8 */
4003:
4004: /* Code for the non-UTF-8 case for minimum matching of operators other
4005: than OP_PROP and OP_NOTPROP. */
4006:
4007: switch(ctype)
4008: {
4009: case OP_ANY:
4010: for (i = 1; i <= min; i++)
4011: {
4012: if (eptr >= md->end_subject)
4013: {
4014: SCHECK_PARTIAL();
4015: MRRETURN(MATCH_NOMATCH);
4016: }
4017: if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4018: eptr++;
4019: }
4020: break;
4021:
4022: case OP_ALLANY:
4023: if (eptr > md->end_subject - min)
4024: {
4025: SCHECK_PARTIAL();
4026: MRRETURN(MATCH_NOMATCH);
4027: }
4028: eptr += min;
4029: break;
4030:
4031: case OP_ANYBYTE:
4032: if (eptr > md->end_subject - min)
4033: {
4034: SCHECK_PARTIAL();
4035: MRRETURN(MATCH_NOMATCH);
4036: }
4037: eptr += min;
4038: break;
4039:
4040: case OP_ANYNL:
4041: for (i = 1; i <= min; i++)
4042: {
4043: if (eptr >= md->end_subject)
4044: {
4045: SCHECK_PARTIAL();
4046: MRRETURN(MATCH_NOMATCH);
4047: }
4048: switch(*eptr++)
4049: {
4050: default: MRRETURN(MATCH_NOMATCH);
4051: case 0x000d:
4052: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4053: break;
4054: case 0x000a:
4055: break;
4056:
4057: case 0x000b:
4058: case 0x000c:
4059: case 0x0085:
4060: if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4061: break;
4062: }
4063: }
4064: break;
4065:
4066: case OP_NOT_HSPACE:
4067: for (i = 1; i <= min; i++)
4068: {
4069: if (eptr >= md->end_subject)
4070: {
4071: SCHECK_PARTIAL();
4072: MRRETURN(MATCH_NOMATCH);
4073: }
4074: switch(*eptr++)
4075: {
4076: default: break;
4077: case 0x09: /* HT */
4078: case 0x20: /* SPACE */
4079: case 0xa0: /* NBSP */
4080: MRRETURN(MATCH_NOMATCH);
4081: }
4082: }
4083: break;
4084:
4085: case OP_HSPACE:
4086: for (i = 1; i <= min; i++)
4087: {
4088: if (eptr >= md->end_subject)
4089: {
4090: SCHECK_PARTIAL();
4091: MRRETURN(MATCH_NOMATCH);
4092: }
4093: switch(*eptr++)
4094: {
4095: default: MRRETURN(MATCH_NOMATCH);
4096: case 0x09: /* HT */
4097: case 0x20: /* SPACE */
4098: case 0xa0: /* NBSP */
4099: break;
4100: }
4101: }
4102: break;
4103:
4104: case OP_NOT_VSPACE:
4105: for (i = 1; i <= min; i++)
4106: {
4107: if (eptr >= md->end_subject)
4108: {
4109: SCHECK_PARTIAL();
4110: MRRETURN(MATCH_NOMATCH);
4111: }
4112: switch(*eptr++)
4113: {
4114: default: break;
4115: case 0x0a: /* LF */
4116: case 0x0b: /* VT */
4117: case 0x0c: /* FF */
4118: case 0x0d: /* CR */
4119: case 0x85: /* NEL */
4120: MRRETURN(MATCH_NOMATCH);
4121: }
4122: }
4123: break;
4124:
4125: case OP_VSPACE:
4126: for (i = 1; i <= min; i++)
4127: {
4128: if (eptr >= md->end_subject)
4129: {
4130: SCHECK_PARTIAL();
4131: MRRETURN(MATCH_NOMATCH);
4132: }
4133: switch(*eptr++)
4134: {
4135: default: MRRETURN(MATCH_NOMATCH);
4136: case 0x0a: /* LF */
4137: case 0x0b: /* VT */
4138: case 0x0c: /* FF */
4139: case 0x0d: /* CR */
4140: case 0x85: /* NEL */
4141: break;
4142: }
4143: }
4144: break;
4145:
4146: case OP_NOT_DIGIT:
4147: for (i = 1; i <= min; i++)
4148: {
4149: if (eptr >= md->end_subject)
4150: {
4151: SCHECK_PARTIAL();
4152: MRRETURN(MATCH_NOMATCH);
4153: }
4154: if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4155: }
4156: break;
4157:
4158: case OP_DIGIT:
4159: for (i = 1; i <= min; i++)
4160: {
4161: if (eptr >= md->end_subject)
4162: {
4163: SCHECK_PARTIAL();
4164: MRRETURN(MATCH_NOMATCH);
4165: }
4166: if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4167: }
4168: break;
4169:
4170: case OP_NOT_WHITESPACE:
4171: for (i = 1; i <= min; i++)
4172: {
4173: if (eptr >= md->end_subject)
4174: {
4175: SCHECK_PARTIAL();
4176: MRRETURN(MATCH_NOMATCH);
4177: }
4178: if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4179: }
4180: break;
4181:
4182: case OP_WHITESPACE:
4183: for (i = 1; i <= min; i++)
4184: {
4185: if (eptr >= md->end_subject)
4186: {
4187: SCHECK_PARTIAL();
4188: MRRETURN(MATCH_NOMATCH);
4189: }
4190: if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4191: }
4192: break;
4193:
4194: case OP_NOT_WORDCHAR:
4195: for (i = 1; i <= min; i++)
4196: {
4197: if (eptr >= md->end_subject)
4198: {
4199: SCHECK_PARTIAL();
4200: MRRETURN(MATCH_NOMATCH);
4201: }
4202: if ((md->ctypes[*eptr++] & ctype_word) != 0)
4203: MRRETURN(MATCH_NOMATCH);
4204: }
4205: break;
4206:
4207: case OP_WORDCHAR:
4208: for (i = 1; i <= min; i++)
4209: {
4210: if (eptr >= md->end_subject)
4211: {
4212: SCHECK_PARTIAL();
4213: MRRETURN(MATCH_NOMATCH);
4214: }
4215: if ((md->ctypes[*eptr++] & ctype_word) == 0)
4216: MRRETURN(MATCH_NOMATCH);
4217: }
4218: break;
4219:
4220: default:
4221: RRETURN(PCRE_ERROR_INTERNAL);
4222: }
4223: }
4224:
4225: /* If min = max, continue at the same level without recursing */
4226:
4227: if (min == max) continue;
4228:
4229: /* If minimizing, we have to test the rest of the pattern before each
4230: subsequent match. Again, separate the UTF-8 case for speed, and also
4231: separate the UCP cases. */
4232:
4233: if (minimize)
4234: {
4235: #ifdef SUPPORT_UCP
4236: if (prop_type >= 0)
4237: {
4238: switch(prop_type)
4239: {
4240: case PT_ANY:
4241: for (fi = min;; fi++)
4242: {
4243: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
4244: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4245: if (fi >= max) MRRETURN(MATCH_NOMATCH);
4246: if (eptr >= md->end_subject)
4247: {
4248: SCHECK_PARTIAL();
4249: MRRETURN(MATCH_NOMATCH);
4250: }
4251: GETCHARINCTEST(c, eptr);
4252: if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4253: }
4254: /* Control never gets here */
4255:
4256: case PT_LAMP:
4257: for (fi = min;; fi++)
4258: {
4259: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
4260: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4261: if (fi >= max) MRRETURN(MATCH_NOMATCH);
4262: if (eptr >= md->end_subject)
4263: {
4264: SCHECK_PARTIAL();
4265: MRRETURN(MATCH_NOMATCH);
4266: }
4267: GETCHARINCTEST(c, eptr);
4268: prop_chartype = UCD_CHARTYPE(c);
4269: if ((prop_chartype == ucp_Lu ||
4270: prop_chartype == ucp_Ll ||
4271: prop_chartype == ucp_Lt) == prop_fail_result)
4272: MRRETURN(MATCH_NOMATCH);
4273: }
4274: /* Control never gets here */
4275:
4276: case PT_GC:
4277: for (fi = min;; fi++)
4278: {
4279: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
4280: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4281: if (fi >= max) MRRETURN(MATCH_NOMATCH);
4282: if (eptr >= md->end_subject)
4283: {
4284: SCHECK_PARTIAL();
4285: MRRETURN(MATCH_NOMATCH);
4286: }
4287: GETCHARINCTEST(c, eptr);
4288: prop_category = UCD_CATEGORY(c);
4289: if ((prop_category == prop_value) == prop_fail_result)
4290: MRRETURN(MATCH_NOMATCH);
4291: }
4292: /* Control never gets here */
4293:
4294: case PT_PC:
4295: for (fi = min;; fi++)
4296: {
4297: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
4298: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4299: if (fi >= max) MRRETURN(MATCH_NOMATCH);
4300: if (eptr >= md->end_subject)
4301: {
4302: SCHECK_PARTIAL();
4303: MRRETURN(MATCH_NOMATCH);
4304: }
4305: GETCHARINCTEST(c, eptr);
4306: prop_chartype = UCD_CHARTYPE(c);
4307: if ((prop_chartype == prop_value) == prop_fail_result)
4308: MRRETURN(MATCH_NOMATCH);
4309: }
4310: /* Control never gets here */
4311:
4312: case PT_SC:
4313: for (fi = min;; fi++)
4314: {
4315: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
4316: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4317: if (fi >= max) MRRETURN(MATCH_NOMATCH);
4318: if (eptr >= md->end_subject)
4319: {
4320: SCHECK_PARTIAL();
4321: MRRETURN(MATCH_NOMATCH);
4322: }
4323: GETCHARINCTEST(c, eptr);
4324: prop_script = UCD_SCRIPT(c);
4325: if ((prop_script == prop_value) == prop_fail_result)
4326: MRRETURN(MATCH_NOMATCH);
4327: }
4328: /* Control never gets here */
4329:
4330: case PT_ALNUM:
4331: for (fi = min;; fi++)
4332: {
4333: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM59);
4334: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4335: if (fi >= max) MRRETURN(MATCH_NOMATCH);
4336: if (eptr >= md->end_subject)
4337: {
4338: SCHECK_PARTIAL();
4339: MRRETURN(MATCH_NOMATCH);
4340: }
4341: GETCHARINCTEST(c, eptr);
4342: prop_category = UCD_CATEGORY(c);
4343: if ((prop_category == ucp_L || prop_category == ucp_N)
4344: == prop_fail_result)
4345: MRRETURN(MATCH_NOMATCH);
4346: }
4347: /* Control never gets here */
4348:
4349: case PT_SPACE: /* Perl space */
4350: for (fi = min;; fi++)
4351: {
4352: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM60);
4353: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4354: if (fi >= max) MRRETURN(MATCH_NOMATCH);
4355: if (eptr >= md->end_subject)
4356: {
4357: SCHECK_PARTIAL();
4358: MRRETURN(MATCH_NOMATCH);
4359: }
4360: GETCHARINCTEST(c, eptr);
4361: prop_category = UCD_CATEGORY(c);
4362: if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4363: c == CHAR_FF || c == CHAR_CR)
4364: == prop_fail_result)
4365: MRRETURN(MATCH_NOMATCH);
4366: }
4367: /* Control never gets here */
4368:
4369: case PT_PXSPACE: /* POSIX space */
4370: for (fi = min;; fi++)
4371: {
4372: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM61);
4373: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4374: if (fi >= max) MRRETURN(MATCH_NOMATCH);
4375: if (eptr >= md->end_subject)
4376: {
4377: SCHECK_PARTIAL();
4378: MRRETURN(MATCH_NOMATCH);
4379: }
4380: GETCHARINCTEST(c, eptr);
4381: prop_category = UCD_CATEGORY(c);
4382: if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4383: c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4384: == prop_fail_result)
4385: MRRETURN(MATCH_NOMATCH);
4386: }
4387: /* Control never gets here */
4388:
4389: case PT_WORD:
4390: for (fi = min;; fi++)
4391: {
4392: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM62);
4393: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4394: if (fi >= max) MRRETURN(MATCH_NOMATCH);
4395: if (eptr >= md->end_subject)
4396: {
4397: SCHECK_PARTIAL();
4398: MRRETURN(MATCH_NOMATCH);
4399: }
4400: GETCHARINCTEST(c, eptr);
4401: prop_category = UCD_CATEGORY(c);
4402: if ((prop_category == ucp_L ||
4403: prop_category == ucp_N ||
4404: c == CHAR_UNDERSCORE)
4405: == prop_fail_result)
4406: MRRETURN(MATCH_NOMATCH);
4407: }
4408: /* Control never gets here */
4409:
4410: /* This should never occur */
4411:
4412: default:
4413: RRETURN(PCRE_ERROR_INTERNAL);
4414: }
4415: }
4416:
4417: /* Match extended Unicode sequences. We will get here only if the
4418: support is in the binary; otherwise a compile-time error occurs. */
4419:
4420: else if (ctype == OP_EXTUNI)
4421: {
4422: for (fi = min;; fi++)
4423: {
4424: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
4425: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4426: if (fi >= max) MRRETURN(MATCH_NOMATCH);
4427: if (eptr >= md->end_subject)
4428: {
4429: SCHECK_PARTIAL();
4430: MRRETURN(MATCH_NOMATCH);
4431: }
4432: GETCHARINCTEST(c, eptr);
4433: prop_category = UCD_CATEGORY(c);
4434: if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
4435: while (eptr < md->end_subject)
4436: {
4437: int len = 1;
4438: if (!utf8) c = *eptr;
4439: else { GETCHARLEN(c, eptr, len); }
4440: prop_category = UCD_CATEGORY(c);
4441: if (prop_category != ucp_M) break;
4442: eptr += len;
4443: }
4444: }
4445: }
4446:
4447: else
4448: #endif /* SUPPORT_UCP */
4449:
4450: #ifdef SUPPORT_UTF8
4451: /* UTF-8 mode */
4452: if (utf8)
4453: {
4454: for (fi = min;; fi++)
4455: {
4456: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
4457: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4458: if (fi >= max) MRRETURN(MATCH_NOMATCH);
4459: if (eptr >= md->end_subject)
4460: {
4461: SCHECK_PARTIAL();
4462: MRRETURN(MATCH_NOMATCH);
4463: }
4464: if (ctype == OP_ANY && IS_NEWLINE(eptr))
4465: MRRETURN(MATCH_NOMATCH);
4466: GETCHARINC(c, eptr);
4467: switch(ctype)
4468: {
4469: case OP_ANY: /* This is the non-NL case */
4470: case OP_ALLANY:
4471: case OP_ANYBYTE:
4472: break;
4473:
4474: case OP_ANYNL:
4475: switch(c)
4476: {
4477: default: MRRETURN(MATCH_NOMATCH);
4478: case 0x000d:
4479: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4480: break;
4481: case 0x000a:
4482: break;
4483:
4484: case 0x000b:
4485: case 0x000c:
4486: case 0x0085:
4487: case 0x2028:
4488: case 0x2029:
4489: if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4490: break;
4491: }
4492: break;
4493:
4494: case OP_NOT_HSPACE:
4495: switch(c)
4496: {
4497: default: break;
4498: case 0x09: /* HT */
4499: case 0x20: /* SPACE */
4500: case 0xa0: /* NBSP */
4501: case 0x1680: /* OGHAM SPACE MARK */
4502: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4503: case 0x2000: /* EN QUAD */
4504: case 0x2001: /* EM QUAD */
4505: case 0x2002: /* EN SPACE */
4506: case 0x2003: /* EM SPACE */
4507: case 0x2004: /* THREE-PER-EM SPACE */
4508: case 0x2005: /* FOUR-PER-EM SPACE */
4509: case 0x2006: /* SIX-PER-EM SPACE */
4510: case 0x2007: /* FIGURE SPACE */
4511: case 0x2008: /* PUNCTUATION SPACE */
4512: case 0x2009: /* THIN SPACE */
4513: case 0x200A: /* HAIR SPACE */
4514: case 0x202f: /* NARROW NO-BREAK SPACE */
4515: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4516: case 0x3000: /* IDEOGRAPHIC SPACE */
4517: MRRETURN(MATCH_NOMATCH);
4518: }
4519: break;
4520:
4521: case OP_HSPACE:
4522: switch(c)
4523: {
4524: default: MRRETURN(MATCH_NOMATCH);
4525: case 0x09: /* HT */
4526: case 0x20: /* SPACE */
4527: case 0xa0: /* NBSP */
4528: case 0x1680: /* OGHAM SPACE MARK */
4529: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4530: case 0x2000: /* EN QUAD */
4531: case 0x2001: /* EM QUAD */
4532: case 0x2002: /* EN SPACE */
4533: case 0x2003: /* EM SPACE */
4534: case 0x2004: /* THREE-PER-EM SPACE */
4535: case 0x2005: /* FOUR-PER-EM SPACE */
4536: case 0x2006: /* SIX-PER-EM SPACE */
4537: case 0x2007: /* FIGURE SPACE */
4538: case 0x2008: /* PUNCTUATION SPACE */
4539: case 0x2009: /* THIN SPACE */
4540: case 0x200A: /* HAIR SPACE */
4541: case 0x202f: /* NARROW NO-BREAK SPACE */
4542: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4543: case 0x3000: /* IDEOGRAPHIC SPACE */
4544: break;
4545: }
4546: break;
4547:
4548: case OP_NOT_VSPACE:
4549: switch(c)
4550: {
4551: default: break;
4552: case 0x0a: /* LF */
4553: case 0x0b: /* VT */
4554: case 0x0c: /* FF */
4555: case 0x0d: /* CR */
4556: case 0x85: /* NEL */
4557: case 0x2028: /* LINE SEPARATOR */
4558: case 0x2029: /* PARAGRAPH SEPARATOR */
4559: MRRETURN(MATCH_NOMATCH);
4560: }
4561: break;
4562:
4563: case OP_VSPACE:
4564: switch(c)
4565: {
4566: default: MRRETURN(MATCH_NOMATCH);
4567: case 0x0a: /* LF */
4568: case 0x0b: /* VT */
4569: case 0x0c: /* FF */
4570: case 0x0d: /* CR */
4571: case 0x85: /* NEL */
4572: case 0x2028: /* LINE SEPARATOR */
4573: case 0x2029: /* PARAGRAPH SEPARATOR */
4574: break;
4575: }
4576: break;
4577:
4578: case OP_NOT_DIGIT:
4579: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4580: MRRETURN(MATCH_NOMATCH);
4581: break;
4582:
4583: case OP_DIGIT:
4584: if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4585: MRRETURN(MATCH_NOMATCH);
4586: break;
4587:
4588: case OP_NOT_WHITESPACE:
4589: if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4590: MRRETURN(MATCH_NOMATCH);
4591: break;
4592:
4593: case OP_WHITESPACE:
4594: if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4595: MRRETURN(MATCH_NOMATCH);
4596: break;
4597:
4598: case OP_NOT_WORDCHAR:
4599: if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4600: MRRETURN(MATCH_NOMATCH);
4601: break;
4602:
4603: case OP_WORDCHAR:
4604: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4605: MRRETURN(MATCH_NOMATCH);
4606: break;
4607:
4608: default:
4609: RRETURN(PCRE_ERROR_INTERNAL);
4610: }
4611: }
4612: }
4613: else
4614: #endif
4615: /* Not UTF-8 mode */
4616: {
4617: for (fi = min;; fi++)
4618: {
4619: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4620: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4621: if (fi >= max) MRRETURN(MATCH_NOMATCH);
4622: if (eptr >= md->end_subject)
4623: {
4624: SCHECK_PARTIAL();
4625: MRRETURN(MATCH_NOMATCH);
4626: }
4627: if (ctype == OP_ANY && IS_NEWLINE(eptr))
4628: MRRETURN(MATCH_NOMATCH);
4629: c = *eptr++;
4630: switch(ctype)
4631: {
4632: case OP_ANY: /* This is the non-NL case */
4633: case OP_ALLANY:
4634: case OP_ANYBYTE:
4635: break;
4636:
4637: case OP_ANYNL:
4638: switch(c)
4639: {
4640: default: MRRETURN(MATCH_NOMATCH);
4641: case 0x000d:
4642: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4643: break;
4644:
4645: case 0x000a:
4646: break;
4647:
4648: case 0x000b:
4649: case 0x000c:
4650: case 0x0085:
4651: if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4652: break;
4653: }
4654: break;
4655:
4656: case OP_NOT_HSPACE:
4657: switch(c)
4658: {
4659: default: break;
4660: case 0x09: /* HT */
4661: case 0x20: /* SPACE */
4662: case 0xa0: /* NBSP */
4663: MRRETURN(MATCH_NOMATCH);
4664: }
4665: break;
4666:
4667: case OP_HSPACE:
4668: switch(c)
4669: {
4670: default: MRRETURN(MATCH_NOMATCH);
4671: case 0x09: /* HT */
4672: case 0x20: /* SPACE */
4673: case 0xa0: /* NBSP */
4674: break;
4675: }
4676: break;
4677:
4678: case OP_NOT_VSPACE:
4679: switch(c)
4680: {
4681: default: break;
4682: case 0x0a: /* LF */
4683: case 0x0b: /* VT */
4684: case 0x0c: /* FF */
4685: case 0x0d: /* CR */
4686: case 0x85: /* NEL */
4687: MRRETURN(MATCH_NOMATCH);
4688: }
4689: break;
4690:
4691: case OP_VSPACE:
4692: switch(c)
4693: {
4694: default: MRRETURN(MATCH_NOMATCH);
4695: case 0x0a: /* LF */
4696: case 0x0b: /* VT */
4697: case 0x0c: /* FF */
4698: case 0x0d: /* CR */
4699: case 0x85: /* NEL */
4700: break;
4701: }
4702: break;
4703:
4704: case OP_NOT_DIGIT:
4705: if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4706: break;
4707:
4708: case OP_DIGIT:
4709: if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4710: break;
4711:
4712: case OP_NOT_WHITESPACE:
4713: if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4714: break;
4715:
4716: case OP_WHITESPACE:
4717: if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4718: break;
4719:
4720: case OP_NOT_WORDCHAR:
4721: if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4722: break;
4723:
4724: case OP_WORDCHAR:
4725: if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4726: break;
4727:
4728: default:
4729: RRETURN(PCRE_ERROR_INTERNAL);
4730: }
4731: }
4732: }
4733: /* Control never gets here */
4734: }
4735:
4736: /* If maximizing, it is worth using inline code for speed, doing the type
4737: test once at the start (i.e. keep it out of the loop). Again, keep the
4738: UTF-8 and UCP stuff separate. */
4739:
4740: else
4741: {
4742: pp = eptr; /* Remember where we started */
4743:
4744: #ifdef SUPPORT_UCP
4745: if (prop_type >= 0)
4746: {
4747: switch(prop_type)
4748: {
4749: case PT_ANY:
4750: for (i = min; i < max; i++)
4751: {
4752: int len = 1;
4753: if (eptr >= md->end_subject)
4754: {
4755: SCHECK_PARTIAL();
4756: break;
4757: }
4758: GETCHARLENTEST(c, eptr, len);
4759: if (prop_fail_result) break;
4760: eptr+= len;
4761: }
4762: break;
4763:
4764: case PT_LAMP:
4765: for (i = min; i < max; i++)
4766: {
4767: int len = 1;
4768: if (eptr >= md->end_subject)
4769: {
4770: SCHECK_PARTIAL();
4771: break;
4772: }
4773: GETCHARLENTEST(c, eptr, len);
4774: prop_chartype = UCD_CHARTYPE(c);
4775: if ((prop_chartype == ucp_Lu ||
4776: prop_chartype == ucp_Ll ||
4777: prop_chartype == ucp_Lt) == prop_fail_result)
4778: break;
4779: eptr+= len;
4780: }
4781: break;
4782:
4783: case PT_GC:
4784: for (i = min; i < max; i++)
4785: {
4786: int len = 1;
4787: if (eptr >= md->end_subject)
4788: {
4789: SCHECK_PARTIAL();
4790: break;
4791: }
4792: GETCHARLENTEST(c, eptr, len);
4793: prop_category = UCD_CATEGORY(c);
4794: if ((prop_category == prop_value) == prop_fail_result)
4795: break;
4796: eptr+= len;
4797: }
4798: break;
4799:
4800: case PT_PC:
4801: for (i = min; i < max; i++)
4802: {
4803: int len = 1;
4804: if (eptr >= md->end_subject)
4805: {
4806: SCHECK_PARTIAL();
4807: break;
4808: }
4809: GETCHARLENTEST(c, eptr, len);
4810: prop_chartype = UCD_CHARTYPE(c);
4811: if ((prop_chartype == prop_value) == prop_fail_result)
4812: break;
4813: eptr+= len;
4814: }
4815: break;
4816:
4817: case PT_SC:
4818: for (i = min; i < max; i++)
4819: {
4820: int len = 1;
4821: if (eptr >= md->end_subject)
4822: {
4823: SCHECK_PARTIAL();
4824: break;
4825: }
4826: GETCHARLENTEST(c, eptr, len);
4827: prop_script = UCD_SCRIPT(c);
4828: if ((prop_script == prop_value) == prop_fail_result)
4829: break;
4830: eptr+= len;
4831: }
4832: break;
4833:
4834: case PT_ALNUM:
4835: for (i = min; i < max; i++)
4836: {
4837: int len = 1;
4838: if (eptr >= md->end_subject)
4839: {
4840: SCHECK_PARTIAL();
4841: break;
4842: }
4843: GETCHARLENTEST(c, eptr, len);
4844: prop_category = UCD_CATEGORY(c);
4845: if ((prop_category == ucp_L || prop_category == ucp_N)
4846: == prop_fail_result)
4847: break;
4848: eptr+= len;
4849: }
4850: break;
4851:
4852: case PT_SPACE: /* Perl space */
4853: for (i = min; i < max; i++)
4854: {
4855: int len = 1;
4856: if (eptr >= md->end_subject)
4857: {
4858: SCHECK_PARTIAL();
4859: break;
4860: }
4861: GETCHARLENTEST(c, eptr, len);
4862: prop_category = UCD_CATEGORY(c);
4863: if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4864: c == CHAR_FF || c == CHAR_CR)
4865: == prop_fail_result)
4866: break;
4867: eptr+= len;
4868: }
4869: break;
4870:
4871: case PT_PXSPACE: /* POSIX space */
4872: for (i = min; i < max; i++)
4873: {
4874: int len = 1;
4875: if (eptr >= md->end_subject)
4876: {
4877: SCHECK_PARTIAL();
4878: break;
4879: }
4880: GETCHARLENTEST(c, eptr, len);
4881: prop_category = UCD_CATEGORY(c);
4882: if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4883: c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4884: == prop_fail_result)
4885: break;
4886: eptr+= len;
4887: }
4888: break;
4889:
4890: case PT_WORD:
4891: for (i = min; i < max; i++)
4892: {
4893: int len = 1;
4894: if (eptr >= md->end_subject)
4895: {
4896: SCHECK_PARTIAL();
4897: break;
4898: }
4899: GETCHARLENTEST(c, eptr, len);
4900: prop_category = UCD_CATEGORY(c);
4901: if ((prop_category == ucp_L || prop_category == ucp_N ||
4902: c == CHAR_UNDERSCORE) == prop_fail_result)
4903: break;
4904: eptr+= len;
4905: }
4906: break;
4907:
4908: default:
4909: RRETURN(PCRE_ERROR_INTERNAL);
4910: }
4911:
4912: /* eptr is now past the end of the maximum run */
4913:
4914: if (possessive) continue;
4915: for(;;)
4916: {
4917: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4918: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4919: if (eptr-- == pp) break; /* Stop if tried at original pos */
4920: if (utf8) BACKCHAR(eptr);
4921: }
4922: }
4923:
4924: /* Match extended Unicode sequences. We will get here only if the
4925: support is in the binary; otherwise a compile-time error occurs. */
4926:
4927: else if (ctype == OP_EXTUNI)
4928: {
4929: for (i = min; i < max; i++)
4930: {
4931: if (eptr >= md->end_subject)
4932: {
4933: SCHECK_PARTIAL();
4934: break;
4935: }
4936: GETCHARINCTEST(c, eptr);
4937: prop_category = UCD_CATEGORY(c);
4938: if (prop_category == ucp_M) break;
4939: while (eptr < md->end_subject)
4940: {
4941: int len = 1;
4942: if (!utf8) c = *eptr; else
4943: {
4944: GETCHARLEN(c, eptr, len);
4945: }
4946: prop_category = UCD_CATEGORY(c);
4947: if (prop_category != ucp_M) break;
4948: eptr += len;
4949: }
4950: }
4951:
4952: /* eptr is now past the end of the maximum run */
4953:
4954: if (possessive) continue;
4955:
4956: for(;;)
4957: {
4958: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4959: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4960: if (eptr-- == pp) break; /* Stop if tried at original pos */
4961: for (;;) /* Move back over one extended */
4962: {
4963: int len = 1;
4964: if (!utf8) c = *eptr; else
4965: {
4966: BACKCHAR(eptr);
4967: GETCHARLEN(c, eptr, len);
4968: }
4969: prop_category = UCD_CATEGORY(c);
4970: if (prop_category != ucp_M) break;
4971: eptr--;
4972: }
4973: }
4974: }
4975:
4976: else
4977: #endif /* SUPPORT_UCP */
4978:
4979: #ifdef SUPPORT_UTF8
4980: /* UTF-8 mode */
4981:
4982: if (utf8)
4983: {
4984: switch(ctype)
4985: {
4986: case OP_ANY:
4987: if (max < INT_MAX)
4988: {
4989: for (i = min; i < max; i++)
4990: {
4991: if (eptr >= md->end_subject)
4992: {
4993: SCHECK_PARTIAL();
4994: break;
4995: }
4996: if (IS_NEWLINE(eptr)) break;
4997: eptr++;
4998: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4999: }
5000: }
5001:
5002: /* Handle unlimited UTF-8 repeat */
5003:
5004: else
5005: {
5006: for (i = min; i < max; i++)
5007: {
5008: if (eptr >= md->end_subject)
5009: {
5010: SCHECK_PARTIAL();
5011: break;
5012: }
5013: if (IS_NEWLINE(eptr)) break;
5014: eptr++;
5015: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5016: }
5017: }
5018: break;
5019:
5020: case OP_ALLANY:
5021: if (max < INT_MAX)
5022: {
5023: for (i = min; i < max; i++)
5024: {
5025: if (eptr >= md->end_subject)
5026: {
5027: SCHECK_PARTIAL();
5028: break;
5029: }
5030: eptr++;
5031: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5032: }
5033: }
5034: else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5035: break;
5036:
5037: /* The byte case is the same as non-UTF8 */
5038:
5039: case OP_ANYBYTE:
5040: c = max - min;
5041: if (c > (unsigned int)(md->end_subject - eptr))
5042: {
5043: eptr = md->end_subject;
5044: SCHECK_PARTIAL();
5045: }
5046: else eptr += c;
5047: break;
5048:
5049: case OP_ANYNL:
5050: for (i = min; i < max; i++)
5051: {
5052: int len = 1;
5053: if (eptr >= md->end_subject)
5054: {
5055: SCHECK_PARTIAL();
5056: break;
5057: }
5058: GETCHARLEN(c, eptr, len);
5059: if (c == 0x000d)
5060: {
5061: if (++eptr >= md->end_subject) break;
5062: if (*eptr == 0x000a) eptr++;
5063: }
5064: else
5065: {
5066: if (c != 0x000a &&
5067: (md->bsr_anycrlf ||
5068: (c != 0x000b && c != 0x000c &&
5069: c != 0x0085 && c != 0x2028 && c != 0x2029)))
5070: break;
5071: eptr += len;
5072: }
5073: }
5074: break;
5075:
5076: case OP_NOT_HSPACE:
5077: case OP_HSPACE:
5078: for (i = min; i < max; i++)
5079: {
5080: BOOL gotspace;
5081: int len = 1;
5082: if (eptr >= md->end_subject)
5083: {
5084: SCHECK_PARTIAL();
5085: break;
5086: }
5087: GETCHARLEN(c, eptr, len);
5088: switch(c)
5089: {
5090: default: gotspace = FALSE; break;
5091: case 0x09: /* HT */
5092: case 0x20: /* SPACE */
5093: case 0xa0: /* NBSP */
5094: case 0x1680: /* OGHAM SPACE MARK */
5095: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5096: case 0x2000: /* EN QUAD */
5097: case 0x2001: /* EM QUAD */
5098: case 0x2002: /* EN SPACE */
5099: case 0x2003: /* EM SPACE */
5100: case 0x2004: /* THREE-PER-EM SPACE */
5101: case 0x2005: /* FOUR-PER-EM SPACE */
5102: case 0x2006: /* SIX-PER-EM SPACE */
5103: case 0x2007: /* FIGURE SPACE */
5104: case 0x2008: /* PUNCTUATION SPACE */
5105: case 0x2009: /* THIN SPACE */
5106: case 0x200A: /* HAIR SPACE */
5107: case 0x202f: /* NARROW NO-BREAK SPACE */
5108: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5109: case 0x3000: /* IDEOGRAPHIC SPACE */
5110: gotspace = TRUE;
5111: break;
5112: }
5113: if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5114: eptr += len;
5115: }
5116: break;
5117:
5118: case OP_NOT_VSPACE:
5119: case OP_VSPACE:
5120: for (i = min; i < max; i++)
5121: {
5122: BOOL gotspace;
5123: int len = 1;
5124: if (eptr >= md->end_subject)
5125: {
5126: SCHECK_PARTIAL();
5127: break;
5128: }
5129: GETCHARLEN(c, eptr, len);
5130: switch(c)
5131: {
5132: default: gotspace = FALSE; break;
5133: case 0x0a: /* LF */
5134: case 0x0b: /* VT */
5135: case 0x0c: /* FF */
5136: case 0x0d: /* CR */
5137: case 0x85: /* NEL */
5138: case 0x2028: /* LINE SEPARATOR */
5139: case 0x2029: /* PARAGRAPH SEPARATOR */
5140: gotspace = TRUE;
5141: break;
5142: }
5143: if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5144: eptr += len;
5145: }
5146: break;
5147:
5148: case OP_NOT_DIGIT:
5149: for (i = min; i < max; i++)
5150: {
5151: int len = 1;
5152: if (eptr >= md->end_subject)
5153: {
5154: SCHECK_PARTIAL();
5155: break;
5156: }
5157: GETCHARLEN(c, eptr, len);
5158: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5159: eptr+= len;
5160: }
5161: break;
5162:
5163: case OP_DIGIT:
5164: for (i = min; i < max; i++)
5165: {
5166: int len = 1;
5167: if (eptr >= md->end_subject)
5168: {
5169: SCHECK_PARTIAL();
5170: break;
5171: }
5172: GETCHARLEN(c, eptr, len);
5173: if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5174: eptr+= len;
5175: }
5176: break;
5177:
5178: case OP_NOT_WHITESPACE:
5179: for (i = min; i < max; i++)
5180: {
5181: int len = 1;
5182: if (eptr >= md->end_subject)
5183: {
5184: SCHECK_PARTIAL();
5185: break;
5186: }
5187: GETCHARLEN(c, eptr, len);
5188: if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5189: eptr+= len;
5190: }
5191: break;
5192:
5193: case OP_WHITESPACE:
5194: for (i = min; i < max; i++)
5195: {
5196: int len = 1;
5197: if (eptr >= md->end_subject)
5198: {
5199: SCHECK_PARTIAL();
5200: break;
5201: }
5202: GETCHARLEN(c, eptr, len);
5203: if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5204: eptr+= len;
5205: }
5206: break;
5207:
5208: case OP_NOT_WORDCHAR:
5209: for (i = min; i < max; i++)
5210: {
5211: int len = 1;
5212: if (eptr >= md->end_subject)
5213: {
5214: SCHECK_PARTIAL();
5215: break;
5216: }
5217: GETCHARLEN(c, eptr, len);
5218: if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5219: eptr+= len;
5220: }
5221: break;
5222:
5223: case OP_WORDCHAR:
5224: for (i = min; i < max; i++)
5225: {
5226: int len = 1;
5227: if (eptr >= md->end_subject)
5228: {
5229: SCHECK_PARTIAL();
5230: break;
5231: }
5232: GETCHARLEN(c, eptr, len);
5233: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5234: eptr+= len;
5235: }
5236: break;
5237:
5238: default:
5239: RRETURN(PCRE_ERROR_INTERNAL);
5240: }
5241:
5242: /* eptr is now past the end of the maximum run */
5243:
5244: if (possessive) continue;
5245: for(;;)
5246: {
5247: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
5248: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5249: if (eptr-- == pp) break; /* Stop if tried at original pos */
5250: BACKCHAR(eptr);
5251: }
5252: }
5253: else
5254: #endif /* SUPPORT_UTF8 */
5255:
5256: /* Not UTF-8 mode */
5257: {
5258: switch(ctype)
5259: {
5260: case OP_ANY:
5261: for (i = min; i < max; i++)
5262: {
5263: if (eptr >= md->end_subject)
5264: {
5265: SCHECK_PARTIAL();
5266: break;
5267: }
5268: if (IS_NEWLINE(eptr)) break;
5269: eptr++;
5270: }
5271: break;
5272:
5273: case OP_ALLANY:
5274: case OP_ANYBYTE:
5275: c = max - min;
5276: if (c > (unsigned int)(md->end_subject - eptr))
5277: {
5278: eptr = md->end_subject;
5279: SCHECK_PARTIAL();
5280: }
5281: else eptr += c;
5282: break;
5283:
5284: case OP_ANYNL:
5285: for (i = min; i < max; i++)
5286: {
5287: if (eptr >= md->end_subject)
5288: {
5289: SCHECK_PARTIAL();
5290: break;
5291: }
5292: c = *eptr;
5293: if (c == 0x000d)
5294: {
5295: if (++eptr >= md->end_subject) break;
5296: if (*eptr == 0x000a) eptr++;
5297: }
5298: else
5299: {
5300: if (c != 0x000a &&
5301: (md->bsr_anycrlf ||
5302: (c != 0x000b && c != 0x000c && c != 0x0085)))
5303: break;
5304: eptr++;
5305: }
5306: }
5307: break;
5308:
5309: case OP_NOT_HSPACE:
5310: for (i = min; i < max; i++)
5311: {
5312: if (eptr >= md->end_subject)
5313: {
5314: SCHECK_PARTIAL();
5315: break;
5316: }
5317: c = *eptr;
5318: if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5319: eptr++;
5320: }
5321: break;
5322:
5323: case OP_HSPACE:
5324: for (i = min; i < max; i++)
5325: {
5326: if (eptr >= md->end_subject)
5327: {
5328: SCHECK_PARTIAL();
5329: break;
5330: }
5331: c = *eptr;
5332: if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5333: eptr++;
5334: }
5335: break;
5336:
5337: case OP_NOT_VSPACE:
5338: for (i = min; i < max; i++)
5339: {
5340: if (eptr >= md->end_subject)
5341: {
5342: SCHECK_PARTIAL();
5343: break;
5344: }
5345: c = *eptr;
5346: if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5347: break;
5348: eptr++;
5349: }
5350: break;
5351:
5352: case OP_VSPACE:
5353: for (i = min; i < max; i++)
5354: {
5355: if (eptr >= md->end_subject)
5356: {
5357: SCHECK_PARTIAL();
5358: break;
5359: }
5360: c = *eptr;
5361: if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5362: break;
5363: eptr++;
5364: }
5365: break;
5366:
5367: case OP_NOT_DIGIT:
5368: for (i = min; i < max; i++)
5369: {
5370: if (eptr >= md->end_subject)
5371: {
5372: SCHECK_PARTIAL();
5373: break;
5374: }
5375: if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5376: eptr++;
5377: }
5378: break;
5379:
5380: case OP_DIGIT:
5381: for (i = min; i < max; i++)
5382: {
5383: if (eptr >= md->end_subject)
5384: {
5385: SCHECK_PARTIAL();
5386: break;
5387: }
5388: if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5389: eptr++;
5390: }
5391: break;
5392:
5393: case OP_NOT_WHITESPACE:
5394: for (i = min; i < max; i++)
5395: {
5396: if (eptr >= md->end_subject)
5397: {
5398: SCHECK_PARTIAL();
5399: break;
5400: }
5401: if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5402: eptr++;
5403: }
5404: break;
5405:
5406: case OP_WHITESPACE:
5407: for (i = min; i < max; i++)
5408: {
5409: if (eptr >= md->end_subject)
5410: {
5411: SCHECK_PARTIAL();
5412: break;
5413: }
5414: if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5415: eptr++;
5416: }
5417: break;
5418:
5419: case OP_NOT_WORDCHAR:
5420: for (i = min; i < max; i++)
5421: {
5422: if (eptr >= md->end_subject)
5423: {
5424: SCHECK_PARTIAL();
5425: break;
5426: }
5427: if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5428: eptr++;
5429: }
5430: break;
5431:
5432: case OP_WORDCHAR:
5433: for (i = min; i < max; i++)
5434: {
5435: if (eptr >= md->end_subject)
5436: {
5437: SCHECK_PARTIAL();
5438: break;
5439: }
5440: if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5441: eptr++;
5442: }
5443: break;
5444:
5445: default:
5446: RRETURN(PCRE_ERROR_INTERNAL);
5447: }
5448:
5449: /* eptr is now past the end of the maximum run */
5450:
5451: if (possessive) continue;
5452: while (eptr >= pp)
5453: {
5454: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
5455: eptr--;
5456: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5457: }
5458: }
5459:
5460: /* Get here if we can't make it match with any permitted repetitions */
5461:
5462: MRRETURN(MATCH_NOMATCH);
5463: }
5464: /* Control never gets here */
5465:
5466: /* There's been some horrible disaster. Arrival here can only mean there is
5467: something seriously wrong in the code above or the OP_xxx definitions. */
5468:
5469: default:
5470: DPRINTF(("Unknown opcode %d\n", *ecode));
5471: RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5472: }
5473:
5474: /* Do not stick any code in here without much thought; it is assumed
5475: that "continue" in the code above comes out to here to repeat the main
5476: loop. */
5477:
5478: } /* End of main loop */
5479: /* Control never reaches here */
5480:
5481:
5482: /* When compiling to use the heap rather than the stack for recursive calls to
5483: match(), the RRETURN() macro jumps here. The number that is saved in
5484: frame->Xwhere indicates which label we actually want to return to. */
5485:
5486: #ifdef NO_RECURSE
5487: #define LBL(val) case val: goto L_RM##val;
5488: HEAP_RETURN:
5489: switch (frame->Xwhere)
5490: {
5491: LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5492: LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5493: LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5494: LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5495: LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58)
5496: #ifdef SUPPORT_UTF8
5497: LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5498: LBL(32) LBL(34) LBL(42) LBL(46)
5499: #ifdef SUPPORT_UCP
5500: LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5501: LBL(59) LBL(60) LBL(61) LBL(62)
5502: #endif /* SUPPORT_UCP */
5503: #endif /* SUPPORT_UTF8 */
5504: default:
5505: DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5506: return PCRE_ERROR_INTERNAL;
5507: }
5508: #undef LBL
5509: #endif /* NO_RECURSE */
5510: }
5511:
5512:
5513: /***************************************************************************
5514: ****************************************************************************
5515: RECURSION IN THE match() FUNCTION - MACRO CLEAN-UP
5516: Undefine all the macros that were defined above to handle this, so that the
5517: same names can be used as ordinary identifiers in the code that follows. */
5518:
5519: #ifdef NO_RECURSE /* the build that uses the heap, not the stack, for match() recursion */
5520: #undef eptr /* eptr, ecode, offset_top, ims and eptrb are the names passed to RMATCH() above */
5521: #undef ecode
5522: #undef mstart
5523: #undef offset_top
5524: #undef ims /* pcre_exec() below declares a local variable called ims */
5525: #undef eptrb
5526: #undef flags /* pcre_exec() below declares a local variable called flags */
5527:
5528: #undef callpat /* NOTE(review): the groups below look like match()'s working variables - confirm against the #defines */
5529: #undef charptr
5530: #undef data
5531: #undef next
5532: #undef pp /* used above to remember where a maximizing repeat started */
5533: #undef prev
5534: #undef saved_eptr
5535:
5536: #undef new_recursive
5537:
5538: #undef cur_is_word
5539: #undef condition
5540: #undef prev_is_word
5541:
5542: #undef original_ims
5543:
5544: #undef ctype /* ctype, max and min drive the character-type repeat code above */
5545: #undef length /* pcre_exec() below has a parameter called length */
5546: #undef max
5547: #undef min
5548: #undef number
5549: #undef offset
5550: #undef op
5551: #undef save_capture_last
5552: #undef save_offset1
5553: #undef save_offset2
5554: #undef save_offset3
5555: #undef stacksave
5556:
5557: #undef newptrb
5558:
5559: #endif /* NO_RECURSE */
5560:
5561: /* These two are defined as macros in both cases, so they are undefined here, outside the NO_RECURSE conditional */
5562:
5563: #undef fc
5564: #undef fi /* fi is the loop counter of the minimizing repeat loops above */
5565:
5566: /***************************************************************************
5567: ***************************************************************************/
5568:
5569:
5570:
5571: /*************************************************
5572: * Execute a Regular Expression *
5573: *************************************************/
5574:
5575: /* This function applies a compiled re to a subject string and picks out
5576: portions of the string if it matches. Two elements in the vector are set for
5577: each substring: the offsets to the start and end of the substring.
5578:
5579: Arguments:
5580: argument_re points to the compiled expression
5581: extra_data points to extra data or is NULL
5582: subject points to the subject string
5583: length length of subject string (may contain binary zeros)
5584: start_offset where to start in the subject string
5585: options option bits
5586: offsets points to a vector of ints to be filled in with offsets
5587: offsetcount the number of elements in the vector
5588:
5589: Returns: > 0 => success; value is the number of elements filled in
5590: = 0 => success, but offsets is not big enough
5591: -1 => failed to match
5592: < -1 => some kind of unexpected problem
5593: */
5594:
5595: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5596: pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5597: PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5598: int offsetcount)
5599: {
5600: int rc, resetcount, ocount;
5601: int first_byte = -1;
5602: int req_byte = -1;
5603: int req_byte2 = -1;
5604: int newline;
5605: unsigned long int ims;
5606: BOOL using_temporary_offsets = FALSE;
5607: BOOL anchored;
5608: BOOL startline;
5609: BOOL firstline;
5610: BOOL first_byte_caseless = FALSE;
5611: BOOL req_byte_caseless = FALSE;
5612: BOOL utf8;
5613: match_data match_block;
5614: match_data *md = &match_block;
5615: const uschar *tables;
5616: const uschar *start_bits = NULL;
5617: USPTR start_match = (USPTR)subject + start_offset;
5618: USPTR end_subject;
5619: USPTR start_partial = NULL;
5620: USPTR req_byte_ptr = start_match - 1;
5621:
5622: pcre_study_data internal_study;
5623: const pcre_study_data *study;
5624:
5625: real_pcre internal_re;
5626: const real_pcre *external_re = (const real_pcre *)argument_re;
5627: const real_pcre *re = external_re;
5628:
5629: /* Plausibility checks */
5630:
5631: if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5632: if (re == NULL || subject == NULL ||
5633: (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5634: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5635: if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5636:
5637: /* This information is for finding all the numbers associated with a given
5638: name, for condition testing. */
5639:
5640: md->name_table = (uschar *)re + re->name_table_offset;
5641: md->name_count = re->name_count;
5642: md->name_entry_size = re->name_entry_size;
5643:
5644: /* Fish out the optional data from the extra_data structure, first setting
5645: the default values. */
5646:
5647: study = NULL;
5648: md->match_limit = MATCH_LIMIT;
5649: md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5650: md->callout_data = NULL;
5651:
5652: /* The table pointer is always in native byte order. */
5653:
5654: tables = external_re->tables;
5655:
5656: if (extra_data != NULL)
5657: {
5658: register unsigned int flags = extra_data->flags;
5659: if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5660: study = (const pcre_study_data *)extra_data->study_data;
5661: if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5662: md->match_limit = extra_data->match_limit;
5663: if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5664: md->match_limit_recursion = extra_data->match_limit_recursion;
5665: if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5666: md->callout_data = extra_data->callout_data;
5667: if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5668: }
5669:
5670: /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5671: is a feature that makes it possible to save compiled regex and re-use them
5672: in other programs later. */
5673:
5674: if (tables == NULL) tables = _pcre_default_tables;
5675:
5676: /* Check that the first field in the block is the magic number. If it is not,
5677: test for a regex that was compiled on a host of opposite endianness. If this is
5678: the case, flipped values are put in internal_re and internal_study if there was
5679: study data too. */
5680:
5681: if (re->magic_number != MAGIC_NUMBER)
5682: {
5683: re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5684: if (re == NULL) return PCRE_ERROR_BADMAGIC;
5685: if (study != NULL) study = &internal_study;
5686: }
5687:
5688: /* Set up other data */
5689:
5690: anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5691: startline = (re->flags & PCRE_STARTLINE) != 0;
5692: firstline = (re->options & PCRE_FIRSTLINE) != 0;
5693:
5694: /* The code starts after the real_pcre block and the capture name table. */
5695:
5696: md->start_code = (const uschar *)external_re + re->name_table_offset +
5697: re->name_count * re->name_entry_size;
5698:
5699: md->start_subject = (USPTR)subject;
5700: md->start_offset = start_offset;
5701: md->end_subject = md->start_subject + length;
5702: end_subject = md->end_subject;
5703:
5704: md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5705: utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5706: md->use_ucp = (re->options & PCRE_UCP) != 0;
5707: md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5708:
5709: md->notbol = (options & PCRE_NOTBOL) != 0;
5710: md->noteol = (options & PCRE_NOTEOL) != 0;
5711: md->notempty = (options & PCRE_NOTEMPTY) != 0;
5712: md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5713: md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5714: ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5715: md->hitend = FALSE;
5716: md->mark = NULL; /* In case never set */
5717:
5718: md->recursive = NULL; /* No recursion at top level */
5719:
5720: md->lcc = tables + lcc_offset;
5721: md->ctypes = tables + ctypes_offset;
5722:
5723: /* Handle different \R options. */
5724:
5725: switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5726: {
5727: case 0:
5728: if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5729: md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5730: else
5731: #ifdef BSR_ANYCRLF
5732: md->bsr_anycrlf = TRUE;
5733: #else
5734: md->bsr_anycrlf = FALSE;
5735: #endif
5736: break;
5737:
5738: case PCRE_BSR_ANYCRLF:
5739: md->bsr_anycrlf = TRUE;
5740: break;
5741:
5742: case PCRE_BSR_UNICODE:
5743: md->bsr_anycrlf = FALSE;
5744: break;
5745:
5746: default: return PCRE_ERROR_BADNEWLINE;
5747: }
5748:
5749: /* Handle different types of newline. The three bits give eight cases. If
5750: nothing is set at run time, whatever was used at compile time applies. */
5751:
5752: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5753: (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5754: {
5755: case 0: newline = NEWLINE; break; /* Compile-time default */
5756: case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5757: case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5758: case PCRE_NEWLINE_CR+
5759: PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5760: case PCRE_NEWLINE_ANY: newline = -1; break;
5761: case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5762: default: return PCRE_ERROR_BADNEWLINE;
5763: }
5764:
5765: if (newline == -2)
5766: {
5767: md->nltype = NLTYPE_ANYCRLF;
5768: }
5769: else if (newline < 0)
5770: {
5771: md->nltype = NLTYPE_ANY;
5772: }
5773: else
5774: {
5775: md->nltype = NLTYPE_FIXED;
5776: if (newline > 255)
5777: {
5778: md->nllen = 2;
5779: md->nl[0] = (newline >> 8) & 255;
5780: md->nl[1] = newline & 255;
5781: }
5782: else
5783: {
5784: md->nllen = 1;
5785: md->nl[0] = newline;
5786: }
5787: }
5788:
5789: /* Partial matching was originally supported only for a restricted set of
5790: regexes; from release 8.00 there are no restrictions, but the bits are still
5791: defined (though never set). So there's no harm in leaving this code. */
5792:
5793: if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5794: return PCRE_ERROR_BADPARTIAL;
5795:
5796: /* Check a UTF-8 string if required. Unfortunately there's no way of passing
5797: back the character offset. */
5798:
5799: #ifdef SUPPORT_UTF8
5800: if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5801: {
5802: int tb;
5803: if ((tb = _pcre_valid_utf8((USPTR)subject, length)) >= 0)
5804: return (tb == length && md->partial > 1)?
5805: PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
5806: if (start_offset > 0 && start_offset < length)
5807: {
5808: tb = ((USPTR)subject)[start_offset] & 0xc0;
5809: if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
5810: }
5811: }
5812: #endif
5813:
5814: /* The ims options can vary during the matching as a result of the presence
5815: of (?ims) items in the pattern. They are kept in a local variable so that
5816: restoring at the exit of a group is easy. */
5817:
5818: ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5819:
5820: /* If the expression has got more back references than the offsets supplied can
5821: hold, we get a temporary chunk of working store to use during the matching.
5822: Otherwise, we can use the vector supplied, rounding down its size to a multiple
5823: of 3. */
5824:
5825: ocount = offsetcount - (offsetcount % 3);
5826:
5827: if (re->top_backref > 0 && re->top_backref >= ocount/3)
5828: {
5829: ocount = re->top_backref * 3 + 3;
5830: md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5831: if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5832: using_temporary_offsets = TRUE;
5833: DPRINTF(("Got memory to hold back references\n"));
5834: }
5835: else md->offset_vector = offsets;
5836:
5837: md->offset_end = ocount;
5838: md->offset_max = (2*ocount)/3;
5839: md->offset_overflow = FALSE;
5840: md->capture_last = -1;
5841:
5842: /* Compute the minimum number of offsets that we need to reset each time. Doing
5843: this makes a huge difference to execution time when there aren't many brackets
5844: in the pattern. */
5845:
5846: resetcount = 2 + re->top_bracket * 2;
5847: if (resetcount > offsetcount) resetcount = ocount;
5848:
5849: /* Reset the working variable associated with each extraction. These should
5850: never be used unless previously set, but they get saved and restored, and so we
5851: initialize them to avoid reading uninitialized locations. */
5852:
5853: if (md->offset_vector != NULL)
5854: {
5855: register int *iptr = md->offset_vector + ocount;
5856: register int *iend = iptr - resetcount/2 + 1;
5857: while (--iptr >= iend) *iptr = -1;
5858: }
5859:
5860: /* Set up the first character to match, if available. The first_byte value is
5861: never set for an anchored regular expression, but the anchoring may be forced
5862: at run time, so we have to test for anchoring. The first char may be unset for
5863: an unanchored pattern, of course. If there's no first char and the pattern was
5864: studied, there may be a bitmap of possible first characters. */
5865:
5866: if (!anchored)
5867: {
5868: if ((re->flags & PCRE_FIRSTSET) != 0)
5869: {
5870: first_byte = re->first_byte & 255;
5871: if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5872: first_byte = md->lcc[first_byte];
5873: }
5874: else
5875: if (!startline && study != NULL &&
5876: (study->flags & PCRE_STUDY_MAPPED) != 0)
5877: start_bits = study->start_bits;
5878: }
5879:
5880: /* For anchored or unanchored matches, there may be a "last known required
5881: character" set. */
5882:
5883: if ((re->flags & PCRE_REQCHSET) != 0)
5884: {
5885: req_byte = re->req_byte & 255;
5886: req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5887: req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5888: }
5889:
5890:
5891: /* ==========================================================================*/
5892:
5893: /* Loop for handling unanchored repeated matching attempts; for anchored regexes
5894: the loop runs just once. */
5895:
5896: for(;;)
5897: {
5898: USPTR save_end_subject = end_subject;
5899: USPTR new_start_match;
5900:
5901: /* Reset the maximum number of extractions we might see. */
5902:
5903: if (md->offset_vector != NULL)
5904: {
5905: register int *iptr = md->offset_vector;
5906: register int *iend = iptr + resetcount;
5907: while (iptr < iend) *iptr++ = -1;
5908: }
5909:
5910: /* If firstline is TRUE, the start of the match is constrained to the first
5911: line of a multiline string. That is, the match must be before or at the first
5912: newline. Implement this by temporarily adjusting end_subject so that we stop
5913: scanning at a newline. If the match fails at the newline, later code breaks
5914: this loop. */
5915:
5916: if (firstline)
5917: {
5918: USPTR t = start_match;
5919: #ifdef SUPPORT_UTF8
5920: if (utf8)
5921: {
5922: while (t < md->end_subject && !IS_NEWLINE(t))
5923: {
5924: t++;
5925: while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5926: }
5927: }
5928: else
5929: #endif
5930: while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5931: end_subject = t;
5932: }
5933:
5934: /* There are some optimizations that avoid running the match if a known
5935: starting point is not found, or if a known later character is not present.
5936: However, there is an option that disables these, for testing and for ensuring
5937: that all callouts do actually occur. The option can be set in the regex by
5938: (*NO_START_OPT) or passed in match-time options. */
5939:
5940: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
5941: {
5942: /* Advance to a unique first byte if there is one. */
5943:
5944: if (first_byte >= 0)
5945: {
5946: if (first_byte_caseless)
5947: while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5948: start_match++;
5949: else
5950: while (start_match < end_subject && *start_match != first_byte)
5951: start_match++;
5952: }
5953:
5954: /* Or to just after a linebreak for a multiline match */
5955:
5956: else if (startline)
5957: {
5958: if (start_match > md->start_subject + start_offset)
5959: {
5960: #ifdef SUPPORT_UTF8
5961: if (utf8)
5962: {
5963: while (start_match < end_subject && !WAS_NEWLINE(start_match))
5964: {
5965: start_match++;
5966: while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5967: start_match++;
5968: }
5969: }
5970: else
5971: #endif
5972: while (start_match < end_subject && !WAS_NEWLINE(start_match))
5973: start_match++;
5974:
5975: /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5976: and we are now at a LF, advance the match position by one more character.
5977: */
5978:
5979: if (start_match[-1] == CHAR_CR &&
5980: (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5981: start_match < end_subject &&
5982: *start_match == CHAR_NL)
5983: start_match++;
5984: }
5985: }
5986:
5987: /* Or to a non-unique first byte after study */
5988:
5989: else if (start_bits != NULL)
5990: {
5991: while (start_match < end_subject)
5992: {
5993: register unsigned int c = *start_match;
5994: if ((start_bits[c/8] & (1 << (c&7))) == 0)
5995: {
5996: start_match++;
5997: #ifdef SUPPORT_UTF8
5998: if (utf8)
5999: while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6000: start_match++;
6001: #endif
6002: }
6003: else break;
6004: }
6005: }
6006: } /* Starting optimizations */
6007:
6008: /* Restore fudged end_subject */
6009:
6010: end_subject = save_end_subject;
6011:
6012: /* The following two optimizations are disabled for partial matching or if
6013: disabling is explicitly requested. */
6014:
6015: if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6016: {
6017: /* If the pattern was studied, a minimum subject length may be set. This is
6018: only a lower bound; there may be no string of exactly that length that
6019: matches the pattern. Although the value is, strictly, in characters, we treat
6020: it as bytes to avoid spending too much time in this optimization. */
6021:
6022: if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6023: (pcre_uint32)(end_subject - start_match) < study->minlength)
6024: {
6025: rc = MATCH_NOMATCH;
6026: break;
6027: }
6028:
6029: /* If req_byte is set, we know that that character must appear in the
6030: subject for the match to succeed. If the first character is set, req_byte
6031: must be later in the subject; otherwise the test starts at the match point.
6032: This optimization can save a huge amount of backtracking in patterns with
6033: nested unlimited repeats that aren't going to match. Writing separate code
6034: for cased/caseless versions makes it go faster, as does using an
6035: autoincrement and backing off on a match.
6036:
6037: HOWEVER: when the subject string is very, very long, searching to its end
6038: can take a long time, and give bad performance on quite ordinary patterns.
6039: This showed up when somebody was matching something like /^\d+C/ on a
6040: 32-megabyte string... so we don't do this when the string is sufficiently
6041: long. */
6042:
6043: if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6044: {
6045: register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6046:
6047: /* We don't need to repeat the search if we haven't yet reached the
6048: place we found it at last time. */
6049:
6050: if (p > req_byte_ptr)
6051: {
6052: if (req_byte_caseless)
6053: {
6054: while (p < end_subject)
6055: {
6056: register int pp = *p++;
6057: if (pp == req_byte || pp == req_byte2) { p--; break; }
6058: }
6059: }
6060: else
6061: {
6062: while (p < end_subject)
6063: {
6064: if (*p++ == req_byte) { p--; break; }
6065: }
6066: }
6067:
6068: /* If we can't find the required character, break the matching loop,
6069: forcing a match failure. */
6070:
6071: if (p >= end_subject)
6072: {
6073: rc = MATCH_NOMATCH;
6074: break;
6075: }
6076:
6077: /* If we have found the required character, save the point where we
6078: found it, so that we don't search again next time round the loop if
6079: the start hasn't passed this character yet. */
6080:
6081: req_byte_ptr = p;
6082: }
6083: }
6084: }
6085:
6086: #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6087: printf(">>>> Match against: ");
6088: pchars(start_match, end_subject - start_match, TRUE, md);
6089: printf("\n");
6090: #endif
6091:
6092: /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6093: first starting point for which a partial match was found. */
6094:
6095: md->start_match_ptr = start_match;
6096: md->start_used_ptr = start_match;
6097: md->match_call_count = 0;
6098: rc = match(start_match, md->start_code, start_match, NULL, 2, md, ims, NULL,
6099: 0, 0);
6100: if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6101:
6102: switch(rc)
6103: {
6104: /* SKIP passes back the next starting point explicitly, but if it is the
6105: same as the match we have just done, treat it as NOMATCH. */
6106:
6107: case MATCH_SKIP:
6108: if (md->start_match_ptr != start_match)
6109: {
6110: new_start_match = md->start_match_ptr;
6111: break;
6112: }
6113: /* Fall through */
6114:
6115: /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6116: the SKIP's arg was not found. We also treat this as NOMATCH. */
6117:
6118: case MATCH_SKIP_ARG:
6119: /* Fall through */
6120:
6121: /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6122: exactly like PRUNE. */
6123:
6124: case MATCH_NOMATCH:
6125: case MATCH_PRUNE:
6126: case MATCH_THEN:
6127: new_start_match = start_match + 1;
6128: #ifdef SUPPORT_UTF8
6129: if (utf8)
6130: while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6131: new_start_match++;
6132: #endif
6133: break;
6134:
6135: /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6136:
6137: case MATCH_COMMIT:
6138: rc = MATCH_NOMATCH;
6139: goto ENDLOOP;
6140:
6141: /* Any other return is either a match, or some kind of error. */
6142:
6143: default:
6144: goto ENDLOOP;
6145: }
6146:
6147: /* Control reaches here for the various types of "no match at this point"
6148: result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6149:
6150: rc = MATCH_NOMATCH;
6151:
6152: /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6153: newline in the subject (though it may continue over the newline). Therefore,
6154: if we have just failed to match, starting at a newline, do not continue. */
6155:
6156: if (firstline && IS_NEWLINE(start_match)) break;
6157:
6158: /* Advance to new matching position */
6159:
6160: start_match = new_start_match;
6161:
6162: /* Break the loop if the pattern is anchored or if we have passed the end of
6163: the subject. */
6164:
6165: if (anchored || start_match > end_subject) break;
6166:
6167: /* If we have just passed a CR and we are now at a LF, and the pattern does
6168: not contain any explicit matches for \r or \n, and the newline option is CRLF
6169: or ANY or ANYCRLF, advance the match position by one more character. */
6170:
6171: if (start_match[-1] == CHAR_CR &&
6172: start_match < end_subject &&
6173: *start_match == CHAR_NL &&
6174: (re->flags & PCRE_HASCRORLF) == 0 &&
6175: (md->nltype == NLTYPE_ANY ||
6176: md->nltype == NLTYPE_ANYCRLF ||
6177: md->nllen == 2))
6178: start_match++;
6179:
6180: md->mark = NULL; /* Reset for start of next match attempt */
6181: } /* End of for(;;) "bumpalong" loop */
6182:
6183: /* ==========================================================================*/
6184:
6185: /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6186: conditions is true:
6187:
6188: (1) The pattern is anchored or the match was failed by (*COMMIT);
6189:
6190: (2) We are past the end of the subject;
6191:
6192: (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6193: this option requests that a match occur at or before the first newline in
6194: the subject.
6195:
6196: When we have a match and the offset vector is big enough to deal with any
6197: backreferences, captured substring offsets will already be set up. In the case
6198: where we had to get some local store to hold offsets for backreference
6199: processing, copy those that we can. In this case there need not be overflow if
6200: certain parts of the pattern were not used, even though there are more
6201: capturing parentheses than vector slots. */
6202:
6203: ENDLOOP:
6204:
6205: if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6206: {
6207: if (using_temporary_offsets)
6208: {
6209: if (offsetcount >= 4)
6210: {
6211: memcpy(offsets + 2, md->offset_vector + 2,
6212: (offsetcount - 2) * sizeof(int));
6213: DPRINTF(("Copied offsets from temporary memory\n"));
6214: }
6215: if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6216: DPRINTF(("Freeing temporary memory\n"));
6217: (pcre_free)(md->offset_vector);
6218: }
6219:
6220: /* Set the return code to the number of captured strings, or 0 if there are
6221: too many to fit into the vector. */
6222:
6223: rc = md->offset_overflow? 0 : md->end_offset_top/2;
6224:
6225: /* If there is space, set up the whole thing as substring 0. The value of
6226: md->start_match_ptr might be modified if \K was encountered on the successful
6227: matching path. */
6228:
6229: if (offsetcount < 2) rc = 0; else
6230: {
6231: offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6232: offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6233: }
6234:
6235: DPRINTF((">>>> returning %d\n", rc));
6236: goto RETURN_MARK;
6237: }
6238:
6239: /* Control gets here if there has been an error, or if the overall match
6240: attempt has failed at all permitted starting positions. */
6241:
6242: if (using_temporary_offsets)
6243: {
6244: DPRINTF(("Freeing temporary memory\n"));
6245: (pcre_free)(md->offset_vector);
6246: }
6247:
6248: /* For anything other than nomatch or partial match, just return the code. */
6249:
6250: if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6251: {
6252: DPRINTF((">>>> error: returning %d\n", rc));
6253: return rc;
6254: }
6255:
6256: /* Handle partial matches - disable any mark data */
6257:
6258: if (start_partial != NULL)
6259: {
6260: DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6261: md->mark = NULL;
6262: if (offsetcount > 1)
6263: {
6264: offsets[0] = (int)(start_partial - (USPTR)subject);
6265: offsets[1] = (int)(end_subject - (USPTR)subject);
6266: }
6267: rc = PCRE_ERROR_PARTIAL;
6268: }
6269:
6270: /* This is the classic nomatch case */
6271:
6272: else
6273: {
6274: DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6275: rc = PCRE_ERROR_NOMATCH;
6276: }
6277:
6278: /* Return the MARK data if it has been requested. */
6279:
6280: RETURN_MARK:
6281:
6282: if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6283: *(extra_data->mark) = (unsigned char *)(md->mark);
6284: return rc;
6285: }
6286:
6287: /* End of pcre_exec.c */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>