Annotation of embedaddon/pcre/pcre_exec.c, revision 1.1.1.3
1.1 misho 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /* PCRE is a library of functions to support regular expressions whose syntax
6: and semantics are as close as possible to those of the Perl 5 language.
7:
8: Written by Philip Hazel
1.1.1.2 misho 9: Copyright (c) 1997-2012 University of Cambridge
1.1 misho 10:
11: -----------------------------------------------------------------------------
12: Redistribution and use in source and binary forms, with or without
13: modification, are permitted provided that the following conditions are met:
14:
15: * Redistributions of source code must retain the above copyright notice,
16: this list of conditions and the following disclaimer.
17:
18: * Redistributions in binary form must reproduce the above copyright
19: notice, this list of conditions and the following disclaimer in the
20: documentation and/or other materials provided with the distribution.
21:
22: * Neither the name of the University of Cambridge nor the names of its
23: contributors may be used to endorse or promote products derived from
24: this software without specific prior written permission.
25:
26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36: POSSIBILITY OF SUCH DAMAGE.
37: -----------------------------------------------------------------------------
38: */
39:
40: /* This module contains pcre_exec(), the externally visible function that does
41: pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42: possible. There are also some static supporting functions. */
43:
44: #ifdef HAVE_CONFIG_H
45: #include "config.h"
46: #endif
47:
/* Names consumed by macros in pcre_internal.h (included just below): the
variable holding the match data block and the fields that bound the subject. */

#define NLBLOCK md             /* Block containing newline information */
#define PSSTART start_subject  /* Field containing processed string start */
#define PSEND   end_subject    /* Field containing processed string end */
51:
52: #include "pcre_internal.h"
53:
54: /* Undefine some potentially clashing cpp symbols */
55:
/* match() below uses "min" and "max" as ordinary variable names, so any
function-like macros of those names inherited from system headers must go. */

#undef min
#undef max
58:
59: /* Values for setting in md->match_function_type to indicate two special types
60: of call to match(). We do it this way to save on using another stack variable,
61: as stack usage is to be discouraged. */
62:
#define MATCH_CONDASSERT     1  /* Called to check a condition assertion */
#define MATCH_CBEGROUP       2  /* Could-be-empty unlimited repeat group */

/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Special internal returns from the match() function. Make them sufficiently
negative to avoid the external error codes. */

#define MATCH_ACCEPT       (-999)
#define MATCH_COMMIT       (-998)
#define MATCH_KETRPOS      (-997)
#define MATCH_ONCE         (-996)
#define MATCH_PRUNE        (-995)
#define MATCH_SKIP         (-994)
#define MATCH_SKIP_ARG     (-993)
#define MATCH_THEN         (-992)
83:
84: /* Maximum number of ints of offset to save on the stack for recursive calls.
85: If the offset vector is bigger, malloc is used. This should be a multiple of 3,
86: because the offset vector is always a multiple of 3 long. */
87:
#define REC_STACK_SAVE_MAX 30  /* Size of the stacksave[] array; keep a multiple of 3 */
89:
90: /* Min and max values for the common repeats; for the maxima, 0 => infinity */
91:
/* Six-entry lookup tables of repeat bounds. NOTE(review): the entries look
like they correspond to the quantifiers *, *?, +, +?, ?, ?? in that order -
confirm against the code that indexes them. A maximum of 0 means unlimited. */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
94:
95:
96:
97: #ifdef PCRE_DEBUG
98: /*************************************************
99: * Debugging function to print chars *
100: *************************************************/
101:
102: /* Print a sequence of chars in printable format, stopping at the end of the
102: subject if requested.
104:
105: Arguments:
106: p points to characters
107: length number to print
108: is_subject TRUE if printing from within md->start_subject
109: md pointer to matching data block, if is_subject is TRUE
110:
111: Returns: nothing
112: */
113:
114: static void
1.1.1.2 misho 115: pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
1.1 misho 116: {
117: unsigned int c;
118: if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
119: while (length-- > 0)
120: if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
121: }
122: #endif
123:
124:
125:
126: /*************************************************
127: * Match a back-reference *
128: *************************************************/
129:
130: /* Normally, if a back reference hasn't been set, the length that is passed is
131: negative, so the match always fails. However, in JavaScript compatibility mode,
132: the length passed is zero. Note that in caseless UTF-8 mode, the number of
133: subject bytes matched may be different to the number of reference bytes.
134:
135: Arguments:
136: offset index into the offset vector
137: eptr pointer into the subject
138: length length of reference to be matched (number of bytes)
139: md points to match data block
140: caseless TRUE if caseless
141:
1.1.1.3 ! misho 142: Returns: >= 0 the number of subject bytes matched
! 143: -1 no match
! 144: -2 partial match; always given if at end subject
1.1 misho 145: */
146:
147: static int
1.1.1.2 misho 148: match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
1.1 misho 149: BOOL caseless)
150: {
1.1.1.2 misho 151: PCRE_PUCHAR eptr_start = eptr;
152: register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
1.1 misho 153:
154: #ifdef PCRE_DEBUG
155: if (eptr >= md->end_subject)
156: printf("matching subject <null>");
157: else
158: {
159: printf("matching subject ");
160: pchars(eptr, length, TRUE, md);
161: }
162: printf(" against backref ");
163: pchars(p, length, FALSE, md);
164: printf("\n");
165: #endif
166:
1.1.1.3 ! misho 167: /* Always fail if reference not set (and not JavaScript compatible - in that
! 168: case the length is passed as zero). */
1.1 misho 169:
170: if (length < 0) return -1;
171:
172: /* Separate the caseless case for speed. In UTF-8 mode we can only do this
173: properly if Unicode properties are supported. Otherwise, we can check only
174: ASCII characters. */
175:
176: if (caseless)
177: {
1.1.1.2 misho 178: #ifdef SUPPORT_UTF
1.1 misho 179: #ifdef SUPPORT_UCP
1.1.1.2 misho 180: if (md->utf)
1.1 misho 181: {
182: /* Match characters up to the end of the reference. NOTE: the number of
183: bytes matched may differ, because there are some characters whose upper and
184: lower case versions code as different numbers of bytes. For example, U+023A
185: (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
186: a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
187: the latter. It is important, therefore, to check the length along the
188: reference, not along the subject (earlier code did this wrong). */
189:
1.1.1.2 misho 190: PCRE_PUCHAR endptr = p + length;
1.1 misho 191: while (p < endptr)
192: {
193: int c, d;
1.1.1.3 ! misho 194: if (eptr >= md->end_subject) return -2; /* Partial match */
1.1 misho 195: GETCHARINC(c, eptr);
196: GETCHARINC(d, p);
197: if (c != d && c != UCD_OTHERCASE(d)) return -1;
198: }
199: }
200: else
201: #endif
202: #endif
203:
204: /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
205: is no UCP support. */
206: {
207: while (length-- > 0)
1.1.1.2 misho 208: {
1.1.1.3 ! misho 209: if (eptr >= md->end_subject) return -2; /* Partial match */
1.1.1.2 misho 210: if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
211: p++;
212: eptr++;
213: }
1.1 misho 214: }
215: }
216:
217: /* In the caseful case, we can just compare the bytes, whether or not we
218: are in UTF-8 mode. */
219:
220: else
221: {
1.1.1.3 ! misho 222: while (length-- > 0)
! 223: {
! 224: if (eptr >= md->end_subject) return -2; /* Partial match */
! 225: if (*p++ != *eptr++) return -1;
! 226: }
1.1 misho 227: }
228:
229: return (int)(eptr - eptr_start);
230: }
231:
232:
233:
234: /***************************************************************************
235: ****************************************************************************
236: RECURSION IN THE match() FUNCTION
237:
238: The match() function is highly recursive, though not every recursive call
239: increases the recursive depth. Nevertheless, some regular expressions can cause
240: it to recurse to a great depth. I was writing for Unix, so I just let it call
241: itself recursively. This uses the stack for saving everything that has to be
242: saved for a recursive call. On Unix, the stack can be large, and this works
243: fine.
244:
245: It turns out that on some non-Unix-like systems there are problems with
246: programs that use a lot of stack. (This despite the fact that every last chip
247: has oodles of memory these days, and techniques for extending the stack have
248: been known for decades.) So....
249:
250: There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
251: calls by keeping local variables that need to be preserved in blocks of memory
251: obtained from malloc() instead of on the stack. Macros are used to
253: achieve this so that the actual code doesn't look very different to what it
254: always used to.
255:
256: The original heap-recursive code used longjmp(). However, it seems that this
257: can be very slow on some operating systems. Following a suggestion from Stan
258: Switzer, the use of longjmp() has been abolished, at the cost of having to
259: provide a unique number for each call to RMATCH. There is no way of generating
260: a sequence of numbers at compile time in C. I have given them names, to make
261: them stand out more clearly.
262:
263: Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
264: FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
265: tests. Furthermore, not using longjmp() means that local dynamic variables
266: don't have indeterminate values; this has meant that the frame size can be
267: reduced because the result can be "passed back" by straight setting of the
268: variable instead of being passed in the frame.
269: ****************************************************************************
270: ***************************************************************************/
271:
272: /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
273: below must be updated in sync. */
274:
/* One distinct tag per RMATCH call site, numbered consecutively from 1. In
the NO_RECURSE build the tag is stored in the frame and pasted into the name
of the label (L_RMnn) at which that call site resumes. */

enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
       RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
       RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
       RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
       RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
       RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
       RM61,  RM62, RM63, RM64, RM65, RM66 };
282:
283: /* These versions of the macros use the stack, as normal. There are debugging
284: versions and production versions. Note that the "rw" argument of RMATCH isn't
285: actually used in this definition. */
286:
#ifndef NO_RECURSE
#define REGISTER register

/* Stack-recursion build: RMATCH is a real call to match() that leaves its
result in rrc, and RRETURN is a plain return. The debugging variants also
trace the source line of each call and return. */

#ifdef PCRE_DEBUG
#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
#else
#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
#define RRETURN(ra) return ra
#endif

#else


/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes. */

#define REGISTER

/* "Call": take the frame already linked after the current one, or allocate
and link a new one when there is none. Record in the current frame where to
resume (rw), fill in the new frame's arguments, make it current, and jump to
HEAP_RECURSE. The label L_<rw> is where control comes back for this call site.
The argument expressions are evaluated before "frame" is switched, so they
still refer to the calling frame. */

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *newframe = frame->Xnextframe;\
  if (newframe == NULL)\
    {\
    newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
    if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
    newframe->Xnextframe = NULL;\
    frame->Xnextframe = newframe;\
    }\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xeptrb = re;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* "Return": step back to the previous frame. If there is one, hand the result
over in rrc and jump to HEAP_RETURN; otherwise this was the outermost frame and
the function really returns. The frame being left is not released here; it
stays reachable through its predecessor's Xnextframe link. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }


/* Structure for remembering the local variables in a private frame */

typedef struct heapframe {
  struct heapframe *Xprevframe;
  struct heapframe *Xnextframe;

  /* Function arguments that may change */

  PCRE_PUCHAR Xeptr;
  const pcre_uchar *Xecode;
  PCRE_PUCHAR Xmstart;
  int Xoffset_top;
  eptrblock *Xeptrb;
  unsigned int Xrdepth;

  /* Function local variables */

  PCRE_PUCHAR Xcallpat;
#ifdef SUPPORT_UTF
  PCRE_PUCHAR Xcharptr;
#endif
  PCRE_PUCHAR Xdata;
  PCRE_PUCHAR Xnext;
  PCRE_PUCHAR Xpp;
  PCRE_PUCHAR Xprev;
  PCRE_PUCHAR Xsaved_eptr;

  recursion_info Xnew_recursive;

  BOOL Xcur_is_word;
  BOOL Xcondition;
  BOOL Xprev_is_word;

#ifdef SUPPORT_UCP
  int Xprop_type;
  int Xprop_value;
  int Xprop_fail_result;
  int Xoclength;
  pcre_uchar Xocchars[6];
#endif

  int Xcodelink;
  int Xctype;
  unsigned int Xfc;
  int Xfi;
  int Xlength;
  int Xmax;
  int Xmin;
  int Xnumber;
  int Xoffset;
  int Xop;
  int Xsave_capture_last;
  int Xsave_offset1, Xsave_offset2, Xsave_offset3;
  int Xstacksave[REC_STACK_SAVE_MAX];

  eptrblock Xnewptrb;

  /* Where to jump back to */

  int Xwhere;

} heapframe;

#endif
419:
420:
421: /***************************************************************************
422: ***************************************************************************/
423:
424:
425:
426: /*************************************************
427: * Match from current position *
428: *************************************************/
429:
430: /* This function is called recursively in many circumstances. Whenever it
431: returns a negative (error) response, the outer incarnation must also return the
432: same response. */
433:
434: /* These macros pack up tests that are used for partial matching, and which
435: appear several times in the code. We set the "hit end" flag if the pointer is
436: at the end of the subject and also past the start of the subject (i.e.
437: something has been matched). For hard partial matching, we then return
438: immediately. The second one is used when we already know we are past the end of
439: the subject. */
440:
/* CHECK_PARTIAL tests for being at (or past) the end of the subject itself;
SCHECK_PARTIAL omits that test for callers that have already established it.
Both set md->hitend only when partial matching is enabled and eptr is beyond
md->start_used_ptr, and both leave match() via RRETURN when md->partial is
greater than 1. Each expands to a bare "if" statement, so use them only as
complete statements, never as the unbraced body of an if that has an else. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr >= md->end_subject && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }
455:
456:
457: /* Performance note: It might be tempting to extract commonly used fields from
1.1.1.2 misho 458: the md structure (e.g. utf, end_subject) into individual variables to improve
1.1 misho 459: performance. Tests using gcc on a SPARC disproved this; in the first case, it
460: made performance worse.
461:
462: Arguments:
463: eptr pointer to current character in subject
464: ecode pointer to current position in compiled code
465: mstart pointer to the current match start position (can be modified
466: by encountering \K)
467: offset_top current top pointer
468: md pointer to "static" info for the match
469: eptrb pointer to chain of blocks containing eptr at start of
470: brackets - for testing for empty matches
471: rdepth the recursion depth
472:
473: Returns: MATCH_MATCH if matched ) these values are >= 0
474: MATCH_NOMATCH if failed to match )
475: a negative MATCH_xxx value for PRUNE, SKIP, etc
476: a negative PCRE_ERROR_xxx value if aborted by an error condition
477: (e.g. stopped by repeated call or recursion limit)
478: */
479:
480: static int
1.1.1.2 misho 481: match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
482: PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
483: unsigned int rdepth)
1.1 misho 484: {
485: /* These variables do not need to be preserved over recursion in this function,
486: so they can be ordinary variables in all cases. Mark some of them with
487: "register" because they are used a lot in loops. */
488:
489: register int rrc; /* Returns from recursive calls */
490: register int i; /* Used for loops not involving calls to RMATCH() */
491: register unsigned int c; /* Character values not kept over RMATCH() calls */
1.1.1.2 misho 492: register BOOL utf; /* Local copy of UTF flag for speed */
1.1 misho 493:
494: BOOL minimize, possessive; /* Quantifier options */
495: BOOL caseless;
496: int condcode;
497:
498: /* When recursion is not being used, all "local" variables that have to be
1.1.1.2 misho 499: preserved over calls to RMATCH() are part of a "frame". We set up the top-level
500: frame on the stack here; subsequent instantiations are obtained from the heap
501: whenever RMATCH() does a "recursion". See the macro definitions above. Putting
502: the top-level on the stack rather than malloc-ing them all gives a performance
503: boost in many cases where there is not much "recursion". */
1.1 misho 504:
505: #ifdef NO_RECURSE
1.1.1.3 ! misho 506: heapframe *frame = (heapframe *)md->match_frames_base;
1.1 misho 507:
508: /* Copy in the original argument variables */
509:
510: frame->Xeptr = eptr;
511: frame->Xecode = ecode;
512: frame->Xmstart = mstart;
513: frame->Xoffset_top = offset_top;
514: frame->Xeptrb = eptrb;
515: frame->Xrdepth = rdepth;
516:
517: /* This is where control jumps back to to effect "recursion" */
518:
519: HEAP_RECURSE:
520:
521: /* Macros make the argument variables come from the current frame */
522:
523: #define eptr frame->Xeptr
524: #define ecode frame->Xecode
525: #define mstart frame->Xmstart
526: #define offset_top frame->Xoffset_top
527: #define eptrb frame->Xeptrb
528: #define rdepth frame->Xrdepth
529:
530: /* Ditto for the local variables */
531:
1.1.1.2 misho 532: #ifdef SUPPORT_UTF
1.1 misho 533: #define charptr frame->Xcharptr
534: #endif
535: #define callpat frame->Xcallpat
536: #define codelink frame->Xcodelink
537: #define data frame->Xdata
538: #define next frame->Xnext
539: #define pp frame->Xpp
540: #define prev frame->Xprev
541: #define saved_eptr frame->Xsaved_eptr
542:
543: #define new_recursive frame->Xnew_recursive
544:
545: #define cur_is_word frame->Xcur_is_word
546: #define condition frame->Xcondition
547: #define prev_is_word frame->Xprev_is_word
548:
549: #ifdef SUPPORT_UCP
550: #define prop_type frame->Xprop_type
551: #define prop_value frame->Xprop_value
552: #define prop_fail_result frame->Xprop_fail_result
553: #define oclength frame->Xoclength
554: #define occhars frame->Xocchars
555: #endif
556:
557: #define ctype frame->Xctype
558: #define fc frame->Xfc
559: #define fi frame->Xfi
560: #define length frame->Xlength
561: #define max frame->Xmax
562: #define min frame->Xmin
563: #define number frame->Xnumber
564: #define offset frame->Xoffset
565: #define op frame->Xop
566: #define save_capture_last frame->Xsave_capture_last
567: #define save_offset1 frame->Xsave_offset1
568: #define save_offset2 frame->Xsave_offset2
569: #define save_offset3 frame->Xsave_offset3
570: #define stacksave frame->Xstacksave
571:
572: #define newptrb frame->Xnewptrb
573:
574: /* When recursion is being used, local variables are allocated on the stack and
575: get preserved during recursion in the normal way. In this environment, fi and
576: i, and fc and c, can be the same variables. */
577:
578: #else /* NO_RECURSE not defined */
579: #define fi i
580: #define fc c
581:
582: /* Many of the following variables are used only in small blocks of the code.
583: My normal style of coding would have declared them within each of those blocks.
584: However, in order to accommodate the version of this code that uses an external
585: "stack" implemented on the heap, it is easier to declare them all here, so the
586: declarations can be cut out in a block. The only declarations within blocks
587: below are for variables that do not have to be preserved over a recursive call
588: to RMATCH(). */
589:
1.1.1.2 misho 590: #ifdef SUPPORT_UTF
591: const pcre_uchar *charptr;
1.1 misho 592: #endif
1.1.1.2 misho 593: const pcre_uchar *callpat;
594: const pcre_uchar *data;
595: const pcre_uchar *next;
596: PCRE_PUCHAR pp;
597: const pcre_uchar *prev;
598: PCRE_PUCHAR saved_eptr;
1.1 misho 599:
600: recursion_info new_recursive;
601:
602: BOOL cur_is_word;
603: BOOL condition;
604: BOOL prev_is_word;
605:
606: #ifdef SUPPORT_UCP
607: int prop_type;
608: int prop_value;
609: int prop_fail_result;
610: int oclength;
1.1.1.2 misho 611: pcre_uchar occhars[6];
1.1 misho 612: #endif
613:
614: int codelink;
615: int ctype;
616: int length;
617: int max;
618: int min;
619: int number;
620: int offset;
621: int op;
622: int save_capture_last;
623: int save_offset1, save_offset2, save_offset3;
624: int stacksave[REC_STACK_SAVE_MAX];
625:
626: eptrblock newptrb;
1.1.1.2 misho 627:
628: /* There is a special fudge for calling match() in a way that causes it to
629: measure the size of its basic stack frame when the stack is being used for
630: recursion. The second argument (ecode) being NULL triggers this behaviour. It
631: cannot normally ever be NULL. The return is the negated value of the frame
632: size. */
633:
634: if (ecode == NULL)
635: {
636: if (rdepth == 0)
637: return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
638: else
639: {
640: int len = (char *)&rdepth - (char *)eptr;
641: return (len > 0)? -len : len;
642: }
643: }
1.1 misho 644: #endif /* NO_RECURSE */
645:
646: /* To save space on the stack and in the heap frame, I have doubled up on some
647: of the local variables that are used only in localised parts of the code, but
648: still need to be preserved over recursive calls of match(). These macros define
649: the alternative names that are used. */
650:
651: #define allow_zero cur_is_word
652: #define cbegroup condition
653: #define code_offset codelink
654: #define condassert condition
655: #define matched_once prev_is_word
1.1.1.2 misho 656: #define foc number
657: #define save_mark data
1.1 misho 658:
659: /* These statements are here to stop the compiler complaining about uninitialized
660: variables. */
661:
662: #ifdef SUPPORT_UCP
663: prop_value = 0;
664: prop_fail_result = 0;
665: #endif
666:
667:
668: /* This label is used for tail recursion, which is used in a few cases even
669: when NO_RECURSE is not defined, in order to reduce the amount of stack that is
670: used. Thanks to Ian Taylor for noticing this possibility and sending the
671: original patch. */
672:
673: TAIL_RECURSE:
674:
675: /* OK, now we can get on with the real code of the function. Recursive calls
676: are specified by the macro RMATCH and RRETURN is used to return. When
677: NO_RECURSE is *not* defined, these just turn into a recursive call to match()
678: and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
679: defined). However, RMATCH isn't like a function call because it's quite a
680: complicated macro. It has to be used in one particular way. This shouldn't,
681: however, impact performance when true recursion is being used. */
682:
1.1.1.2 misho 683: #ifdef SUPPORT_UTF
684: utf = md->utf; /* Local copy of the flag */
1.1 misho 685: #else
1.1.1.2 misho 686: utf = FALSE;
1.1 misho 687: #endif
688:
689: /* First check that we haven't called match() too many times, or that we
690: haven't exceeded the recursive call limit. */
691:
692: if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
693: if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
694:
695: /* At the start of a group with an unlimited repeat that may match an empty
696: string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
697: done this way to save having to use another function argument, which would take
698: up space on the stack. See also MATCH_CONDASSERT below.
699:
700: When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
701: such remembered pointers, to be checked when we hit the closing ket, in order
702: to break infinite loops that match no characters. When match() is called in
703: other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
704: NOT be used with tail recursion, because the memory block that is used is on
705: the stack, so a new one may be required for each match(). */
706:
707: if (md->match_function_type == MATCH_CBEGROUP)
708: {
709: newptrb.epb_saved_eptr = eptr;
710: newptrb.epb_prev = eptrb;
711: eptrb = &newptrb;
712: md->match_function_type = 0;
713: }
714:
715: /* Now start processing the opcodes. */
716:
717: for (;;)
718: {
719: minimize = possessive = FALSE;
720: op = *ecode;
721:
722: switch(op)
723: {
724: case OP_MARK:
725: md->nomatch_mark = ecode + 2;
726: md->mark = NULL; /* In case previously set by assertion */
1.1.1.2 misho 727: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
1.1 misho 728: eptrb, RM55);
729: if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
730: md->mark == NULL) md->mark = ecode + 2;
731:
732: /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
733: argument, and we must check whether that argument matches this MARK's
734: argument. It is passed back in md->start_match_ptr (an overloading of that
735: variable). If it does match, we reset that variable to the current subject
736: position and return MATCH_SKIP. Otherwise, pass back the return code
737: unaltered. */
738:
739: else if (rrc == MATCH_SKIP_ARG &&
1.1.1.2 misho 740: STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
1.1 misho 741: {
742: md->start_match_ptr = eptr;
743: RRETURN(MATCH_SKIP);
744: }
745: RRETURN(rrc);
746:
747: case OP_FAIL:
748: RRETURN(MATCH_NOMATCH);
749:
750: /* COMMIT overrides PRUNE, SKIP, and THEN */
751:
752: case OP_COMMIT:
1.1.1.2 misho 753: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1.1 misho 754: eptrb, RM52);
755: if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
756: rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
757: rrc != MATCH_THEN)
758: RRETURN(rrc);
759: RRETURN(MATCH_COMMIT);
760:
761: /* PRUNE overrides THEN */
762:
763: case OP_PRUNE:
1.1.1.2 misho 764: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1.1 misho 765: eptrb, RM51);
766: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
767: RRETURN(MATCH_PRUNE);
768:
769: case OP_PRUNE_ARG:
770: md->nomatch_mark = ecode + 2;
771: md->mark = NULL; /* In case previously set by assertion */
1.1.1.2 misho 772: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
1.1 misho 773: eptrb, RM56);
774: if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
775: md->mark == NULL) md->mark = ecode + 2;
776: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
777: RRETURN(MATCH_PRUNE);
778:
779: /* SKIP overrides PRUNE and THEN */
780:
781: case OP_SKIP:
1.1.1.2 misho 782: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1.1 misho 783: eptrb, RM53);
784: if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
785: RRETURN(rrc);
786: md->start_match_ptr = eptr; /* Pass back current position */
787: RRETURN(MATCH_SKIP);
788:
789: /* Note that, for Perl compatibility, SKIP with an argument does NOT set
790: nomatch_mark. There is a flag that disables this opcode when re-matching a
791: pattern that ended with a SKIP for which there was not a matching MARK. */
792:
793: case OP_SKIP_ARG:
794: if (md->ignore_skip_arg)
795: {
1.1.1.2 misho 796: ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
1.1 misho 797: break;
798: }
1.1.1.2 misho 799: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
1.1 misho 800: eptrb, RM57);
801: if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
802: RRETURN(rrc);
803:
804: /* Pass back the current skip name by overloading md->start_match_ptr and
805: returning the special MATCH_SKIP_ARG return code. This will either be
806: caught by a matching MARK, or get to the top, where it causes a rematch
807: with the md->ignore_skip_arg flag set. */
808:
809: md->start_match_ptr = ecode + 2;
810: RRETURN(MATCH_SKIP_ARG);
811:
812: /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
813: the branch in which it occurs can be determined. Overload the start of
814: match pointer to do this. */
815:
816: case OP_THEN:
1.1.1.2 misho 817: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1.1 misho 818: eptrb, RM54);
819: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
820: md->start_match_ptr = ecode;
821: RRETURN(MATCH_THEN);
822:
823: case OP_THEN_ARG:
824: md->nomatch_mark = ecode + 2;
825: md->mark = NULL; /* In case previously set by assertion */
1.1.1.2 misho 826: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
1.1 misho 827: md, eptrb, RM58);
828: if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
829: md->mark == NULL) md->mark = ecode + 2;
830: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
831: md->start_match_ptr = ecode;
832: RRETURN(MATCH_THEN);
833:
834: /* Handle an atomic group that does not contain any capturing parentheses.
835: This can be handled like an assertion. Prior to 8.13, all atomic groups
836: were handled this way. In 8.13, the code was changed as below for ONCE, so
837: that backups pass through the group and thereby reset captured values.
838: However, this uses a lot more stack, so in 8.20, atomic groups that do not
839: contain any captures generate OP_ONCE_NC, which can be handled in the old,
840: less stack intensive way.
841:
842: Check the alternative branches in turn - the matching won't pass the KET
843: for this kind of subpattern. If any one branch matches, we carry on as at
844: the end of a normal bracket, leaving the subject pointer, but resetting
845: the start-of-match value in case it was changed by \K. */
846:
847: case OP_ONCE_NC:
848: prev = ecode;
849: saved_eptr = eptr;
1.1.1.2 misho 850: save_mark = md->mark;
1.1 misho 851: do
852: {
853: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
854: if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
855: {
856: mstart = md->start_match_ptr;
857: break;
858: }
859: if (rrc == MATCH_THEN)
860: {
861: next = ecode + GET(ecode,1);
862: if (md->start_match_ptr < next &&
863: (*ecode == OP_ALT || *next == OP_ALT))
864: rrc = MATCH_NOMATCH;
865: }
866:
867: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
868: ecode += GET(ecode,1);
1.1.1.2 misho 869: md->mark = save_mark;
1.1 misho 870: }
871: while (*ecode == OP_ALT);
872:
873: /* If hit the end of the group (which could be repeated), fail */
874:
875: if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
876:
877: /* Continue as from after the group, updating the offsets high water
878: mark, since extracts may have been taken. */
879:
880: do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
881:
882: offset_top = md->end_offset_top;
883: eptr = md->end_match_ptr;
884:
885: /* For a non-repeating ket, just continue at this level. This also
886: happens for a repeating ket if no characters were matched in the group.
887: This is the forcible breaking of infinite loops as implemented in Perl
888: 5.005. */
889:
890: if (*ecode == OP_KET || eptr == saved_eptr)
891: {
892: ecode += 1+LINK_SIZE;
893: break;
894: }
895:
896: /* The repeating kets try the rest of the pattern or restart from the
897: preceding bracket, in the appropriate order. The second "call" of match()
898: uses tail recursion, to avoid using another stack frame. */
899:
900: if (*ecode == OP_KETRMIN)
901: {
902: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
903: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
904: ecode = prev;
905: goto TAIL_RECURSE;
906: }
907: else /* OP_KETRMAX */
908: {
909: RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
910: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
911: ecode += 1 + LINK_SIZE;
912: goto TAIL_RECURSE;
913: }
914: /* Control never gets here */
915:
916: /* Handle a capturing bracket, other than those that are possessive with an
917: unlimited repeat. If there is space in the offset vector, save the current
918: subject position in the working slot at the top of the vector. We mustn't
919: change the current values of the data slot, because they may be set from a
920: previous iteration of this group, and be referred to by a reference inside
921: the group. A failure to match might occur after the group has succeeded,
922: if something later on doesn't match. For this reason, we need to restore
923: the working value and also the values of the final offsets, in case they
924: were set by a previous iteration of the same bracket.
925:
926: If there isn't enough space in the offset vector, treat this as if it were
927: a non-capturing bracket. Don't worry about setting the flag for the error
928: case here; that is handled in the code for KET. */
929:
930: case OP_CBRA:
931: case OP_SCBRA:
932: number = GET2(ecode, 1+LINK_SIZE);
933: offset = number << 1;
934:
935: #ifdef PCRE_DEBUG
936: printf("start bracket %d\n", number);
937: printf("subject=");
938: pchars(eptr, 16, TRUE, md);
939: printf("\n");
940: #endif
941:
942: if (offset < md->offset_max)
943: {
944: save_offset1 = md->offset_vector[offset];
945: save_offset2 = md->offset_vector[offset+1];
946: save_offset3 = md->offset_vector[md->offset_end - number];
947: save_capture_last = md->capture_last;
1.1.1.2 misho 948: save_mark = md->mark;
1.1 misho 949:
950: DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
951: md->offset_vector[md->offset_end - number] =
952: (int)(eptr - md->start_subject);
953:
954: for (;;)
955: {
956: if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1.1.1.2 misho 957: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1.1 misho 958: eptrb, RM1);
959: if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
960:
961: /* If we backed up to a THEN, check whether it is within the current
962: branch by comparing the address of the THEN that is passed back with
963: the end of the branch. If it is within the current branch, and the
964: branch is one of two or more alternatives (it either starts or ends
965: with OP_ALT), we have reached the limit of THEN's action, so convert
966: the return code to NOMATCH, which will cause normal backtracking to
967: happen from now on. Otherwise, THEN is passed back to an outer
968: alternative. This implements Perl's treatment of parenthesized groups,
969: where a group not containing | does not affect the current alternative,
970: that is, (X) is NOT the same as (X|(*F)). */
971:
972: if (rrc == MATCH_THEN)
973: {
974: next = ecode + GET(ecode,1);
975: if (md->start_match_ptr < next &&
976: (*ecode == OP_ALT || *next == OP_ALT))
977: rrc = MATCH_NOMATCH;
978: }
979:
980: /* Anything other than NOMATCH is passed back. */
981:
982: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
983: md->capture_last = save_capture_last;
984: ecode += GET(ecode, 1);
1.1.1.2 misho 985: md->mark = save_mark;
1.1 misho 986: if (*ecode != OP_ALT) break;
987: }
988:
989: DPRINTF(("bracket %d failed\n", number));
990: md->offset_vector[offset] = save_offset1;
991: md->offset_vector[offset+1] = save_offset2;
992: md->offset_vector[md->offset_end - number] = save_offset3;
993:
994: /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
995:
996: RRETURN(rrc);
997: }
998:
999: /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1000: as a non-capturing bracket. */
1001:
1002: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1003: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1004:
1005: DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1006:
1007: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1008: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1009:
1010: /* Non-capturing or atomic group, except for possessive with unlimited
1011: repeat and ONCE group with no captures. Loop for all the alternatives.
1012:
1013: When we get to the final alternative within the brackets, we used to return
1014: the result of a recursive call to match() whatever happened so it was
1015: possible to reduce stack usage by turning this into a tail recursion,
1016: except in the case of a possibly empty group. However, now that there is
1017: the possibility of (*THEN) occurring in the final alternative, this
1018: optimization is no longer always possible.
1019:
1020: We can optimize if we know there are no (*THEN)s in the pattern; at present
1021: this is the best that can be done.
1022:
1023: MATCH_ONCE is returned when the end of an atomic group is successfully
1024: reached, but subsequent matching fails. It passes back up the tree (causing
1025: captured values to be reset) until the original atomic group level is
1026: reached. This is tested by comparing md->once_target with the start of the
1027: group. At this point, the return is converted into MATCH_NOMATCH so that
1028: previous backup points can be taken. */
1029:
1030: case OP_ONCE:
1031: case OP_BRA:
1032: case OP_SBRA:
1033: DPRINTF(("start non-capturing bracket\n"));
1034:
1035: for (;;)
1036: {
1.1.1.3 ! misho 1037: if (op >= OP_SBRA || op == OP_ONCE)
! 1038: md->match_function_type = MATCH_CBEGROUP;
1.1 misho 1039:
1040: /* If this is not a possibly empty group, and there are no (*THEN)s in
1041: the pattern, and this is the final alternative, optimize as described
1042: above. */
1043:
1044: else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1045: {
1.1.1.2 misho 1046: ecode += PRIV(OP_lengths)[*ecode];
1.1 misho 1047: goto TAIL_RECURSE;
1048: }
1049:
1050: /* In all other cases, we have to make another call to match(). */
1051:
1.1.1.2 misho 1052: save_mark = md->mark;
1053: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1.1 misho 1054: RM2);
1055:
1056: /* See comment in the code for capturing groups above about handling
1057: THEN. */
1058:
1059: if (rrc == MATCH_THEN)
1060: {
1061: next = ecode + GET(ecode,1);
1062: if (md->start_match_ptr < next &&
1063: (*ecode == OP_ALT || *next == OP_ALT))
1064: rrc = MATCH_NOMATCH;
1065: }
1066:
1067: if (rrc != MATCH_NOMATCH)
1068: {
1069: if (rrc == MATCH_ONCE)
1070: {
1.1.1.2 misho 1071: const pcre_uchar *scode = ecode;
1.1 misho 1072: if (*scode != OP_ONCE) /* If not at start, find it */
1073: {
1074: while (*scode == OP_ALT) scode += GET(scode, 1);
1075: scode -= GET(scode, 1);
1076: }
1077: if (md->once_target == scode) rrc = MATCH_NOMATCH;
1078: }
1079: RRETURN(rrc);
1080: }
1081: ecode += GET(ecode, 1);
1.1.1.2 misho 1082: md->mark = save_mark;
1.1 misho 1083: if (*ecode != OP_ALT) break;
1084: }
1085:
1086: RRETURN(MATCH_NOMATCH);
1087:
1088: /* Handle possessive capturing brackets with an unlimited repeat. We come
1089: here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1090: handled similarly to the normal case above. However, the matching is
1091: different. The end of these brackets will always be OP_KETRPOS, which
1092: returns MATCH_KETRPOS without going further in the pattern. By this means
1093: we can handle the group by iteration rather than recursion, thereby
1094: reducing the amount of stack needed. */
1095:
1096: case OP_CBRAPOS:
1097: case OP_SCBRAPOS:
1098: allow_zero = FALSE;
1099:
1100: POSSESSIVE_CAPTURE:
1101: number = GET2(ecode, 1+LINK_SIZE);
1102: offset = number << 1;
1103:
1104: #ifdef PCRE_DEBUG
1105: printf("start possessive bracket %d\n", number);
1106: printf("subject=");
1107: pchars(eptr, 16, TRUE, md);
1108: printf("\n");
1109: #endif
1110:
1111: if (offset < md->offset_max)
1112: {
1113: matched_once = FALSE;
1114: code_offset = (int)(ecode - md->start_code);
1115:
1116: save_offset1 = md->offset_vector[offset];
1117: save_offset2 = md->offset_vector[offset+1];
1118: save_offset3 = md->offset_vector[md->offset_end - number];
1119: save_capture_last = md->capture_last;
1120:
1121: DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1122:
1123: /* Each time round the loop, save the current subject position for use
1124: when the group matches. For MATCH_MATCH, the group has matched, so we
1125: restart it with a new subject starting position, remembering that we had
1126: at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1127: usual. If we haven't matched any alternatives in any iteration, check to
1128: see if a previous iteration matched. If so, the group has matched;
1129: continue from afterwards. Otherwise it has failed; restore the previous
1130: capture values before returning NOMATCH. */
1131:
1132: for (;;)
1133: {
1134: md->offset_vector[md->offset_end - number] =
1135: (int)(eptr - md->start_subject);
1136: if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1.1.1.2 misho 1137: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1.1 misho 1138: eptrb, RM63);
1139: if (rrc == MATCH_KETRPOS)
1140: {
1141: offset_top = md->end_offset_top;
1142: eptr = md->end_match_ptr;
1143: ecode = md->start_code + code_offset;
1144: save_capture_last = md->capture_last;
1145: matched_once = TRUE;
1146: continue;
1147: }
1148:
1149: /* See comment in the code for capturing groups above about handling
1150: THEN. */
1151:
1152: if (rrc == MATCH_THEN)
1153: {
1154: next = ecode + GET(ecode,1);
1155: if (md->start_match_ptr < next &&
1156: (*ecode == OP_ALT || *next == OP_ALT))
1157: rrc = MATCH_NOMATCH;
1158: }
1159:
1160: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1161: md->capture_last = save_capture_last;
1162: ecode += GET(ecode, 1);
1163: if (*ecode != OP_ALT) break;
1164: }
1165:
1166: if (!matched_once)
1167: {
1168: md->offset_vector[offset] = save_offset1;
1169: md->offset_vector[offset+1] = save_offset2;
1170: md->offset_vector[md->offset_end - number] = save_offset3;
1171: }
1172:
1173: if (allow_zero || matched_once)
1174: {
1175: ecode += 1 + LINK_SIZE;
1176: break;
1177: }
1178:
1179: RRETURN(MATCH_NOMATCH);
1180: }
1181:
1182: /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1183: as a non-capturing bracket. */
1184:
1185: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1186: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1187:
1188: DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1189:
1190: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1191: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1192:
1193: /* Non-capturing possessive bracket with unlimited repeat. We come here
1194: from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1195: without the capturing complication. It is written out separately for speed
1196: and cleanliness. */
1197:
1198: case OP_BRAPOS:
1199: case OP_SBRAPOS:
1200: allow_zero = FALSE;
1201:
1202: POSSESSIVE_NON_CAPTURE:
1203: matched_once = FALSE;
1204: code_offset = (int)(ecode - md->start_code);
1205:
1206: for (;;)
1207: {
1208: if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1.1.1.2 misho 1209: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1.1 misho 1210: eptrb, RM48);
1211: if (rrc == MATCH_KETRPOS)
1212: {
1213: offset_top = md->end_offset_top;
1214: eptr = md->end_match_ptr;
1215: ecode = md->start_code + code_offset;
1216: matched_once = TRUE;
1217: continue;
1218: }
1219:
1220: /* See comment in the code for capturing groups above about handling
1221: THEN. */
1222:
1223: if (rrc == MATCH_THEN)
1224: {
1225: next = ecode + GET(ecode,1);
1226: if (md->start_match_ptr < next &&
1227: (*ecode == OP_ALT || *next == OP_ALT))
1228: rrc = MATCH_NOMATCH;
1229: }
1230:
1231: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1232: ecode += GET(ecode, 1);
1233: if (*ecode != OP_ALT) break;
1234: }
1235:
1236: if (matched_once || allow_zero)
1237: {
1238: ecode += 1 + LINK_SIZE;
1239: break;
1240: }
1241: RRETURN(MATCH_NOMATCH);
1242:
1243: /* Control never reaches here. */
1244:
1245: /* Conditional group: compilation checked that there are no more than
1246: two branches. If the condition is false, skipping the first branch takes us
1247: past the end if there is only one branch, but that's OK because that is
1248: exactly what going to the ket would do. */
1249:
1250: case OP_COND:
1251: case OP_SCOND:
1252: codelink = GET(ecode, 1);
1253:
1254: /* Because of the way auto-callout works during compile, a callout item is
1255: inserted between OP_COND and an assertion condition. */
1256:
1257: if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1258: {
1.1.1.2 misho 1259: if (PUBL(callout) != NULL)
1.1 misho 1260: {
1.1.1.2 misho 1261: PUBL(callout_block) cb;
1.1 misho 1262: cb.version = 2; /* Version 1 of the callout block */
1263: cb.callout_number = ecode[LINK_SIZE+2];
1264: cb.offset_vector = md->offset_vector;
1.1.1.2 misho 1265: #ifdef COMPILE_PCRE8
1.1 misho 1266: cb.subject = (PCRE_SPTR)md->start_subject;
1.1.1.2 misho 1267: #else
1268: cb.subject = (PCRE_SPTR16)md->start_subject;
1269: #endif
1.1 misho 1270: cb.subject_length = (int)(md->end_subject - md->start_subject);
1271: cb.start_match = (int)(mstart - md->start_subject);
1272: cb.current_position = (int)(eptr - md->start_subject);
1273: cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1274: cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1275: cb.capture_top = offset_top/2;
1276: cb.capture_last = md->capture_last;
1277: cb.callout_data = md->callout_data;
1278: cb.mark = md->nomatch_mark;
1.1.1.2 misho 1279: if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1.1 misho 1280: if (rrc < 0) RRETURN(rrc);
1281: }
1.1.1.2 misho 1282: ecode += PRIV(OP_lengths)[OP_CALLOUT];
1.1 misho 1283: }
1284:
1285: condcode = ecode[LINK_SIZE+1];
1286:
1287: /* Now see what the actual condition is */
1288:
1289: if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1290: {
1291: if (md->recursive == NULL) /* Not recursing => FALSE */
1292: {
1293: condition = FALSE;
1294: ecode += GET(ecode, 1);
1295: }
1296: else
1297: {
1298: int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1299: condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1300:
1301: /* If the test is for recursion into a specific subpattern, and it is
1302: false, but the test was set up by name, scan the table to see if the
1303: name refers to any other numbers, and test them. The condition is true
1304: if any one is set. */
1305:
1306: if (!condition && condcode == OP_NRREF)
1307: {
1.1.1.2 misho 1308: pcre_uchar *slotA = md->name_table;
1.1 misho 1309: for (i = 0; i < md->name_count; i++)
1310: {
1311: if (GET2(slotA, 0) == recno) break;
1312: slotA += md->name_entry_size;
1313: }
1314:
1315: /* Found a name for the number - there can be only one; duplicate
1316: names for different numbers are allowed, but not vice versa. First
1317: scan down for duplicates. */
1318:
1319: if (i < md->name_count)
1320: {
1.1.1.2 misho 1321: pcre_uchar *slotB = slotA;
1.1 misho 1322: while (slotB > md->name_table)
1323: {
1324: slotB -= md->name_entry_size;
1.1.1.2 misho 1325: if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1.1 misho 1326: {
1327: condition = GET2(slotB, 0) == md->recursive->group_num;
1328: if (condition) break;
1329: }
1330: else break;
1331: }
1332:
1333: /* Scan up for duplicates */
1334:
1335: if (!condition)
1336: {
1337: slotB = slotA;
1338: for (i++; i < md->name_count; i++)
1339: {
1340: slotB += md->name_entry_size;
1.1.1.2 misho 1341: if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1.1 misho 1342: {
1343: condition = GET2(slotB, 0) == md->recursive->group_num;
1344: if (condition) break;
1345: }
1346: else break;
1347: }
1348: }
1349: }
1350: }
1351:
1352: /* Choose branch according to the condition */
1353:
1.1.1.2 misho 1354: ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1.1 misho 1355: }
1356: }
1357:
1358: else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1359: {
1360: offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1361: condition = offset < offset_top && md->offset_vector[offset] >= 0;
1362:
1363: /* If the numbered capture is unset, but the reference was by name,
1364: scan the table to see if the name refers to any other numbers, and test
1365: them. The condition is true if any one is set. This is tediously similar
1366: to the code above, but not close enough to try to amalgamate. */
1367:
1368: if (!condition && condcode == OP_NCREF)
1369: {
1370: int refno = offset >> 1;
1.1.1.2 misho 1371: pcre_uchar *slotA = md->name_table;
1.1 misho 1372:
1373: for (i = 0; i < md->name_count; i++)
1374: {
1375: if (GET2(slotA, 0) == refno) break;
1376: slotA += md->name_entry_size;
1377: }
1378:
1379: /* Found a name for the number - there can be only one; duplicate names
1380: for different numbers are allowed, but not vice versa. First scan down
1381: for duplicates. */
1382:
1383: if (i < md->name_count)
1384: {
1.1.1.2 misho 1385: pcre_uchar *slotB = slotA;
1.1 misho 1386: while (slotB > md->name_table)
1387: {
1388: slotB -= md->name_entry_size;
1.1.1.2 misho 1389: if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1.1 misho 1390: {
1391: offset = GET2(slotB, 0) << 1;
1392: condition = offset < offset_top &&
1393: md->offset_vector[offset] >= 0;
1394: if (condition) break;
1395: }
1396: else break;
1397: }
1398:
1399: /* Scan up for duplicates */
1400:
1401: if (!condition)
1402: {
1403: slotB = slotA;
1404: for (i++; i < md->name_count; i++)
1405: {
1406: slotB += md->name_entry_size;
1.1.1.2 misho 1407: if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1.1 misho 1408: {
1409: offset = GET2(slotB, 0) << 1;
1410: condition = offset < offset_top &&
1411: md->offset_vector[offset] >= 0;
1412: if (condition) break;
1413: }
1414: else break;
1415: }
1416: }
1417: }
1418: }
1419:
1420: /* Choose branch according to the condition */
1421:
1.1.1.2 misho 1422: ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1.1 misho 1423: }
1424:
1425: else if (condcode == OP_DEF) /* DEFINE - always false */
1426: {
1427: condition = FALSE;
1428: ecode += GET(ecode, 1);
1429: }
1430:
1431: /* The condition is an assertion. Call match() to evaluate it - setting
1432: md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1433: an assertion. */
1434:
1435: else
1436: {
1437: md->match_function_type = MATCH_CONDASSERT;
1438: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1439: if (rrc == MATCH_MATCH)
1440: {
1441: if (md->end_offset_top > offset_top)
1442: offset_top = md->end_offset_top; /* Captures may have happened */
1443: condition = TRUE;
1444: ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1445: while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1446: }
1447:
1448: /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1449: assertion; it is therefore treated as NOMATCH. */
1450:
1451: else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1452: {
1453: RRETURN(rrc); /* Need braces because of following else */
1454: }
1455: else
1456: {
1457: condition = FALSE;
1458: ecode += codelink;
1459: }
1460: }
1461:
1462: /* We are now at the branch that is to be obeyed. As there is only one, can
1463: use tail recursion to avoid using another stack frame, except when there is
1464: unlimited repeat of a possibly empty group. In the latter case, a recursive
1465: call to match() is always required, unless the second alternative doesn't
1466: exist, in which case we can just plough on. Note that, for compatibility
1467: with Perl, the | in a conditional group is NOT treated as creating two
1468: alternatives. If a THEN is encountered in the branch, it propagates out to
1469: the enclosing alternative (unless nested in a deeper set of alternatives,
1470: of course). */
1471:
1472: if (condition || *ecode == OP_ALT)
1473: {
1474: if (op != OP_SCOND)
1475: {
1476: ecode += 1 + LINK_SIZE;
1477: goto TAIL_RECURSE;
1478: }
1479:
1480: md->match_function_type = MATCH_CBEGROUP;
1481: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1482: RRETURN(rrc);
1483: }
1484:
1485: /* Condition false & no alternative; continue after the group. */
1486:
1487: else
1488: {
1489: ecode += 1 + LINK_SIZE;
1490: }
1491: break;
1492:
1493:
1494: /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1495: to close any currently open capturing brackets. */
1496:
1497: case OP_CLOSE:
1498: number = GET2(ecode, 1);
1499: offset = number << 1;
1500:
1501: #ifdef PCRE_DEBUG
1502: printf("end bracket %d at *ACCEPT", number);
1503: printf("\n");
1504: #endif
1505:
1506: md->capture_last = number;
1507: if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1508: {
1509: md->offset_vector[offset] =
1510: md->offset_vector[md->offset_end - number];
1511: md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1512: if (offset_top <= offset) offset_top = offset + 2;
1513: }
1.1.1.2 misho 1514: ecode += 1 + IMM2_SIZE;
1.1 misho 1515: break;
1516:
1517:
1518: /* End of the pattern, either real or forced. */
1519:
1520: case OP_END:
1521: case OP_ACCEPT:
1522: case OP_ASSERT_ACCEPT:
1523:
1524: /* If we have matched an empty string, fail if not in an assertion and not
1525: in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1526: is set and we have matched at the start of the subject. In both cases,
1527: backtracking will then try other alternatives, if any. */
1528:
1529: if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1530: md->recursive == NULL &&
1531: (md->notempty ||
1532: (md->notempty_atstart &&
1533: mstart == md->start_subject + md->start_offset)))
1534: RRETURN(MATCH_NOMATCH);
1535:
1536: /* Otherwise, we have a match. */
1537:
1538: md->end_match_ptr = eptr; /* Record where we ended */
1539: md->end_offset_top = offset_top; /* and how many extracts were taken */
1540: md->start_match_ptr = mstart; /* and the start (\K can modify) */
1541:
1542: /* For some reason, the macros don't work properly if an expression is
1543: given as the argument to RRETURN when the heap is in use. */
1544:
1545: rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1546: RRETURN(rrc);
1547:
1548: /* Assertion brackets. Check the alternative branches in turn - the
1549: matching won't pass the KET for an assertion. If any one branch matches,
1550: the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1551: start of each branch to move the current point backwards, so the code at
1552: this level is identical to the lookahead case. When the assertion is part
1553: of a condition, we want to return immediately afterwards. The caller of
1554: this incarnation of the match() function will have set MATCH_CONDASSERT in
1555: md->match_function_type, and one of these opcodes will be the first opcode
1556: that is processed. We use a local variable that is preserved over calls to
1557: match() to remember this case. */
1558:
1559: case OP_ASSERT:
1560: case OP_ASSERTBACK:
1.1.1.2 misho 1561: save_mark = md->mark;
1.1 misho 1562: if (md->match_function_type == MATCH_CONDASSERT)
1563: {
1564: condassert = TRUE;
1565: md->match_function_type = 0;
1566: }
1567: else condassert = FALSE;
1568:
1569: do
1570: {
1571: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1572: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1573: {
1574: mstart = md->start_match_ptr; /* In case \K reset it */
1575: break;
1576: }
1.1.1.3 ! misho 1577: md->mark = save_mark;
1.1 misho 1578:
1.1.1.3 ! misho 1579: /* A COMMIT failure must fail the entire assertion, without trying any
! 1580: subsequent branches. */
! 1581:
! 1582: if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH);
! 1583:
! 1584: /* PCRE does not allow THEN to escape beyond an assertion; it
! 1585: is treated as NOMATCH. */
1.1 misho 1586:
1587: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1588: ecode += GET(ecode, 1);
1589: }
1590: while (*ecode == OP_ALT);
1591:
1592: if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1593:
1594: /* If checking an assertion for a condition, return MATCH_MATCH. */
1595:
1596: if (condassert) RRETURN(MATCH_MATCH);
1597:
1598: /* Continue from after the assertion, updating the offsets high water
1599: mark, since extracts may have been taken during the assertion. */
1600:
1601: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1602: ecode += 1 + LINK_SIZE;
1603: offset_top = md->end_offset_top;
1604: continue;
1605:
1606: /* Negative assertion: all branches must fail to match. Encountering SKIP,
1607: PRUNE, or COMMIT means we must assume failure without checking subsequent
1608: branches. */
1609:
1610: case OP_ASSERT_NOT:
1611: case OP_ASSERTBACK_NOT:
1.1.1.2 misho 1612: save_mark = md->mark;
1.1 misho 1613: if (md->match_function_type == MATCH_CONDASSERT)
1614: {
1615: condassert = TRUE;
1616: md->match_function_type = 0;
1617: }
1618: else condassert = FALSE;
1619:
1620: do
1621: {
1622: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1.1.1.2 misho 1623: md->mark = save_mark;
1.1 misho 1624: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1625: if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1626: {
1627: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1628: break;
1629: }
1630:
1631: /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1632: as NOMATCH. */
1633:
1634: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1635: ecode += GET(ecode,1);
1636: }
1637: while (*ecode == OP_ALT);
1638:
1639: if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1640:
1641: ecode += 1 + LINK_SIZE;
1642: continue;
1643:
1644: /* Move the subject pointer back. This occurs only at the start of
1645: each branch of a lookbehind assertion. If we are too close to the start to
1646: move back, this match function fails. When working with UTF-8 we move
1647: back a number of characters, not bytes. */
1648:
1649: case OP_REVERSE:
1.1.1.2 misho 1650: #ifdef SUPPORT_UTF
1651: if (utf)
1.1 misho 1652: {
1653: i = GET(ecode, 1);
1654: while (i-- > 0)
1655: {
1656: eptr--;
1657: if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1658: BACKCHAR(eptr);
1659: }
1660: }
1661: else
1662: #endif
1663:
1664: /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1665:
1666: {
1667: eptr -= GET(ecode, 1);
1668: if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1669: }
1670:
1671: /* Save the earliest consulted character, then skip to next op code */
1672:
1673: if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1674: ecode += 1 + LINK_SIZE;
1675: break;
1676:
1677: /* The callout item calls an external function, if one is provided, passing
1678: details of the match so far. This is mainly for debugging, though the
1679: function is able to force a failure. */
1680:
1681: case OP_CALLOUT:
1.1.1.2 misho 1682: if (PUBL(callout) != NULL)
1.1 misho 1683: {
1.1.1.2 misho 1684: PUBL(callout_block) cb;
1.1 misho 1685: cb.version = 2; /* Version 1 of the callout block */
1686: cb.callout_number = ecode[1];
1687: cb.offset_vector = md->offset_vector;
1.1.1.2 misho 1688: #ifdef COMPILE_PCRE8
1.1 misho 1689: cb.subject = (PCRE_SPTR)md->start_subject;
1.1.1.2 misho 1690: #else
1691: cb.subject = (PCRE_SPTR16)md->start_subject;
1692: #endif
1.1 misho 1693: cb.subject_length = (int)(md->end_subject - md->start_subject);
1694: cb.start_match = (int)(mstart - md->start_subject);
1695: cb.current_position = (int)(eptr - md->start_subject);
1696: cb.pattern_position = GET(ecode, 2);
1697: cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1698: cb.capture_top = offset_top/2;
1699: cb.capture_last = md->capture_last;
1700: cb.callout_data = md->callout_data;
1701: cb.mark = md->nomatch_mark;
1.1.1.2 misho 1702: if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1.1 misho 1703: if (rrc < 0) RRETURN(rrc);
1704: }
1705: ecode += 2 + 2*LINK_SIZE;
1706: break;
1707:
1708: /* Recursion either matches the current regex, or some subexpression. The
1709: offset data is the offset to the starting bracket from the start of the
1710: whole pattern. (This is so that it works from duplicated subpatterns.)
1711:
1712: The state of the capturing groups is preserved over recursion, and
1713: re-instated afterwards. We don't know how many are started and not yet
1714: finished (offset_top records the completed total) so we just have to save
1715: all the potential data. There may be up to 65535 such values, which is too
1716: large to put on the stack, but using malloc for small numbers seems
1717: expensive. As a compromise, the stack is used when there are no more than
1718: REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1719:
1720: There are also other values that have to be saved. We use a chained
1721: sequence of blocks that actually live on the stack. Thanks to Robin Houston
1722: for the original version of this logic. It has, however, been hacked around
1723: a lot, so he is not to blame for the current way it works. */
1724:
1725: case OP_RECURSE:
1726: {
1727: recursion_info *ri;
1728: int recno;
1729:
1730: callpat = md->start_code + GET(ecode, 1);
1731: recno = (callpat == md->start_code)? 0 :
1732: GET2(callpat, 1 + LINK_SIZE);
1733:
1734: /* Check for repeating a recursion without advancing the subject pointer.
1735: This should catch convoluted mutual recursions. (Some simple cases are
1736: caught at compile time.) */
1737:
1738: for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1739: if (recno == ri->group_num && eptr == ri->subject_position)
1740: RRETURN(PCRE_ERROR_RECURSELOOP);
1741:
1742: /* Add to "recursing stack" */
1743:
1744: new_recursive.group_num = recno;
1745: new_recursive.subject_position = eptr;
1746: new_recursive.prevrec = md->recursive;
1747: md->recursive = &new_recursive;
1748:
1749: /* Where to continue from afterwards */
1750:
1751: ecode += 1 + LINK_SIZE;
1752:
1753: /* Now save the offset data */
1754:
1755: new_recursive.saved_max = md->offset_end;
1756: if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1757: new_recursive.offset_save = stacksave;
1758: else
1759: {
1760: new_recursive.offset_save =
1.1.1.2 misho 1761: (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1.1 misho 1762: if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1763: }
1764: memcpy(new_recursive.offset_save, md->offset_vector,
1765: new_recursive.saved_max * sizeof(int));
1766:
1767: /* OK, now we can do the recursion. After processing each alternative,
1768: restore the offset data. If there were nested recursions, md->recursive
1769: might be changed, so reset it before looping. */
1770:
1771: DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1772: cbegroup = (*callpat >= OP_SBRA);
1773: do
1774: {
1775: if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1.1.1.2 misho 1776: RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1.1 misho 1777: md, eptrb, RM6);
1778: memcpy(md->offset_vector, new_recursive.offset_save,
1779: new_recursive.saved_max * sizeof(int));
1780: md->recursive = new_recursive.prevrec;
1781: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1782: {
1783: DPRINTF(("Recursion matched\n"));
1784: if (new_recursive.offset_save != stacksave)
1.1.1.2 misho 1785: (PUBL(free))(new_recursive.offset_save);
1.1 misho 1786:
1787: /* Set where we got to in the subject, and reset the start in case
1788: it was changed by \K. This *is* propagated back out of a recursion,
1789: for Perl compatibility. */
1790:
1791: eptr = md->end_match_ptr;
1792: mstart = md->start_match_ptr;
1793: goto RECURSION_MATCHED; /* Exit loop; end processing */
1794: }
1795:
1.1.1.3 ! misho 1796: /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; it
! 1797: is treated as NOMATCH. */
1.1 misho 1798:
1.1.1.3 ! misho 1799: else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN &&
! 1800: rrc != MATCH_COMMIT)
1.1 misho 1801: {
1802: DPRINTF(("Recursion gave error %d\n", rrc));
1803: if (new_recursive.offset_save != stacksave)
1.1.1.2 misho 1804: (PUBL(free))(new_recursive.offset_save);
1.1 misho 1805: RRETURN(rrc);
1806: }
1807:
1808: md->recursive = &new_recursive;
1809: callpat += GET(callpat, 1);
1810: }
1811: while (*callpat == OP_ALT);
1812:
1813: DPRINTF(("Recursion didn't match\n"));
1814: md->recursive = new_recursive.prevrec;
1815: if (new_recursive.offset_save != stacksave)
1.1.1.2 misho 1816: (PUBL(free))(new_recursive.offset_save);
1.1 misho 1817: RRETURN(MATCH_NOMATCH);
1818: }
1819:
1820: RECURSION_MATCHED:
1821: break;
1822:
1823: /* An alternation is the end of a branch; scan along to find the end of the
1824: bracketed group and go to there. */
1825:
1826: case OP_ALT:
1827: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1828: break;
1829:
1830: /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1831: indicating that it may occur zero times. It may repeat infinitely, or not
1832: at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1833: with fixed upper repeat limits are compiled as a number of copies, with the
1834: optional ones preceded by BRAZERO or BRAMINZERO. */
1835:
1836: case OP_BRAZERO:
1837: next = ecode + 1;
1838: RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1839: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1840: do next += GET(next, 1); while (*next == OP_ALT);
1841: ecode = next + 1 + LINK_SIZE;
1842: break;
1843:
1844: case OP_BRAMINZERO:
1845: next = ecode + 1;
1846: do next += GET(next, 1); while (*next == OP_ALT);
1847: RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1848: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1849: ecode++;
1850: break;
1851:
1852: case OP_SKIPZERO:
1853: next = ecode+1;
1854: do next += GET(next,1); while (*next == OP_ALT);
1855: ecode = next + 1 + LINK_SIZE;
1856: break;
1857:
1858: /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1859: here; just jump to the group, with allow_zero set TRUE. */
1860:
1861: case OP_BRAPOSZERO:
1862: op = *(++ecode);
1863: allow_zero = TRUE;
1864: if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1865: goto POSSESSIVE_NON_CAPTURE;
1866:
1867: /* End of a group, repeated or non-repeating. */
1868:
1869: case OP_KET:
1870: case OP_KETRMIN:
1871: case OP_KETRMAX:
1872: case OP_KETRPOS:
1873: prev = ecode - GET(ecode, 1);
1874:
1875: /* If this was a group that remembered the subject start, in order to break
1876: infinite repeats of empty string matches, retrieve the subject start from
1877: the chain. Otherwise, set it NULL. */
1878:
1879: if (*prev >= OP_SBRA || *prev == OP_ONCE)
1880: {
1881: saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1882: eptrb = eptrb->epb_prev; /* Backup to previous group */
1883: }
1884: else saved_eptr = NULL;
1885:
1886: /* If we are at the end of an assertion group or a non-capturing atomic
1887: group, stop matching and return MATCH_MATCH, but record the current high
1888: water mark for use by positive assertions. We also need to record the match
1889: start in case it was changed by \K. */
1890:
1891: if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1892: *prev == OP_ONCE_NC)
1893: {
1894: md->end_match_ptr = eptr; /* For ONCE_NC */
1895: md->end_offset_top = offset_top;
1896: md->start_match_ptr = mstart;
1897: RRETURN(MATCH_MATCH); /* Sets md->mark */
1898: }
1899:
1900: /* For capturing groups we have to check the group number back at the start
1901: and if necessary complete handling an extraction by setting the offsets and
1902: bumping the high water mark. Whole-pattern recursion is coded as a recurse
1903: into group 0, so it won't be picked up here. Instead, we catch it when the
1904: OP_END is reached. Other recursion is handled here. We just have to record
1905: the current subject position and start match pointer and give a MATCH
1906: return. */
1907:
1908: if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1909: *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1910: {
1911: number = GET2(prev, 1+LINK_SIZE);
1912: offset = number << 1;
1913:
1914: #ifdef PCRE_DEBUG
1915: printf("end bracket %d", number);
1916: printf("\n");
1917: #endif
1918:
1919: /* Handle a recursively called group. */
1920:
1921: if (md->recursive != NULL && md->recursive->group_num == number)
1922: {
1923: md->end_match_ptr = eptr;
1924: md->start_match_ptr = mstart;
1925: RRETURN(MATCH_MATCH);
1926: }
1927:
1928: /* Deal with capturing */
1929:
1930: md->capture_last = number;
1931: if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1932: {
1933: /* If offset is greater than offset_top, it means that we are
1934: "skipping" a capturing group, and that group's offsets must be marked
1935: unset. In earlier versions of PCRE, all the offsets were unset at the
1936: start of matching, but this doesn't work because atomic groups and
1937: assertions can cause a value to be set that should later be unset.
1938: Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1939: part of the atomic group, but this is not on the final matching path,
1940: so must be unset when 2 is set. (If there is no group 2, there is no
1941: problem, because offset_top will then be 2, indicating no capture.) */
1942:
1943: if (offset > offset_top)
1944: {
1945: register int *iptr = md->offset_vector + offset_top;
1946: register int *iend = md->offset_vector + offset;
1947: while (iptr < iend) *iptr++ = -1;
1948: }
1949:
1950: /* Now make the extraction */
1951:
1952: md->offset_vector[offset] =
1953: md->offset_vector[md->offset_end - number];
1954: md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1955: if (offset_top <= offset) offset_top = offset + 2;
1956: }
1957: }
1958:
1959: /* For an ordinary non-repeating ket, just continue at this level. This
1960: also happens for a repeating ket if no characters were matched in the
1961: group. This is the forcible breaking of infinite loops as implemented in
1962: Perl 5.005. For a non-repeating atomic group that includes captures,
1963: establish a backup point by processing the rest of the pattern at a lower
1964: level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1965: original OP_ONCE level, thereby bypassing intermediate backup points, but
1966: resetting any captures that happened along the way. */
1967:
1968: if (*ecode == OP_KET || eptr == saved_eptr)
1969: {
1970: if (*prev == OP_ONCE)
1971: {
1972: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1973: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1974: md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1975: RRETURN(MATCH_ONCE);
1976: }
1977: ecode += 1 + LINK_SIZE; /* Carry on at this level */
1978: break;
1979: }
1980:
1981: /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1982: and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1983: at a time from the outer level, thus saving stack. */
1984:
1985: if (*ecode == OP_KETRPOS)
1986: {
1987: md->end_match_ptr = eptr;
1988: md->end_offset_top = offset_top;
1989: RRETURN(MATCH_KETRPOS);
1990: }
1991:
1992: /* The normal repeating kets try the rest of the pattern or restart from
1993: the preceding bracket, in the appropriate order. In the second case, we can
1994: use tail recursion to avoid using another stack frame, unless we have
1995: an atomic group or an unlimited repeat of a group that can match an empty
1996: string. */
1997:
1998: if (*ecode == OP_KETRMIN)
1999: {
2000: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2001: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2002: if (*prev == OP_ONCE)
2003: {
2004: RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2005: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2006: md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2007: RRETURN(MATCH_ONCE);
2008: }
2009: if (*prev >= OP_SBRA) /* Could match an empty string */
2010: {
2011: RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2012: RRETURN(rrc);
2013: }
2014: ecode = prev;
2015: goto TAIL_RECURSE;
2016: }
2017: else /* OP_KETRMAX */
2018: {
2019: RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2020: if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2021: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2022: if (*prev == OP_ONCE)
2023: {
2024: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2025: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2026: md->once_target = prev;
2027: RRETURN(MATCH_ONCE);
2028: }
2029: ecode += 1 + LINK_SIZE;
2030: goto TAIL_RECURSE;
2031: }
2032: /* Control never gets here */
2033:
2034: /* Not multiline mode: start of subject assertion, unless notbol. */
2035:
2036: case OP_CIRC:
2037: if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2038:
2039: /* Start of subject assertion */
2040:
2041: case OP_SOD:
2042: if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2043: ecode++;
2044: break;
2045:
2046: /* Multiline mode: start of subject unless notbol, or after any newline. */
2047:
2048: case OP_CIRCM:
2049: if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2050: if (eptr != md->start_subject &&
2051: (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2052: RRETURN(MATCH_NOMATCH);
2053: ecode++;
2054: break;
2055:
2056: /* Start of match assertion */
2057:
2058: case OP_SOM:
2059: if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2060: ecode++;
2061: break;
2062:
2063: /* Reset the start of match point */
2064:
2065: case OP_SET_SOM:
2066: mstart = eptr;
2067: ecode++;
2068: break;
2069:
2070: /* Multiline mode: assert before any newline, or before end of subject
2071: unless noteol is set. */
2072:
2073: case OP_DOLLM:
2074: if (eptr < md->end_subject)
1.1.1.3 ! misho 2075: {
! 2076: if (!IS_NEWLINE(eptr))
! 2077: {
! 2078: if (md->partial != 0 &&
! 2079: eptr + 1 >= md->end_subject &&
! 2080: NLBLOCK->nltype == NLTYPE_FIXED &&
! 2081: NLBLOCK->nllen == 2 &&
! 2082: *eptr == NLBLOCK->nl[0])
! 2083: {
! 2084: md->hitend = TRUE;
! 2085: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
! 2086: }
! 2087: RRETURN(MATCH_NOMATCH);
! 2088: }
! 2089: }
1.1 misho 2090: else
2091: {
2092: if (md->noteol) RRETURN(MATCH_NOMATCH);
2093: SCHECK_PARTIAL();
2094: }
2095: ecode++;
2096: break;
2097:
2098: /* Not multiline mode: assert before a terminating newline or before end of
2099: subject unless noteol is set. */
2100:
2101: case OP_DOLL:
2102: if (md->noteol) RRETURN(MATCH_NOMATCH);
2103: if (!md->endonly) goto ASSERT_NL_OR_EOS;
2104:
2105: /* ... else fall through for endonly */
2106:
2107: /* End of subject assertion (\z) */
2108:
2109: case OP_EOD:
2110: if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2111: SCHECK_PARTIAL();
2112: ecode++;
2113: break;
2114:
2115: /* End of subject or ending \n assertion (\Z) */
2116:
2117: case OP_EODN:
2118: ASSERT_NL_OR_EOS:
2119: if (eptr < md->end_subject &&
2120: (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1.1.1.3 ! misho 2121: {
! 2122: if (md->partial != 0 &&
! 2123: eptr + 1 >= md->end_subject &&
! 2124: NLBLOCK->nltype == NLTYPE_FIXED &&
! 2125: NLBLOCK->nllen == 2 &&
! 2126: *eptr == NLBLOCK->nl[0])
! 2127: {
! 2128: md->hitend = TRUE;
! 2129: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
! 2130: }
1.1 misho 2131: RRETURN(MATCH_NOMATCH);
1.1.1.3 ! misho 2132: }
1.1 misho 2133:
2134: /* Either at end of string or \n before end. */
2135:
2136: SCHECK_PARTIAL();
2137: ecode++;
2138: break;
2139:
2140: /* Word boundary assertions */
2141:
2142: case OP_NOT_WORD_BOUNDARY:
2143: case OP_WORD_BOUNDARY:
2144: {
2145:
2146: /* Find out if the previous and current characters are "word" characters.
2147: It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2148: be "non-word" characters. Remember the earliest consulted character for
2149: partial matching. */
2150:
1.1.1.2 misho 2151: #ifdef SUPPORT_UTF
2152: if (utf)
1.1 misho 2153: {
2154: /* Get status of previous character */
2155:
2156: if (eptr == md->start_subject) prev_is_word = FALSE; else
2157: {
1.1.1.2 misho 2158: PCRE_PUCHAR lastptr = eptr - 1;
2159: BACKCHAR(lastptr);
1.1 misho 2160: if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2161: GETCHAR(c, lastptr);
2162: #ifdef SUPPORT_UCP
2163: if (md->use_ucp)
2164: {
2165: if (c == '_') prev_is_word = TRUE; else
2166: {
2167: int cat = UCD_CATEGORY(c);
2168: prev_is_word = (cat == ucp_L || cat == ucp_N);
2169: }
2170: }
2171: else
2172: #endif
2173: prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2174: }
2175:
2176: /* Get status of next character */
2177:
2178: if (eptr >= md->end_subject)
2179: {
2180: SCHECK_PARTIAL();
2181: cur_is_word = FALSE;
2182: }
2183: else
2184: {
2185: GETCHAR(c, eptr);
2186: #ifdef SUPPORT_UCP
2187: if (md->use_ucp)
2188: {
2189: if (c == '_') cur_is_word = TRUE; else
2190: {
2191: int cat = UCD_CATEGORY(c);
2192: cur_is_word = (cat == ucp_L || cat == ucp_N);
2193: }
2194: }
2195: else
2196: #endif
2197: cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2198: }
2199: }
2200: else
2201: #endif
2202:
2203: /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2204: consistency with the behaviour of \w we do use it in this case. */
2205:
2206: {
2207: /* Get status of previous character */
2208:
2209: if (eptr == md->start_subject) prev_is_word = FALSE; else
2210: {
2211: if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2212: #ifdef SUPPORT_UCP
2213: if (md->use_ucp)
2214: {
2215: c = eptr[-1];
2216: if (c == '_') prev_is_word = TRUE; else
2217: {
2218: int cat = UCD_CATEGORY(c);
2219: prev_is_word = (cat == ucp_L || cat == ucp_N);
2220: }
2221: }
2222: else
2223: #endif
1.1.1.2 misho 2224: prev_is_word = MAX_255(eptr[-1])
2225: && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1.1 misho 2226: }
2227:
2228: /* Get status of next character */
2229:
2230: if (eptr >= md->end_subject)
2231: {
2232: SCHECK_PARTIAL();
2233: cur_is_word = FALSE;
2234: }
2235: else
2236: #ifdef SUPPORT_UCP
2237: if (md->use_ucp)
2238: {
2239: c = *eptr;
2240: if (c == '_') cur_is_word = TRUE; else
2241: {
2242: int cat = UCD_CATEGORY(c);
2243: cur_is_word = (cat == ucp_L || cat == ucp_N);
2244: }
2245: }
2246: else
2247: #endif
1.1.1.2 misho 2248: cur_is_word = MAX_255(*eptr)
2249: && ((md->ctypes[*eptr] & ctype_word) != 0);
1.1 misho 2250: }
2251:
2252: /* Now see if the situation is what we want */
2253:
2254: if ((*ecode++ == OP_WORD_BOUNDARY)?
2255: cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2256: RRETURN(MATCH_NOMATCH);
2257: }
2258: break;
2259:
1.1.1.3 ! misho 2260: /* Match any single character type except newline; have to take care with
! 2261: CRLF newlines and partial matching. */
1.1 misho 2262:
2263: case OP_ANY:
2264: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1.1.1.3 ! misho 2265: if (md->partial != 0 &&
! 2266: eptr + 1 >= md->end_subject &&
! 2267: NLBLOCK->nltype == NLTYPE_FIXED &&
! 2268: NLBLOCK->nllen == 2 &&
! 2269: *eptr == NLBLOCK->nl[0])
! 2270: {
! 2271: md->hitend = TRUE;
! 2272: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
! 2273: }
! 2274:
1.1 misho 2275: /* Fall through */
2276:
1.1.1.3 ! misho 2277: /* Match any single character whatsoever. */
! 2278:
1.1 misho 2279: case OP_ALLANY:
2280: if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2281: { /* not be updated before SCHECK_PARTIAL. */
2282: SCHECK_PARTIAL();
2283: RRETURN(MATCH_NOMATCH);
2284: }
2285: eptr++;
1.1.1.2 misho 2286: #ifdef SUPPORT_UTF
2287: if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2288: #endif
1.1 misho 2289: ecode++;
2290: break;
2291:
2292: /* Match a single byte, even in UTF-8 mode. This opcode really does match
2293: any byte, even newline, independent of the setting of PCRE_DOTALL. */
2294:
2295: case OP_ANYBYTE:
2296: if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2297: { /* not be updated before SCHECK_PARTIAL. */
2298: SCHECK_PARTIAL();
2299: RRETURN(MATCH_NOMATCH);
2300: }
2301: eptr++;
2302: ecode++;
2303: break;
2304:
2305: case OP_NOT_DIGIT:
2306: if (eptr >= md->end_subject)
2307: {
2308: SCHECK_PARTIAL();
2309: RRETURN(MATCH_NOMATCH);
2310: }
2311: GETCHARINCTEST(c, eptr);
2312: if (
1.1.1.2 misho 2313: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
1.1 misho 2314: c < 256 &&
2315: #endif
2316: (md->ctypes[c] & ctype_digit) != 0
2317: )
2318: RRETURN(MATCH_NOMATCH);
2319: ecode++;
2320: break;
2321:
2322: case OP_DIGIT:
2323: if (eptr >= md->end_subject)
2324: {
2325: SCHECK_PARTIAL();
2326: RRETURN(MATCH_NOMATCH);
2327: }
2328: GETCHARINCTEST(c, eptr);
2329: if (
1.1.1.2 misho 2330: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2331: c > 255 ||
1.1 misho 2332: #endif
2333: (md->ctypes[c] & ctype_digit) == 0
2334: )
2335: RRETURN(MATCH_NOMATCH);
2336: ecode++;
2337: break;
2338:
2339: case OP_NOT_WHITESPACE:
2340: if (eptr >= md->end_subject)
2341: {
2342: SCHECK_PARTIAL();
2343: RRETURN(MATCH_NOMATCH);
2344: }
2345: GETCHARINCTEST(c, eptr);
2346: if (
1.1.1.2 misho 2347: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
1.1 misho 2348: c < 256 &&
2349: #endif
2350: (md->ctypes[c] & ctype_space) != 0
2351: )
2352: RRETURN(MATCH_NOMATCH);
2353: ecode++;
2354: break;
2355:
2356: case OP_WHITESPACE:
2357: if (eptr >= md->end_subject)
2358: {
2359: SCHECK_PARTIAL();
2360: RRETURN(MATCH_NOMATCH);
2361: }
2362: GETCHARINCTEST(c, eptr);
2363: if (
1.1.1.2 misho 2364: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2365: c > 255 ||
1.1 misho 2366: #endif
2367: (md->ctypes[c] & ctype_space) == 0
2368: )
2369: RRETURN(MATCH_NOMATCH);
2370: ecode++;
2371: break;
2372:
2373: case OP_NOT_WORDCHAR:
2374: if (eptr >= md->end_subject)
2375: {
2376: SCHECK_PARTIAL();
2377: RRETURN(MATCH_NOMATCH);
2378: }
2379: GETCHARINCTEST(c, eptr);
2380: if (
1.1.1.2 misho 2381: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
1.1 misho 2382: c < 256 &&
2383: #endif
2384: (md->ctypes[c] & ctype_word) != 0
2385: )
2386: RRETURN(MATCH_NOMATCH);
2387: ecode++;
2388: break;
2389:
2390: case OP_WORDCHAR:
2391: if (eptr >= md->end_subject)
2392: {
2393: SCHECK_PARTIAL();
2394: RRETURN(MATCH_NOMATCH);
2395: }
2396: GETCHARINCTEST(c, eptr);
2397: if (
1.1.1.2 misho 2398: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2399: c > 255 ||
1.1 misho 2400: #endif
2401: (md->ctypes[c] & ctype_word) == 0
2402: )
2403: RRETURN(MATCH_NOMATCH);
2404: ecode++;
2405: break;
2406:
2407: case OP_ANYNL:
2408: if (eptr >= md->end_subject)
2409: {
2410: SCHECK_PARTIAL();
2411: RRETURN(MATCH_NOMATCH);
2412: }
2413: GETCHARINCTEST(c, eptr);
2414: switch(c)
2415: {
2416: default: RRETURN(MATCH_NOMATCH);
2417:
2418: case 0x000d:
1.1.1.3 ! misho 2419: if (eptr >= md->end_subject)
! 2420: {
! 2421: SCHECK_PARTIAL();
! 2422: }
! 2423: else if (*eptr == 0x0a) eptr++;
1.1 misho 2424: break;
2425:
2426: case 0x000a:
2427: break;
2428:
2429: case 0x000b:
2430: case 0x000c:
2431: case 0x0085:
2432: case 0x2028:
2433: case 0x2029:
2434: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2435: break;
2436: }
2437: ecode++;
2438: break;
2439:
2440: case OP_NOT_HSPACE:
2441: if (eptr >= md->end_subject)
2442: {
2443: SCHECK_PARTIAL();
2444: RRETURN(MATCH_NOMATCH);
2445: }
2446: GETCHARINCTEST(c, eptr);
2447: switch(c)
2448: {
2449: default: break;
2450: case 0x09: /* HT */
2451: case 0x20: /* SPACE */
2452: case 0xa0: /* NBSP */
2453: case 0x1680: /* OGHAM SPACE MARK */
2454: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2455: case 0x2000: /* EN QUAD */
2456: case 0x2001: /* EM QUAD */
2457: case 0x2002: /* EN SPACE */
2458: case 0x2003: /* EM SPACE */
2459: case 0x2004: /* THREE-PER-EM SPACE */
2460: case 0x2005: /* FOUR-PER-EM SPACE */
2461: case 0x2006: /* SIX-PER-EM SPACE */
2462: case 0x2007: /* FIGURE SPACE */
2463: case 0x2008: /* PUNCTUATION SPACE */
2464: case 0x2009: /* THIN SPACE */
2465: case 0x200A: /* HAIR SPACE */
2466: case 0x202f: /* NARROW NO-BREAK SPACE */
2467: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2468: case 0x3000: /* IDEOGRAPHIC SPACE */
2469: RRETURN(MATCH_NOMATCH);
2470: }
2471: ecode++;
2472: break;
2473:
2474: case OP_HSPACE:
2475: if (eptr >= md->end_subject)
2476: {
2477: SCHECK_PARTIAL();
2478: RRETURN(MATCH_NOMATCH);
2479: }
2480: GETCHARINCTEST(c, eptr);
2481: switch(c)
2482: {
2483: default: RRETURN(MATCH_NOMATCH);
2484: case 0x09: /* HT */
2485: case 0x20: /* SPACE */
2486: case 0xa0: /* NBSP */
2487: case 0x1680: /* OGHAM SPACE MARK */
2488: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2489: case 0x2000: /* EN QUAD */
2490: case 0x2001: /* EM QUAD */
2491: case 0x2002: /* EN SPACE */
2492: case 0x2003: /* EM SPACE */
2493: case 0x2004: /* THREE-PER-EM SPACE */
2494: case 0x2005: /* FOUR-PER-EM SPACE */
2495: case 0x2006: /* SIX-PER-EM SPACE */
2496: case 0x2007: /* FIGURE SPACE */
2497: case 0x2008: /* PUNCTUATION SPACE */
2498: case 0x2009: /* THIN SPACE */
2499: case 0x200A: /* HAIR SPACE */
2500: case 0x202f: /* NARROW NO-BREAK SPACE */
2501: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2502: case 0x3000: /* IDEOGRAPHIC SPACE */
2503: break;
2504: }
2505: ecode++;
2506: break;
2507:
2508: case OP_NOT_VSPACE:
2509: if (eptr >= md->end_subject)
2510: {
2511: SCHECK_PARTIAL();
2512: RRETURN(MATCH_NOMATCH);
2513: }
2514: GETCHARINCTEST(c, eptr);
2515: switch(c)
2516: {
2517: default: break;
2518: case 0x0a: /* LF */
2519: case 0x0b: /* VT */
2520: case 0x0c: /* FF */
2521: case 0x0d: /* CR */
2522: case 0x85: /* NEL */
2523: case 0x2028: /* LINE SEPARATOR */
2524: case 0x2029: /* PARAGRAPH SEPARATOR */
2525: RRETURN(MATCH_NOMATCH);
2526: }
2527: ecode++;
2528: break;
2529:
2530: case OP_VSPACE:
2531: if (eptr >= md->end_subject)
2532: {
2533: SCHECK_PARTIAL();
2534: RRETURN(MATCH_NOMATCH);
2535: }
2536: GETCHARINCTEST(c, eptr);
2537: switch(c)
2538: {
2539: default: RRETURN(MATCH_NOMATCH);
2540: case 0x0a: /* LF */
2541: case 0x0b: /* VT */
2542: case 0x0c: /* FF */
2543: case 0x0d: /* CR */
2544: case 0x85: /* NEL */
2545: case 0x2028: /* LINE SEPARATOR */
2546: case 0x2029: /* PARAGRAPH SEPARATOR */
2547: break;
2548: }
2549: ecode++;
2550: break;
2551:
2552: #ifdef SUPPORT_UCP
2553: /* Check the next character by Unicode property. We will get here only
2554: if the support is in the binary; otherwise a compile-time error occurs. */
2555:
2556: case OP_PROP:
2557: case OP_NOTPROP:
2558: if (eptr >= md->end_subject)
2559: {
2560: SCHECK_PARTIAL();
2561: RRETURN(MATCH_NOMATCH);
2562: }
2563: GETCHARINCTEST(c, eptr);
2564: {
2565: const ucd_record *prop = GET_UCD(c);
2566:
2567: switch(ecode[1])
2568: {
2569: case PT_ANY:
2570: if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2571: break;
2572:
2573: case PT_LAMP:
2574: if ((prop->chartype == ucp_Lu ||
2575: prop->chartype == ucp_Ll ||
2576: prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2577: RRETURN(MATCH_NOMATCH);
2578: break;
2579:
2580: case PT_GC:
1.1.1.2 misho 2581: if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
1.1 misho 2582: RRETURN(MATCH_NOMATCH);
2583: break;
2584:
2585: case PT_PC:
2586: if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2587: RRETURN(MATCH_NOMATCH);
2588: break;
2589:
2590: case PT_SC:
2591: if ((ecode[2] != prop->script) == (op == OP_PROP))
2592: RRETURN(MATCH_NOMATCH);
2593: break;
2594:
2595: /* These are specials */
2596:
2597: case PT_ALNUM:
1.1.1.2 misho 2598: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2599: PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
1.1 misho 2600: RRETURN(MATCH_NOMATCH);
2601: break;
2602:
2603: case PT_SPACE: /* Perl space */
1.1.1.2 misho 2604: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1 misho 2605: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2606: == (op == OP_NOTPROP))
2607: RRETURN(MATCH_NOMATCH);
2608: break;
2609:
2610: case PT_PXSPACE: /* POSIX space */
1.1.1.2 misho 2611: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1 misho 2612: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2613: c == CHAR_FF || c == CHAR_CR)
2614: == (op == OP_NOTPROP))
2615: RRETURN(MATCH_NOMATCH);
2616: break;
2617:
2618: case PT_WORD:
1.1.1.2 misho 2619: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2620: PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1.1 misho 2621: c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2622: RRETURN(MATCH_NOMATCH);
2623: break;
2624:
2625: /* This should never occur */
2626:
2627: default:
2628: RRETURN(PCRE_ERROR_INTERNAL);
2629: }
2630:
2631: ecode += 3;
2632: }
2633: break;
2634:
2635: /* Match an extended Unicode sequence. We will get here only if the support
2636: is in the binary; otherwise a compile-time error occurs. */
2637:
2638: case OP_EXTUNI:
2639: if (eptr >= md->end_subject)
2640: {
2641: SCHECK_PARTIAL();
2642: RRETURN(MATCH_NOMATCH);
2643: }
2644: GETCHARINCTEST(c, eptr);
2645: if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2646: while (eptr < md->end_subject)
2647: {
2648: int len = 1;
1.1.1.2 misho 2649: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
1.1 misho 2650: if (UCD_CATEGORY(c) != ucp_M) break;
2651: eptr += len;
2652: }
1.1.1.3 ! misho 2653: CHECK_PARTIAL();
1.1 misho 2654: ecode++;
2655: break;
2656: #endif
2657:
2658:
2659: /* Match a back reference, possibly repeatedly. Look past the end of the
2660: item to see if there is repeat information following. The code is similar
2661: to that for character classes, but repeated for efficiency. Then obey
2662: similar code to character type repeats - written out again for speed.
2663: However, if the referenced string is the empty string, always treat
2664: it as matched, any number of times (otherwise there could be infinite
2665: loops). */
2666:
2667: case OP_REF:
2668: case OP_REFI:
2669: caseless = op == OP_REFI;
2670: offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1.1.1.2 misho 2671: ecode += 1 + IMM2_SIZE;
1.1 misho 2672:
2673: /* If the reference is unset, there are two possibilities:
2674:
2675: (a) In the default, Perl-compatible state, set the length negative;
2676: this ensures that every attempt at a match fails. We can't just fail
2677: here, because of the possibility of quantifiers with zero minima.
2678:
2679: (b) If the JavaScript compatibility flag is set, set the length to zero
2680: so that the back reference matches an empty string.
2681:
2682: Otherwise, set the length to the length of what was matched by the
2683: referenced subpattern. */
2684:
2685: if (offset >= offset_top || md->offset_vector[offset] < 0)
2686: length = (md->jscript_compat)? 0 : -1;
2687: else
2688: length = md->offset_vector[offset+1] - md->offset_vector[offset];
2689:
2690: /* Set up for repetition, or handle the non-repeated case */
2691:
2692: switch (*ecode)
2693: {
2694: case OP_CRSTAR:
2695: case OP_CRMINSTAR:
2696: case OP_CRPLUS:
2697: case OP_CRMINPLUS:
2698: case OP_CRQUERY:
2699: case OP_CRMINQUERY:
2700: c = *ecode++ - OP_CRSTAR;
2701: minimize = (c & 1) != 0;
2702: min = rep_min[c]; /* Pick up values from tables; */
2703: max = rep_max[c]; /* zero for max => infinity */
2704: if (max == 0) max = INT_MAX;
2705: break;
2706:
2707: case OP_CRRANGE:
2708: case OP_CRMINRANGE:
2709: minimize = (*ecode == OP_CRMINRANGE);
2710: min = GET2(ecode, 1);
1.1.1.2 misho 2711: max = GET2(ecode, 1 + IMM2_SIZE);
1.1 misho 2712: if (max == 0) max = INT_MAX;
1.1.1.2 misho 2713: ecode += 1 + 2 * IMM2_SIZE;
1.1 misho 2714: break;
2715:
2716: default: /* No repeat follows */
2717: if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2718: {
1.1.1.3 ! misho 2719: if (length == -2) eptr = md->end_subject; /* Partial match */
1.1 misho 2720: CHECK_PARTIAL();
2721: RRETURN(MATCH_NOMATCH);
2722: }
2723: eptr += length;
2724: continue; /* With the main loop */
2725: }
2726:
2727: /* Handle repeated back references. If the length of the reference is
1.1.1.2 misho 2728: zero, just continue with the main loop. If the length is negative, it
2729: means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2730: zero, we can continue at the same level without recursion. For any other
2731: minimum, carrying on will result in NOMATCH. */
1.1 misho 2732:
2733: if (length == 0) continue;
1.1.1.2 misho 2734: if (length < 0 && min == 0) continue;
1.1 misho 2735:
2736: /* First, ensure the minimum number of matches are present. We get back
2737: the length of the reference string explicitly rather than passing the
2738: address of eptr, so that eptr can be a register variable. */
2739:
2740: for (i = 1; i <= min; i++)
2741: {
2742: int slength;
2743: if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2744: {
1.1.1.3 ! misho 2745: if (slength == -2) eptr = md->end_subject; /* Partial match */
1.1 misho 2746: CHECK_PARTIAL();
2747: RRETURN(MATCH_NOMATCH);
2748: }
2749: eptr += slength;
2750: }
2751:
2752: /* If min = max, continue at the same level without recursion.
2753: They are not both allowed to be zero. */
2754:
2755: if (min == max) continue;
2756:
2757: /* If minimizing, keep trying and advancing the pointer */
2758:
2759: if (minimize)
2760: {
2761: for (fi = min;; fi++)
2762: {
2763: int slength;
2764: RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2765: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2766: if (fi >= max) RRETURN(MATCH_NOMATCH);
2767: if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2768: {
1.1.1.3 ! misho 2769: if (slength == -2) eptr = md->end_subject; /* Partial match */
1.1 misho 2770: CHECK_PARTIAL();
2771: RRETURN(MATCH_NOMATCH);
2772: }
2773: eptr += slength;
2774: }
2775: /* Control never gets here */
2776: }
2777:
2778: /* If maximizing, find the longest string and work backwards */
2779:
2780: else
2781: {
2782: pp = eptr;
2783: for (i = min; i < max; i++)
2784: {
2785: int slength;
2786: if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2787: {
1.1.1.3 ! misho 2788: /* Can't use CHECK_PARTIAL because we don't want to update eptr in
! 2789: the soft partial matching case. */
! 2790:
! 2791: if (slength == -2 && md->partial != 0 &&
! 2792: md->end_subject > md->start_used_ptr)
! 2793: {
! 2794: md->hitend = TRUE;
! 2795: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
! 2796: }
1.1 misho 2797: break;
2798: }
2799: eptr += slength;
2800: }
1.1.1.3 ! misho 2801:
1.1 misho 2802: while (eptr >= pp)
2803: {
2804: RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2805: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2806: eptr -= length;
2807: }
2808: RRETURN(MATCH_NOMATCH);
2809: }
2810: /* Control never gets here */
2811:
2812: /* Match a bit-mapped character class, possibly repeatedly. This op code is
2813: used when all the characters in the class have values in the range 0-255,
2814: and either the matching is caseful, or the characters are in the range
2815: 0-127 when UTF-8 processing is enabled. The only difference between
2816: OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2817: encountered.
2818:
2819: First, look past the end of the item to see if there is repeat information
2820: following. Then obey similar code to character type repeats - written out
2821: again for speed. */
2822:
2823: case OP_NCLASS:
2824: case OP_CLASS:
2825: {
1.1.1.2 misho 2826: /* The data variable is saved across frames, so the byte map needs to
2827: be stored there. */
2828: #define BYTE_MAP ((pcre_uint8 *)data)
1.1 misho 2829: data = ecode + 1; /* Save for matching */
1.1.1.2 misho 2830: ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
1.1 misho 2831:
2832: switch (*ecode)
2833: {
2834: case OP_CRSTAR:
2835: case OP_CRMINSTAR:
2836: case OP_CRPLUS:
2837: case OP_CRMINPLUS:
2838: case OP_CRQUERY:
2839: case OP_CRMINQUERY:
2840: c = *ecode++ - OP_CRSTAR;
2841: minimize = (c & 1) != 0;
2842: min = rep_min[c]; /* Pick up values from tables; */
2843: max = rep_max[c]; /* zero for max => infinity */
2844: if (max == 0) max = INT_MAX;
2845: break;
2846:
2847: case OP_CRRANGE:
2848: case OP_CRMINRANGE:
2849: minimize = (*ecode == OP_CRMINRANGE);
2850: min = GET2(ecode, 1);
1.1.1.2 misho 2851: max = GET2(ecode, 1 + IMM2_SIZE);
1.1 misho 2852: if (max == 0) max = INT_MAX;
1.1.1.2 misho 2853: ecode += 1 + 2 * IMM2_SIZE;
1.1 misho 2854: break;
2855:
2856: default: /* No repeat follows */
2857: min = max = 1;
2858: break;
2859: }
2860:
2861: /* First, ensure the minimum number of matches are present. */
2862:
1.1.1.2 misho 2863: #ifdef SUPPORT_UTF
2864: if (utf)
1.1 misho 2865: {
2866: for (i = 1; i <= min; i++)
2867: {
2868: if (eptr >= md->end_subject)
2869: {
2870: SCHECK_PARTIAL();
2871: RRETURN(MATCH_NOMATCH);
2872: }
2873: GETCHARINC(c, eptr);
2874: if (c > 255)
2875: {
2876: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2877: }
2878: else
1.1.1.2 misho 2879: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1.1 misho 2880: }
2881: }
2882: else
2883: #endif
1.1.1.2 misho 2884: /* Not UTF mode */
1.1 misho 2885: {
2886: for (i = 1; i <= min; i++)
2887: {
2888: if (eptr >= md->end_subject)
2889: {
2890: SCHECK_PARTIAL();
2891: RRETURN(MATCH_NOMATCH);
2892: }
2893: c = *eptr++;
1.1.1.2 misho 2894: #ifndef COMPILE_PCRE8
2895: if (c > 255)
2896: {
2897: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2898: }
2899: else
2900: #endif
2901: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1.1 misho 2902: }
2903: }
2904:
2905: /* If max == min we can continue with the main loop without the
2906: need to recurse. */
2907:
2908: if (min == max) continue;
2909:
2910: /* If minimizing, keep testing the rest of the expression and advancing
2911: the pointer while it matches the class. */
2912:
2913: if (minimize)
2914: {
1.1.1.2 misho 2915: #ifdef SUPPORT_UTF
2916: if (utf)
1.1 misho 2917: {
2918: for (fi = min;; fi++)
2919: {
2920: RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2921: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2922: if (fi >= max) RRETURN(MATCH_NOMATCH);
2923: if (eptr >= md->end_subject)
2924: {
2925: SCHECK_PARTIAL();
2926: RRETURN(MATCH_NOMATCH);
2927: }
2928: GETCHARINC(c, eptr);
2929: if (c > 255)
2930: {
2931: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2932: }
2933: else
1.1.1.2 misho 2934: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1.1 misho 2935: }
2936: }
2937: else
2938: #endif
1.1.1.2 misho 2939: /* Not UTF mode */
1.1 misho 2940: {
2941: for (fi = min;; fi++)
2942: {
2943: RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2944: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2945: if (fi >= max) RRETURN(MATCH_NOMATCH);
2946: if (eptr >= md->end_subject)
2947: {
2948: SCHECK_PARTIAL();
2949: RRETURN(MATCH_NOMATCH);
2950: }
2951: c = *eptr++;
1.1.1.2 misho 2952: #ifndef COMPILE_PCRE8
2953: if (c > 255)
2954: {
2955: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2956: }
2957: else
2958: #endif
2959: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1.1 misho 2960: }
2961: }
2962: /* Control never gets here */
2963: }
2964:
2965: /* If maximizing, find the longest possible run, then work backwards. */
2966:
2967: else
2968: {
2969: pp = eptr;
2970:
1.1.1.2 misho 2971: #ifdef SUPPORT_UTF
2972: if (utf)
1.1 misho 2973: {
2974: for (i = min; i < max; i++)
2975: {
2976: int len = 1;
2977: if (eptr >= md->end_subject)
2978: {
2979: SCHECK_PARTIAL();
2980: break;
2981: }
2982: GETCHARLEN(c, eptr, len);
2983: if (c > 255)
2984: {
2985: if (op == OP_CLASS) break;
2986: }
2987: else
1.1.1.2 misho 2988: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
1.1 misho 2989: eptr += len;
2990: }
2991: for (;;)
2992: {
2993: RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2994: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2995: if (eptr-- == pp) break; /* Stop if tried at original pos */
2996: BACKCHAR(eptr);
2997: }
2998: }
2999: else
3000: #endif
1.1.1.2 misho 3001: /* Not UTF mode */
1.1 misho 3002: {
3003: for (i = min; i < max; i++)
3004: {
3005: if (eptr >= md->end_subject)
3006: {
3007: SCHECK_PARTIAL();
3008: break;
3009: }
3010: c = *eptr;
1.1.1.2 misho 3011: #ifndef COMPILE_PCRE8
3012: if (c > 255)
3013: {
3014: if (op == OP_CLASS) break;
3015: }
3016: else
3017: #endif
3018: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
1.1 misho 3019: eptr++;
3020: }
3021: while (eptr >= pp)
3022: {
3023: RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3024: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3025: eptr--;
3026: }
3027: }
3028:
3029: RRETURN(MATCH_NOMATCH);
3030: }
1.1.1.2 misho 3031: #undef BYTE_MAP
1.1 misho 3032: }
3033: /* Control never gets here */
3034:
3035:
3036: /* Match an extended character class. This opcode is encountered only
3037: when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3038: mode, because Unicode properties are supported in non-UTF-8 mode. */
3039:
1.1.1.2 misho 3040: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1.1 misho 3041: case OP_XCLASS:
3042: {
3043: data = ecode + 1 + LINK_SIZE; /* Save for matching */
3044: ecode += GET(ecode, 1); /* Advance past the item */
3045:
3046: switch (*ecode)
3047: {
3048: case OP_CRSTAR:
3049: case OP_CRMINSTAR:
3050: case OP_CRPLUS:
3051: case OP_CRMINPLUS:
3052: case OP_CRQUERY:
3053: case OP_CRMINQUERY:
3054: c = *ecode++ - OP_CRSTAR;
3055: minimize = (c & 1) != 0;
3056: min = rep_min[c]; /* Pick up values from tables; */
3057: max = rep_max[c]; /* zero for max => infinity */
3058: if (max == 0) max = INT_MAX;
3059: break;
3060:
3061: case OP_CRRANGE:
3062: case OP_CRMINRANGE:
3063: minimize = (*ecode == OP_CRMINRANGE);
3064: min = GET2(ecode, 1);
1.1.1.2 misho 3065: max = GET2(ecode, 1 + IMM2_SIZE);
1.1 misho 3066: if (max == 0) max = INT_MAX;
1.1.1.2 misho 3067: ecode += 1 + 2 * IMM2_SIZE;
1.1 misho 3068: break;
3069:
3070: default: /* No repeat follows */
3071: min = max = 1;
3072: break;
3073: }
3074:
3075: /* First, ensure the minimum number of matches are present. */
3076:
3077: for (i = 1; i <= min; i++)
3078: {
3079: if (eptr >= md->end_subject)
3080: {
3081: SCHECK_PARTIAL();
3082: RRETURN(MATCH_NOMATCH);
3083: }
3084: GETCHARINCTEST(c, eptr);
1.1.1.2 misho 3085: if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
1.1 misho 3086: }
3087:
3088: /* If max == min we can continue with the main loop without the
3089: need to recurse. */
3090:
3091: if (min == max) continue;
3092:
3093: /* If minimizing, keep testing the rest of the expression and advancing
3094: the pointer while it matches the class. */
3095:
3096: if (minimize)
3097: {
3098: for (fi = min;; fi++)
3099: {
3100: RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3101: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3102: if (fi >= max) RRETURN(MATCH_NOMATCH);
3103: if (eptr >= md->end_subject)
3104: {
3105: SCHECK_PARTIAL();
3106: RRETURN(MATCH_NOMATCH);
3107: }
3108: GETCHARINCTEST(c, eptr);
1.1.1.2 misho 3109: if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
1.1 misho 3110: }
3111: /* Control never gets here */
3112: }
3113:
3114: /* If maximizing, find the longest possible run, then work backwards. */
3115:
3116: else
3117: {
3118: pp = eptr;
3119: for (i = min; i < max; i++)
3120: {
3121: int len = 1;
3122: if (eptr >= md->end_subject)
3123: {
3124: SCHECK_PARTIAL();
3125: break;
3126: }
1.1.1.2 misho 3127: #ifdef SUPPORT_UTF
1.1 misho 3128: GETCHARLENTEST(c, eptr, len);
1.1.1.2 misho 3129: #else
3130: c = *eptr;
3131: #endif
3132: if (!PRIV(xclass)(c, data, utf)) break;
1.1 misho 3133: eptr += len;
3134: }
3135: for(;;)
3136: {
3137: RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3138: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3139: if (eptr-- == pp) break; /* Stop if tried at original pos */
1.1.1.2 misho 3140: #ifdef SUPPORT_UTF
3141: if (utf) BACKCHAR(eptr);
3142: #endif
1.1 misho 3143: }
3144: RRETURN(MATCH_NOMATCH);
3145: }
3146:
3147: /* Control never gets here */
3148: }
3149: #endif /* End of XCLASS */
3150:
3151: /* Match a single character, casefully */
3152:
3153: case OP_CHAR:
1.1.1.2 misho 3154: #ifdef SUPPORT_UTF
3155: if (utf)
1.1 misho 3156: {
3157: length = 1;
3158: ecode++;
3159: GETCHARLEN(fc, ecode, length);
3160: if (length > md->end_subject - eptr)
3161: {
3162: CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3163: RRETURN(MATCH_NOMATCH);
3164: }
3165: while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3166: }
3167: else
3168: #endif
1.1.1.2 misho 3169: /* Not UTF mode */
1.1 misho 3170: {
3171: if (md->end_subject - eptr < 1)
3172: {
3173: SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3174: RRETURN(MATCH_NOMATCH);
3175: }
3176: if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3177: ecode += 2;
3178: }
3179: break;
3180:
3181: /* Match a single character, caselessly. If we are at the end of the
3182: subject, give up immediately. */
3183:
3184: case OP_CHARI:
3185: if (eptr >= md->end_subject)
3186: {
3187: SCHECK_PARTIAL();
3188: RRETURN(MATCH_NOMATCH);
3189: }
3190:
1.1.1.2 misho 3191: #ifdef SUPPORT_UTF
3192: if (utf)
1.1 misho 3193: {
3194: length = 1;
3195: ecode++;
3196: GETCHARLEN(fc, ecode, length);
3197:
3198: /* If the pattern character's value is < 128, we have only one byte, and
3199: we know that its other case must also be one byte long, so we can use the
3200: fast lookup table. We know that there is at least one byte left in the
3201: subject. */
3202:
3203: if (fc < 128)
3204: {
1.1.1.2 misho 3205: if (md->lcc[fc]
3206: != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3207: ecode++;
3208: eptr++;
1.1 misho 3209: }
3210:
3211: /* Otherwise we must pick up the subject character. Note that we cannot
3212: use the value of "length" to check for sufficient bytes left, because the
3213: other case of the character may have more or fewer bytes. */
3214:
3215: else
3216: {
3217: unsigned int dc;
3218: GETCHARINC(dc, eptr);
3219: ecode += length;
3220:
3221: /* If we have Unicode property support, we can use it to test the other
3222: case of the character, if there is one. */
3223:
3224: if (fc != dc)
3225: {
3226: #ifdef SUPPORT_UCP
3227: if (dc != UCD_OTHERCASE(fc))
3228: #endif
3229: RRETURN(MATCH_NOMATCH);
3230: }
3231: }
3232: }
3233: else
1.1.1.2 misho 3234: #endif /* SUPPORT_UTF */
1.1 misho 3235:
1.1.1.2 misho 3236: /* Not UTF mode */
1.1 misho 3237: {
1.1.1.2 misho 3238: if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3239: != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3240: eptr++;
1.1 misho 3241: ecode += 2;
3242: }
3243: break;
3244:
3245: /* Match a single character repeatedly. */
3246:
3247: case OP_EXACT:
3248: case OP_EXACTI:
3249: min = max = GET2(ecode, 1);
1.1.1.2 misho 3250: ecode += 1 + IMM2_SIZE;
1.1 misho 3251: goto REPEATCHAR;
3252:
3253: case OP_POSUPTO:
3254: case OP_POSUPTOI:
3255: possessive = TRUE;
3256: /* Fall through */
3257:
3258: case OP_UPTO:
3259: case OP_UPTOI:
3260: case OP_MINUPTO:
3261: case OP_MINUPTOI:
3262: min = 0;
3263: max = GET2(ecode, 1);
3264: minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
1.1.1.2 misho 3265: ecode += 1 + IMM2_SIZE;
1.1 misho 3266: goto REPEATCHAR;
3267:
3268: case OP_POSSTAR:
3269: case OP_POSSTARI:
3270: possessive = TRUE;
3271: min = 0;
3272: max = INT_MAX;
3273: ecode++;
3274: goto REPEATCHAR;
3275:
3276: case OP_POSPLUS:
3277: case OP_POSPLUSI:
3278: possessive = TRUE;
3279: min = 1;
3280: max = INT_MAX;
3281: ecode++;
3282: goto REPEATCHAR;
3283:
3284: case OP_POSQUERY:
3285: case OP_POSQUERYI:
3286: possessive = TRUE;
3287: min = 0;
3288: max = 1;
3289: ecode++;
3290: goto REPEATCHAR;
3291:
3292: case OP_STAR:
3293: case OP_STARI:
3294: case OP_MINSTAR:
3295: case OP_MINSTARI:
3296: case OP_PLUS:
3297: case OP_PLUSI:
3298: case OP_MINPLUS:
3299: case OP_MINPLUSI:
3300: case OP_QUERY:
3301: case OP_QUERYI:
3302: case OP_MINQUERY:
3303: case OP_MINQUERYI:
3304: c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3305: minimize = (c & 1) != 0;
3306: min = rep_min[c]; /* Pick up values from tables; */
3307: max = rep_max[c]; /* zero for max => infinity */
3308: if (max == 0) max = INT_MAX;
3309:
3310: /* Common code for all repeated single-character matches. */
3311:
3312: REPEATCHAR:
1.1.1.2 misho 3313: #ifdef SUPPORT_UTF
3314: if (utf)
1.1 misho 3315: {
3316: length = 1;
3317: charptr = ecode;
3318: GETCHARLEN(fc, ecode, length);
3319: ecode += length;
3320:
3321: /* Handle multibyte character matching specially here. There is
3322: support for caseless matching if UCP support is present. */
3323:
3324: if (length > 1)
3325: {
3326: #ifdef SUPPORT_UCP
3327: unsigned int othercase;
3328: if (op >= OP_STARI && /* Caseless */
3329: (othercase = UCD_OTHERCASE(fc)) != fc)
1.1.1.2 misho 3330: oclength = PRIV(ord2utf)(othercase, occhars);
1.1 misho 3331: else oclength = 0;
3332: #endif /* SUPPORT_UCP */
3333:
3334: for (i = 1; i <= min; i++)
3335: {
3336: if (eptr <= md->end_subject - length &&
1.1.1.2 misho 3337: memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
1.1 misho 3338: #ifdef SUPPORT_UCP
3339: else if (oclength > 0 &&
3340: eptr <= md->end_subject - oclength &&
1.1.1.2 misho 3341: memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
1.1 misho 3342: #endif /* SUPPORT_UCP */
3343: else
3344: {
3345: CHECK_PARTIAL();
3346: RRETURN(MATCH_NOMATCH);
3347: }
3348: }
3349:
3350: if (min == max) continue;
3351:
3352: if (minimize)
3353: {
3354: for (fi = min;; fi++)
3355: {
3356: RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3357: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3358: if (fi >= max) RRETURN(MATCH_NOMATCH);
3359: if (eptr <= md->end_subject - length &&
1.1.1.2 misho 3360: memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
1.1 misho 3361: #ifdef SUPPORT_UCP
3362: else if (oclength > 0 &&
3363: eptr <= md->end_subject - oclength &&
1.1.1.2 misho 3364: memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
1.1 misho 3365: #endif /* SUPPORT_UCP */
3366: else
3367: {
3368: CHECK_PARTIAL();
3369: RRETURN(MATCH_NOMATCH);
3370: }
3371: }
3372: /* Control never gets here */
3373: }
3374:
3375: else /* Maximize */
3376: {
3377: pp = eptr;
3378: for (i = min; i < max; i++)
3379: {
3380: if (eptr <= md->end_subject - length &&
1.1.1.2 misho 3381: memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
1.1 misho 3382: #ifdef SUPPORT_UCP
3383: else if (oclength > 0 &&
3384: eptr <= md->end_subject - oclength &&
1.1.1.2 misho 3385: memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
1.1 misho 3386: #endif /* SUPPORT_UCP */
3387: else
3388: {
3389: CHECK_PARTIAL();
3390: break;
3391: }
3392: }
3393:
3394: if (possessive) continue;
3395:
3396: for(;;)
3397: {
3398: RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3399: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3400: if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3401: #ifdef SUPPORT_UCP
3402: eptr--;
3403: BACKCHAR(eptr);
3404: #else /* without SUPPORT_UCP */
3405: eptr -= length;
3406: #endif /* SUPPORT_UCP */
3407: }
3408: }
3409: /* Control never gets here */
3410: }
3411:
3412: /* If the length of a UTF-8 character is 1, we fall through here, and
3413: obey the code as for non-UTF-8 characters below, though in this case the
3414: value of fc will always be < 128. */
3415: }
3416: else
1.1.1.2 misho 3417: #endif /* SUPPORT_UTF */
3418: /* When not in UTF-8 mode, load a single-byte character. */
3419: fc = *ecode++;
1.1 misho 3420:
1.1.1.2 misho 3421: /* The value of fc at this point is always one character, though we may
3422: or may not be in UTF mode. The code is duplicated for the caseless and
1.1 misho 3423: caseful cases, for speed, since matching characters is likely to be quite
3424: common. First, ensure the minimum number of matches are present. If min =
3425: max, continue at the same level without recursing. Otherwise, if
3426: minimizing, keep trying the rest of the expression and advancing one
3427: matching character if failing, up to the maximum. Alternatively, if
3428: maximizing, find the maximum number of characters and work backwards. */
3429:
3430: DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
1.1.1.3 ! misho 3431: max, (char *)eptr));
1.1 misho 3432:
3433: if (op >= OP_STARI) /* Caseless */
3434: {
1.1.1.2 misho 3435: #ifdef COMPILE_PCRE8
3436: /* fc must be < 128 if UTF is enabled. */
3437: foc = md->fcc[fc];
3438: #else
3439: #ifdef SUPPORT_UTF
3440: #ifdef SUPPORT_UCP
3441: if (utf && fc > 127)
3442: foc = UCD_OTHERCASE(fc);
3443: #else
3444: if (utf && fc > 127)
3445: foc = fc;
3446: #endif /* SUPPORT_UCP */
3447: else
3448: #endif /* SUPPORT_UTF */
3449: foc = TABLE_GET(fc, md->fcc, fc);
3450: #endif /* COMPILE_PCRE8 */
3451:
1.1 misho 3452: for (i = 1; i <= min; i++)
3453: {
3454: if (eptr >= md->end_subject)
3455: {
3456: SCHECK_PARTIAL();
3457: RRETURN(MATCH_NOMATCH);
3458: }
1.1.1.2 misho 3459: if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3460: eptr++;
1.1 misho 3461: }
3462: if (min == max) continue;
3463: if (minimize)
3464: {
3465: for (fi = min;; fi++)
3466: {
3467: RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3468: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3469: if (fi >= max) RRETURN(MATCH_NOMATCH);
3470: if (eptr >= md->end_subject)
3471: {
3472: SCHECK_PARTIAL();
3473: RRETURN(MATCH_NOMATCH);
3474: }
1.1.1.2 misho 3475: if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3476: eptr++;
1.1 misho 3477: }
3478: /* Control never gets here */
3479: }
3480: else /* Maximize */
3481: {
3482: pp = eptr;
3483: for (i = min; i < max; i++)
3484: {
3485: if (eptr >= md->end_subject)
3486: {
3487: SCHECK_PARTIAL();
3488: break;
3489: }
1.1.1.2 misho 3490: if (fc != *eptr && foc != *eptr) break;
1.1 misho 3491: eptr++;
3492: }
3493:
3494: if (possessive) continue;
3495:
3496: while (eptr >= pp)
3497: {
3498: RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3499: eptr--;
3500: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3501: }
3502: RRETURN(MATCH_NOMATCH);
3503: }
3504: /* Control never gets here */
3505: }
3506:
3507: /* Caseful comparisons (includes all multi-byte characters) */
3508:
3509: else
3510: {
3511: for (i = 1; i <= min; i++)
3512: {
3513: if (eptr >= md->end_subject)
3514: {
3515: SCHECK_PARTIAL();
3516: RRETURN(MATCH_NOMATCH);
3517: }
3518: if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3519: }
3520:
3521: if (min == max) continue;
3522:
3523: if (minimize)
3524: {
3525: for (fi = min;; fi++)
3526: {
3527: RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3528: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3529: if (fi >= max) RRETURN(MATCH_NOMATCH);
3530: if (eptr >= md->end_subject)
3531: {
3532: SCHECK_PARTIAL();
3533: RRETURN(MATCH_NOMATCH);
3534: }
3535: if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3536: }
3537: /* Control never gets here */
3538: }
3539: else /* Maximize */
3540: {
3541: pp = eptr;
3542: for (i = min; i < max; i++)
3543: {
3544: if (eptr >= md->end_subject)
3545: {
3546: SCHECK_PARTIAL();
3547: break;
3548: }
3549: if (fc != *eptr) break;
3550: eptr++;
3551: }
3552: if (possessive) continue;
3553:
3554: while (eptr >= pp)
3555: {
3556: RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3557: eptr--;
3558: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3559: }
3560: RRETURN(MATCH_NOMATCH);
3561: }
3562: }
3563: /* Control never gets here */
3564:
3565: /* Match a negated single one-byte character. The character we are
3566: checking can be multibyte. */
3567:
3568: case OP_NOT:
3569: case OP_NOTI:
3570: if (eptr >= md->end_subject)
3571: {
3572: SCHECK_PARTIAL();
3573: RRETURN(MATCH_NOMATCH);
3574: }
1.1.1.3 ! misho 3575: #ifdef SUPPORT_UTF
! 3576: if (utf)
1.1 misho 3577: {
1.1.1.2 misho 3578: register unsigned int ch, och;
1.1.1.3 ! misho 3579:
! 3580: ecode++;
! 3581: GETCHARINC(ch, ecode);
! 3582: GETCHARINC(c, eptr);
! 3583:
! 3584: if (op == OP_NOT)
! 3585: {
! 3586: if (ch == c) RRETURN(MATCH_NOMATCH);
! 3587: }
! 3588: else
! 3589: {
1.1.1.2 misho 3590: #ifdef SUPPORT_UCP
1.1.1.3 ! misho 3591: if (ch > 127)
! 3592: och = UCD_OTHERCASE(ch);
1.1.1.2 misho 3593: #else
1.1.1.3 ! misho 3594: if (ch > 127)
! 3595: och = ch;
1.1.1.2 misho 3596: #endif /* SUPPORT_UCP */
1.1.1.3 ! misho 3597: else
! 3598: och = TABLE_GET(ch, md->fcc, ch);
! 3599: if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
! 3600: }
1.1 misho 3601: }
1.1.1.3 ! misho 3602: else
! 3603: #endif
1.1 misho 3604: {
1.1.1.3 ! misho 3605: register unsigned int ch = ecode[1];
! 3606: c = *eptr++;
! 3607: if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
! 3608: RRETURN(MATCH_NOMATCH);
! 3609: ecode += 2;
1.1 misho 3610: }
3611: break;
3612:
3613: /* Match a negated single one-byte character repeatedly. This is almost a
3614: repeat of the code for a repeated single character, but I haven't found a
3615: nice way of commoning these up that doesn't require a test of the
3616: positive/negative option for each character match. Maybe that wouldn't add
3617: very much to the time taken, but character matching *is* what this is all
3618: about... */
3619:
3620: case OP_NOTEXACT:
3621: case OP_NOTEXACTI:
3622: min = max = GET2(ecode, 1);
1.1.1.2 misho 3623: ecode += 1 + IMM2_SIZE;
1.1 misho 3624: goto REPEATNOTCHAR;
3625:
3626: case OP_NOTUPTO:
3627: case OP_NOTUPTOI:
3628: case OP_NOTMINUPTO:
3629: case OP_NOTMINUPTOI:
3630: min = 0;
3631: max = GET2(ecode, 1);
3632: minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
1.1.1.2 misho 3633: ecode += 1 + IMM2_SIZE;
1.1 misho 3634: goto REPEATNOTCHAR;
3635:
3636: case OP_NOTPOSSTAR:
3637: case OP_NOTPOSSTARI:
3638: possessive = TRUE;
3639: min = 0;
3640: max = INT_MAX;
3641: ecode++;
3642: goto REPEATNOTCHAR;
3643:
3644: case OP_NOTPOSPLUS:
3645: case OP_NOTPOSPLUSI:
3646: possessive = TRUE;
3647: min = 1;
3648: max = INT_MAX;
3649: ecode++;
3650: goto REPEATNOTCHAR;
3651:
3652: case OP_NOTPOSQUERY:
3653: case OP_NOTPOSQUERYI:
3654: possessive = TRUE;
3655: min = 0;
3656: max = 1;
3657: ecode++;
3658: goto REPEATNOTCHAR;
3659:
3660: case OP_NOTPOSUPTO:
3661: case OP_NOTPOSUPTOI:
3662: possessive = TRUE;
3663: min = 0;
3664: max = GET2(ecode, 1);
1.1.1.2 misho 3665: ecode += 1 + IMM2_SIZE;
1.1 misho 3666: goto REPEATNOTCHAR;
3667:
3668: case OP_NOTSTAR:
3669: case OP_NOTSTARI:
3670: case OP_NOTMINSTAR:
3671: case OP_NOTMINSTARI:
3672: case OP_NOTPLUS:
3673: case OP_NOTPLUSI:
3674: case OP_NOTMINPLUS:
3675: case OP_NOTMINPLUSI:
3676: case OP_NOTQUERY:
3677: case OP_NOTQUERYI:
3678: case OP_NOTMINQUERY:
3679: case OP_NOTMINQUERYI:
3680: c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3681: minimize = (c & 1) != 0;
3682: min = rep_min[c]; /* Pick up values from tables; */
3683: max = rep_max[c]; /* zero for max => infinity */
3684: if (max == 0) max = INT_MAX;
3685:
3686: /* Common code for all repeated single-byte matches. */
3687:
3688: REPEATNOTCHAR:
1.1.1.3 ! misho 3689: GETCHARINCTEST(fc, ecode);
1.1 misho 3690:
3691: /* The code is duplicated for the caseless and caseful cases, for speed,
3692: since matching characters is likely to be quite common. First, ensure the
3693: minimum number of matches are present. If min = max, continue at the same
3694: level without recursing. Otherwise, if minimizing, keep trying the rest of
3695: the expression and advancing one matching character if failing, up to the
3696: maximum. Alternatively, if maximizing, find the maximum number of
3697: characters and work backwards. */
3698:
3699: DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
1.1.1.3 ! misho 3700: max, (char *)eptr));
1.1 misho 3701:
3702: if (op >= OP_NOTSTARI) /* Caseless */
3703: {
1.1.1.2 misho 3704: #ifdef SUPPORT_UTF
3705: #ifdef SUPPORT_UCP
3706: if (utf && fc > 127)
3707: foc = UCD_OTHERCASE(fc);
3708: #else
3709: if (utf && fc > 127)
3710: foc = fc;
3711: #endif /* SUPPORT_UCP */
3712: else
3713: #endif /* SUPPORT_UTF */
3714: foc = TABLE_GET(fc, md->fcc, fc);
1.1 misho 3715:
1.1.1.2 misho 3716: #ifdef SUPPORT_UTF
3717: if (utf)
1.1 misho 3718: {
3719: register unsigned int d;
3720: for (i = 1; i <= min; i++)
3721: {
3722: if (eptr >= md->end_subject)
3723: {
3724: SCHECK_PARTIAL();
3725: RRETURN(MATCH_NOMATCH);
3726: }
3727: GETCHARINC(d, eptr);
1.1.1.3 ! misho 3728: if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
1.1 misho 3729: }
3730: }
3731: else
3732: #endif
1.1.1.2 misho 3733: /* Not UTF mode */
1.1 misho 3734: {
3735: for (i = 1; i <= min; i++)
3736: {
3737: if (eptr >= md->end_subject)
3738: {
3739: SCHECK_PARTIAL();
3740: RRETURN(MATCH_NOMATCH);
3741: }
1.1.1.2 misho 3742: if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3743: eptr++;
1.1 misho 3744: }
3745: }
3746:
3747: if (min == max) continue;
3748:
3749: if (minimize)
3750: {
1.1.1.2 misho 3751: #ifdef SUPPORT_UTF
3752: if (utf)
1.1 misho 3753: {
3754: register unsigned int d;
3755: for (fi = min;; fi++)
3756: {
3757: RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3758: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3759: if (fi >= max) RRETURN(MATCH_NOMATCH);
3760: if (eptr >= md->end_subject)
3761: {
3762: SCHECK_PARTIAL();
3763: RRETURN(MATCH_NOMATCH);
3764: }
3765: GETCHARINC(d, eptr);
1.1.1.2 misho 3766: if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
1.1 misho 3767: }
3768: }
3769: else
3770: #endif
1.1.1.2 misho 3771: /* Not UTF mode */
1.1 misho 3772: {
3773: for (fi = min;; fi++)
3774: {
3775: RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3776: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3777: if (fi >= max) RRETURN(MATCH_NOMATCH);
3778: if (eptr >= md->end_subject)
3779: {
3780: SCHECK_PARTIAL();
3781: RRETURN(MATCH_NOMATCH);
3782: }
1.1.1.2 misho 3783: if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3784: eptr++;
1.1 misho 3785: }
3786: }
3787: /* Control never gets here */
3788: }
3789:
3790: /* Maximize case */
3791:
3792: else
3793: {
3794: pp = eptr;
3795:
1.1.1.2 misho 3796: #ifdef SUPPORT_UTF
3797: if (utf)
1.1 misho 3798: {
3799: register unsigned int d;
3800: for (i = min; i < max; i++)
3801: {
3802: int len = 1;
3803: if (eptr >= md->end_subject)
3804: {
3805: SCHECK_PARTIAL();
3806: break;
3807: }
3808: GETCHARLEN(d, eptr, len);
1.1.1.2 misho 3809: if (fc == d || (unsigned int)foc == d) break;
1.1 misho 3810: eptr += len;
3811: }
1.1.1.2 misho 3812: if (possessive) continue;
3813: for(;;)
1.1 misho 3814: {
3815: RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3816: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3817: if (eptr-- == pp) break; /* Stop if tried at original pos */
3818: BACKCHAR(eptr);
3819: }
3820: }
3821: else
3822: #endif
1.1.1.2 misho 3823: /* Not UTF mode */
1.1 misho 3824: {
3825: for (i = min; i < max; i++)
3826: {
3827: if (eptr >= md->end_subject)
3828: {
3829: SCHECK_PARTIAL();
3830: break;
3831: }
1.1.1.2 misho 3832: if (fc == *eptr || foc == *eptr) break;
1.1 misho 3833: eptr++;
3834: }
3835: if (possessive) continue;
3836: while (eptr >= pp)
3837: {
3838: RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3839: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3840: eptr--;
3841: }
3842: }
3843:
3844: RRETURN(MATCH_NOMATCH);
3845: }
3846: /* Control never gets here */
3847: }
3848:
3849: /* Caseful comparisons */
3850:
3851: else
3852: {
1.1.1.2 misho 3853: #ifdef SUPPORT_UTF
3854: if (utf)
1.1 misho 3855: {
3856: register unsigned int d;
3857: for (i = 1; i <= min; i++)
3858: {
3859: if (eptr >= md->end_subject)
3860: {
3861: SCHECK_PARTIAL();
3862: RRETURN(MATCH_NOMATCH);
3863: }
3864: GETCHARINC(d, eptr);
3865: if (fc == d) RRETURN(MATCH_NOMATCH);
3866: }
3867: }
3868: else
3869: #endif
1.1.1.2 misho 3870: /* Not UTF mode */
1.1 misho 3871: {
3872: for (i = 1; i <= min; i++)
3873: {
3874: if (eptr >= md->end_subject)
3875: {
3876: SCHECK_PARTIAL();
3877: RRETURN(MATCH_NOMATCH);
3878: }
3879: if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3880: }
3881: }
3882:
3883: if (min == max) continue;
3884:
3885: if (minimize)
3886: {
1.1.1.2 misho 3887: #ifdef SUPPORT_UTF
3888: if (utf)
1.1 misho 3889: {
3890: register unsigned int d;
3891: for (fi = min;; fi++)
3892: {
3893: RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3894: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3895: if (fi >= max) RRETURN(MATCH_NOMATCH);
3896: if (eptr >= md->end_subject)
3897: {
3898: SCHECK_PARTIAL();
3899: RRETURN(MATCH_NOMATCH);
3900: }
3901: GETCHARINC(d, eptr);
3902: if (fc == d) RRETURN(MATCH_NOMATCH);
3903: }
3904: }
3905: else
3906: #endif
1.1.1.2 misho 3907: /* Not UTF mode */
1.1 misho 3908: {
3909: for (fi = min;; fi++)
3910: {
3911: RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3912: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3913: if (fi >= max) RRETURN(MATCH_NOMATCH);
3914: if (eptr >= md->end_subject)
3915: {
3916: SCHECK_PARTIAL();
3917: RRETURN(MATCH_NOMATCH);
3918: }
3919: if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3920: }
3921: }
3922: /* Control never gets here */
3923: }
3924:
3925: /* Maximize case */
3926:
3927: else
3928: {
3929: pp = eptr;
3930:
1.1.1.2 misho 3931: #ifdef SUPPORT_UTF
3932: if (utf)
1.1 misho 3933: {
3934: register unsigned int d;
3935: for (i = min; i < max; i++)
3936: {
3937: int len = 1;
3938: if (eptr >= md->end_subject)
3939: {
3940: SCHECK_PARTIAL();
3941: break;
3942: }
3943: GETCHARLEN(d, eptr, len);
3944: if (fc == d) break;
3945: eptr += len;
3946: }
3947: if (possessive) continue;
3948: for(;;)
3949: {
3950: RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3951: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3952: if (eptr-- == pp) break; /* Stop if tried at original pos */
3953: BACKCHAR(eptr);
3954: }
3955: }
3956: else
3957: #endif
1.1.1.2 misho 3958: /* Not UTF mode */
1.1 misho 3959: {
3960: for (i = min; i < max; i++)
3961: {
3962: if (eptr >= md->end_subject)
3963: {
3964: SCHECK_PARTIAL();
3965: break;
3966: }
3967: if (fc == *eptr) break;
3968: eptr++;
3969: }
3970: if (possessive) continue;
3971: while (eptr >= pp)
3972: {
3973: RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3974: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3975: eptr--;
3976: }
3977: }
3978:
3979: RRETURN(MATCH_NOMATCH);
3980: }
3981: }
3982: /* Control never gets here */
3983:
3984: /* Match a single character type repeatedly; several different opcodes
3985: share code. This is very similar to the code for single characters, but we
3986: repeat it in the interests of efficiency. */
3987:
3988: case OP_TYPEEXACT:
3989: min = max = GET2(ecode, 1);
3990: minimize = TRUE;
1.1.1.2 misho 3991: ecode += 1 + IMM2_SIZE;
1.1 misho 3992: goto REPEATTYPE;
3993:
3994: case OP_TYPEUPTO:
3995: case OP_TYPEMINUPTO:
3996: min = 0;
3997: max = GET2(ecode, 1);
3998: minimize = *ecode == OP_TYPEMINUPTO;
1.1.1.2 misho 3999: ecode += 1 + IMM2_SIZE;
1.1 misho 4000: goto REPEATTYPE;
4001:
4002: case OP_TYPEPOSSTAR:
4003: possessive = TRUE;
4004: min = 0;
4005: max = INT_MAX;
4006: ecode++;
4007: goto REPEATTYPE;
4008:
4009: case OP_TYPEPOSPLUS:
4010: possessive = TRUE;
4011: min = 1;
4012: max = INT_MAX;
4013: ecode++;
4014: goto REPEATTYPE;
4015:
4016: case OP_TYPEPOSQUERY:
4017: possessive = TRUE;
4018: min = 0;
4019: max = 1;
4020: ecode++;
4021: goto REPEATTYPE;
4022:
4023: case OP_TYPEPOSUPTO:
4024: possessive = TRUE;
4025: min = 0;
4026: max = GET2(ecode, 1);
1.1.1.2 misho 4027: ecode += 1 + IMM2_SIZE;
1.1 misho 4028: goto REPEATTYPE;
4029:
4030: case OP_TYPESTAR:
4031: case OP_TYPEMINSTAR:
4032: case OP_TYPEPLUS:
4033: case OP_TYPEMINPLUS:
4034: case OP_TYPEQUERY:
4035: case OP_TYPEMINQUERY:
4036: c = *ecode++ - OP_TYPESTAR;
4037: minimize = (c & 1) != 0;
4038: min = rep_min[c]; /* Pick up values from tables; */
4039: max = rep_max[c]; /* zero for max => infinity */
4040: if (max == 0) max = INT_MAX;
4041:
4042: /* Common code for all repeated single character type matches. Note that
4043: in UTF mode, '.' matches a character of any length, and so do several of
4044: the other types (e.g. \D, \S, \W, \h, \v), so a match may span several code units. */
4045:
4046: REPEATTYPE:
4047: ctype = *ecode++; /* Code for the character type */
4048:
4049: #ifdef SUPPORT_UCP
4050: if (ctype == OP_PROP || ctype == OP_NOTPROP)
4051: {
4052: prop_fail_result = ctype == OP_NOTPROP;
4053: prop_type = *ecode++;
4054: prop_value = *ecode++;
4055: }
4056: else prop_type = -1;
4057: #endif
4058:
4059: /* First, ensure the minimum number of matches are present. Use inline
4060: code for maximizing the speed, and do the type test once at the start
4061: (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4062: is tidier. Also separate the UCP code, which can be the same for both UTF-8
4063: and single-bytes. */
4064:
4065: if (min > 0)
4066: {
4067: #ifdef SUPPORT_UCP
4068: if (prop_type >= 0)
4069: {
4070: switch(prop_type)
4071: {
4072: case PT_ANY:
4073: if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4074: for (i = 1; i <= min; i++)
4075: {
4076: if (eptr >= md->end_subject)
4077: {
4078: SCHECK_PARTIAL();
4079: RRETURN(MATCH_NOMATCH);
4080: }
4081: GETCHARINCTEST(c, eptr);
4082: }
4083: break;
4084:
4085: case PT_LAMP:
4086: for (i = 1; i <= min; i++)
4087: {
4088: int chartype;
4089: if (eptr >= md->end_subject)
4090: {
4091: SCHECK_PARTIAL();
4092: RRETURN(MATCH_NOMATCH);
4093: }
4094: GETCHARINCTEST(c, eptr);
4095: chartype = UCD_CHARTYPE(c);
4096: if ((chartype == ucp_Lu ||
4097: chartype == ucp_Ll ||
4098: chartype == ucp_Lt) == prop_fail_result)
4099: RRETURN(MATCH_NOMATCH);
4100: }
4101: break;
4102:
4103: case PT_GC:
4104: for (i = 1; i <= min; i++)
4105: {
4106: if (eptr >= md->end_subject)
4107: {
4108: SCHECK_PARTIAL();
4109: RRETURN(MATCH_NOMATCH);
4110: }
4111: GETCHARINCTEST(c, eptr);
4112: if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4113: RRETURN(MATCH_NOMATCH);
4114: }
4115: break;
4116:
4117: case PT_PC:
4118: for (i = 1; i <= min; i++)
4119: {
4120: if (eptr >= md->end_subject)
4121: {
4122: SCHECK_PARTIAL();
4123: RRETURN(MATCH_NOMATCH);
4124: }
4125: GETCHARINCTEST(c, eptr);
4126: if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4127: RRETURN(MATCH_NOMATCH);
4128: }
4129: break;
4130:
4131: case PT_SC:
4132: for (i = 1; i <= min; i++)
4133: {
4134: if (eptr >= md->end_subject)
4135: {
4136: SCHECK_PARTIAL();
4137: RRETURN(MATCH_NOMATCH);
4138: }
4139: GETCHARINCTEST(c, eptr);
4140: if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4141: RRETURN(MATCH_NOMATCH);
4142: }
4143: break;
4144:
4145: case PT_ALNUM:
4146: for (i = 1; i <= min; i++)
4147: {
4148: int category;
4149: if (eptr >= md->end_subject)
4150: {
4151: SCHECK_PARTIAL();
4152: RRETURN(MATCH_NOMATCH);
4153: }
4154: GETCHARINCTEST(c, eptr);
4155: category = UCD_CATEGORY(c);
4156: if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4157: RRETURN(MATCH_NOMATCH);
4158: }
4159: break;
4160:
4161: case PT_SPACE: /* Perl space */
4162: for (i = 1; i <= min; i++)
4163: {
4164: if (eptr >= md->end_subject)
4165: {
4166: SCHECK_PARTIAL();
4167: RRETURN(MATCH_NOMATCH);
4168: }
4169: GETCHARINCTEST(c, eptr);
4170: if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4171: c == CHAR_FF || c == CHAR_CR)
4172: == prop_fail_result)
4173: RRETURN(MATCH_NOMATCH);
4174: }
4175: break;
4176:
4177: case PT_PXSPACE: /* POSIX space */
4178: for (i = 1; i <= min; i++)
4179: {
4180: if (eptr >= md->end_subject)
4181: {
4182: SCHECK_PARTIAL();
4183: RRETURN(MATCH_NOMATCH);
4184: }
4185: GETCHARINCTEST(c, eptr);
4186: if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4187: c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4188: == prop_fail_result)
4189: RRETURN(MATCH_NOMATCH);
4190: }
4191: break;
4192:
4193: case PT_WORD:
4194: for (i = 1; i <= min; i++)
4195: {
4196: int category;
4197: if (eptr >= md->end_subject)
4198: {
4199: SCHECK_PARTIAL();
4200: RRETURN(MATCH_NOMATCH);
4201: }
4202: GETCHARINCTEST(c, eptr);
4203: category = UCD_CATEGORY(c);
4204: if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4205: == prop_fail_result)
4206: RRETURN(MATCH_NOMATCH);
4207: }
4208: break;
4209:
4210: /* This should not occur */
4211:
4212: default:
4213: RRETURN(PCRE_ERROR_INTERNAL);
4214: }
4215: }
4216:
4217: /* Match extended Unicode sequences. We will get here only if the
4218: support is in the binary; otherwise a compile-time error occurs. */
4219:
4220: else if (ctype == OP_EXTUNI)
4221: {
4222: for (i = 1; i <= min; i++)
4223: {
4224: if (eptr >= md->end_subject)
4225: {
4226: SCHECK_PARTIAL();
4227: RRETURN(MATCH_NOMATCH);
4228: }
4229: GETCHARINCTEST(c, eptr);
4230: if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4231: while (eptr < md->end_subject)
4232: {
4233: int len = 1;
1.1.1.2 misho 4234: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
1.1 misho 4235: if (UCD_CATEGORY(c) != ucp_M) break;
4236: eptr += len;
4237: }
1.1.1.3 ! misho 4238: CHECK_PARTIAL();
1.1 misho 4239: }
4240: }
4241:
4242: else
4243: #endif /* SUPPORT_UCP */
4244:
4245: /* Handle all other cases when the coding is UTF-8 */
4246:
1.1.1.2 misho 4247: #ifdef SUPPORT_UTF
4248: if (utf) switch(ctype)
1.1 misho 4249: {
4250: case OP_ANY:
4251: for (i = 1; i <= min; i++)
4252: {
4253: if (eptr >= md->end_subject)
4254: {
4255: SCHECK_PARTIAL();
4256: RRETURN(MATCH_NOMATCH);
4257: }
4258: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1.1.1.3 ! misho 4259: if (md->partial != 0 &&
! 4260: eptr + 1 >= md->end_subject &&
! 4261: NLBLOCK->nltype == NLTYPE_FIXED &&
! 4262: NLBLOCK->nllen == 2 &&
! 4263: *eptr == NLBLOCK->nl[0])
! 4264: {
! 4265: md->hitend = TRUE;
! 4266: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
! 4267: }
1.1 misho 4268: eptr++;
1.1.1.2 misho 4269: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misho 4270: }
4271: break;
4272:
4273: case OP_ALLANY:
4274: for (i = 1; i <= min; i++)
4275: {
4276: if (eptr >= md->end_subject)
4277: {
4278: SCHECK_PARTIAL();
4279: RRETURN(MATCH_NOMATCH);
4280: }
4281: eptr++;
1.1.1.2 misho 4282: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misho 4283: }
4284: break;
4285:
4286: case OP_ANYBYTE:
4287: if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4288: eptr += min;
4289: break;
4290:
4291: case OP_ANYNL:
4292: for (i = 1; i <= min; i++)
4293: {
4294: if (eptr >= md->end_subject)
4295: {
4296: SCHECK_PARTIAL();
4297: RRETURN(MATCH_NOMATCH);
4298: }
4299: GETCHARINC(c, eptr);
4300: switch(c)
4301: {
4302: default: RRETURN(MATCH_NOMATCH);
4303:
4304: case 0x000d:
4305: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4306: break;
4307:
4308: case 0x000a:
4309: break;
4310:
4311: case 0x000b:
4312: case 0x000c:
4313: case 0x0085:
4314: case 0x2028:
4315: case 0x2029:
4316: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4317: break;
4318: }
4319: }
4320: break;
4321:
4322: case OP_NOT_HSPACE:
4323: for (i = 1; i <= min; i++)
4324: {
4325: if (eptr >= md->end_subject)
4326: {
4327: SCHECK_PARTIAL();
4328: RRETURN(MATCH_NOMATCH);
4329: }
4330: GETCHARINC(c, eptr);
4331: switch(c)
4332: {
4333: default: break;
4334: case 0x09: /* HT */
4335: case 0x20: /* SPACE */
4336: case 0xa0: /* NBSP */
4337: case 0x1680: /* OGHAM SPACE MARK */
4338: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4339: case 0x2000: /* EN QUAD */
4340: case 0x2001: /* EM QUAD */
4341: case 0x2002: /* EN SPACE */
4342: case 0x2003: /* EM SPACE */
4343: case 0x2004: /* THREE-PER-EM SPACE */
4344: case 0x2005: /* FOUR-PER-EM SPACE */
4345: case 0x2006: /* SIX-PER-EM SPACE */
4346: case 0x2007: /* FIGURE SPACE */
4347: case 0x2008: /* PUNCTUATION SPACE */
4348: case 0x2009: /* THIN SPACE */
4349: case 0x200A: /* HAIR SPACE */
4350: case 0x202f: /* NARROW NO-BREAK SPACE */
4351: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4352: case 0x3000: /* IDEOGRAPHIC SPACE */
4353: RRETURN(MATCH_NOMATCH);
4354: }
4355: }
4356: break;
4357:
4358: case OP_HSPACE:
4359: for (i = 1; i <= min; i++)
4360: {
4361: if (eptr >= md->end_subject)
4362: {
4363: SCHECK_PARTIAL();
4364: RRETURN(MATCH_NOMATCH);
4365: }
4366: GETCHARINC(c, eptr);
4367: switch(c)
4368: {
4369: default: RRETURN(MATCH_NOMATCH);
4370: case 0x09: /* HT */
4371: case 0x20: /* SPACE */
4372: case 0xa0: /* NBSP */
4373: case 0x1680: /* OGHAM SPACE MARK */
4374: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4375: case 0x2000: /* EN QUAD */
4376: case 0x2001: /* EM QUAD */
4377: case 0x2002: /* EN SPACE */
4378: case 0x2003: /* EM SPACE */
4379: case 0x2004: /* THREE-PER-EM SPACE */
4380: case 0x2005: /* FOUR-PER-EM SPACE */
4381: case 0x2006: /* SIX-PER-EM SPACE */
4382: case 0x2007: /* FIGURE SPACE */
4383: case 0x2008: /* PUNCTUATION SPACE */
4384: case 0x2009: /* THIN SPACE */
4385: case 0x200A: /* HAIR SPACE */
4386: case 0x202f: /* NARROW NO-BREAK SPACE */
4387: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4388: case 0x3000: /* IDEOGRAPHIC SPACE */
4389: break;
4390: }
4391: }
4392: break;
4393:
4394: case OP_NOT_VSPACE:
4395: for (i = 1; i <= min; i++)
4396: {
4397: if (eptr >= md->end_subject)
4398: {
4399: SCHECK_PARTIAL();
4400: RRETURN(MATCH_NOMATCH);
4401: }
4402: GETCHARINC(c, eptr);
4403: switch(c)
4404: {
4405: default: break;
4406: case 0x0a: /* LF */
4407: case 0x0b: /* VT */
4408: case 0x0c: /* FF */
4409: case 0x0d: /* CR */
4410: case 0x85: /* NEL */
4411: case 0x2028: /* LINE SEPARATOR */
4412: case 0x2029: /* PARAGRAPH SEPARATOR */
4413: RRETURN(MATCH_NOMATCH);
4414: }
4415: }
4416: break;
4417:
4418: case OP_VSPACE:
4419: for (i = 1; i <= min; i++)
4420: {
4421: if (eptr >= md->end_subject)
4422: {
4423: SCHECK_PARTIAL();
4424: RRETURN(MATCH_NOMATCH);
4425: }
4426: GETCHARINC(c, eptr);
4427: switch(c)
4428: {
4429: default: RRETURN(MATCH_NOMATCH);
4430: case 0x0a: /* LF */
4431: case 0x0b: /* VT */
4432: case 0x0c: /* FF */
4433: case 0x0d: /* CR */
4434: case 0x85: /* NEL */
4435: case 0x2028: /* LINE SEPARATOR */
4436: case 0x2029: /* PARAGRAPH SEPARATOR */
4437: break;
4438: }
4439: }
4440: break;
4441:
4442: case OP_NOT_DIGIT:
4443: for (i = 1; i <= min; i++)
4444: {
4445: if (eptr >= md->end_subject)
4446: {
4447: SCHECK_PARTIAL();
4448: RRETURN(MATCH_NOMATCH);
4449: }
4450: GETCHARINC(c, eptr);
4451: if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4452: RRETURN(MATCH_NOMATCH);
4453: }
4454: break;
4455:
4456: case OP_DIGIT:
4457: for (i = 1; i <= min; i++)
4458: {
4459: if (eptr >= md->end_subject)
4460: {
4461: SCHECK_PARTIAL();
4462: RRETURN(MATCH_NOMATCH);
4463: }
1.1.1.2 misho 4464: if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0)
1.1 misho 4465: RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 4466: eptr++;
1.1 misho 4467: /* No need to skip more bytes - we know it's a 1-byte character */
4468: }
4469: break;
4470:
4471: case OP_NOT_WHITESPACE:
4472: for (i = 1; i <= min; i++)
4473: {
4474: if (eptr >= md->end_subject)
4475: {
4476: SCHECK_PARTIAL();
4477: RRETURN(MATCH_NOMATCH);
4478: }
4479: if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4480: RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 4481: eptr++;
4482: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misho 4483: }
4484: break;
4485:
4486: case OP_WHITESPACE:
4487: for (i = 1; i <= min; i++)
4488: {
4489: if (eptr >= md->end_subject)
4490: {
4491: SCHECK_PARTIAL();
4492: RRETURN(MATCH_NOMATCH);
4493: }
1.1.1.2 misho 4494: if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0)
1.1 misho 4495: RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 4496: eptr++;
1.1 misho 4497: /* No need to skip more bytes - we know it's a 1-byte character */
4498: }
4499: break;
4500:
4501: case OP_NOT_WORDCHAR:
4502: for (i = 1; i <= min; i++)
4503: {
4504: if (eptr >= md->end_subject)
4505: {
4506: SCHECK_PARTIAL();
4507: RRETURN(MATCH_NOMATCH);
4508: }
4509: if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4510: RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 4511: eptr++;
4512: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misho 4513: }
4514: break;
4515:
4516: case OP_WORDCHAR:
4517: for (i = 1; i <= min; i++)
4518: {
4519: if (eptr >= md->end_subject)
4520: {
4521: SCHECK_PARTIAL();
4522: RRETURN(MATCH_NOMATCH);
4523: }
1.1.1.2 misho 4524: if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0)
1.1 misho 4525: RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 4526: eptr++;
1.1 misho 4527: /* No need to skip more bytes - we know it's a 1-byte character */
4528: }
4529: break;
4530:
4531: default:
4532: RRETURN(PCRE_ERROR_INTERNAL);
4533: } /* End switch(ctype) */
4534:
4535: else
1.1.1.2 misho 4536: #endif /* SUPPORT_UTF */
1.1 misho 4537:
4538: /* Code for the non-UTF-8 case for minimum matching of operators other
4539: than OP_PROP and OP_NOTPROP. */
4540:
4541: switch(ctype)
4542: {
4543: case OP_ANY:
4544: for (i = 1; i <= min; i++)
4545: {
4546: if (eptr >= md->end_subject)
4547: {
4548: SCHECK_PARTIAL();
4549: RRETURN(MATCH_NOMATCH);
4550: }
4551: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1.1.1.3 ! misho 4552: if (md->partial != 0 &&
! 4553: eptr + 1 >= md->end_subject &&
! 4554: NLBLOCK->nltype == NLTYPE_FIXED &&
! 4555: NLBLOCK->nllen == 2 &&
! 4556: *eptr == NLBLOCK->nl[0])
! 4557: {
! 4558: md->hitend = TRUE;
! 4559: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
! 4560: }
1.1 misho 4561: eptr++;
4562: }
4563: break;
4564:
4565: case OP_ALLANY:
4566: if (eptr > md->end_subject - min)
4567: {
4568: SCHECK_PARTIAL();
4569: RRETURN(MATCH_NOMATCH);
4570: }
4571: eptr += min;
4572: break;
4573:
4574: case OP_ANYBYTE:
4575: if (eptr > md->end_subject - min)
4576: {
4577: SCHECK_PARTIAL();
4578: RRETURN(MATCH_NOMATCH);
4579: }
4580: eptr += min;
4581: break;
4582:
4583: case OP_ANYNL:
4584: for (i = 1; i <= min; i++)
4585: {
4586: if (eptr >= md->end_subject)
4587: {
4588: SCHECK_PARTIAL();
4589: RRETURN(MATCH_NOMATCH);
4590: }
4591: switch(*eptr++)
4592: {
4593: default: RRETURN(MATCH_NOMATCH);
4594:
4595: case 0x000d:
4596: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4597: break;
4598:
4599: case 0x000a:
4600: break;
4601:
4602: case 0x000b:
4603: case 0x000c:
4604: case 0x0085:
1.1.1.2 misho 4605: #ifdef COMPILE_PCRE16
4606: case 0x2028:
4607: case 0x2029:
4608: #endif
1.1 misho 4609: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4610: break;
4611: }
4612: }
4613: break;
4614:
4615: case OP_NOT_HSPACE:
4616: for (i = 1; i <= min; i++)
4617: {
4618: if (eptr >= md->end_subject)
4619: {
4620: SCHECK_PARTIAL();
4621: RRETURN(MATCH_NOMATCH);
4622: }
4623: switch(*eptr++)
4624: {
4625: default: break;
4626: case 0x09: /* HT */
4627: case 0x20: /* SPACE */
4628: case 0xa0: /* NBSP */
1.1.1.2 misho 4629: #ifdef COMPILE_PCRE16
4630: case 0x1680: /* OGHAM SPACE MARK */
4631: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4632: case 0x2000: /* EN QUAD */
4633: case 0x2001: /* EM QUAD */
4634: case 0x2002: /* EN SPACE */
4635: case 0x2003: /* EM SPACE */
4636: case 0x2004: /* THREE-PER-EM SPACE */
4637: case 0x2005: /* FOUR-PER-EM SPACE */
4638: case 0x2006: /* SIX-PER-EM SPACE */
4639: case 0x2007: /* FIGURE SPACE */
4640: case 0x2008: /* PUNCTUATION SPACE */
4641: case 0x2009: /* THIN SPACE */
4642: case 0x200A: /* HAIR SPACE */
4643: case 0x202f: /* NARROW NO-BREAK SPACE */
4644: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4645: case 0x3000: /* IDEOGRAPHIC SPACE */
4646: #endif
1.1 misho 4647: RRETURN(MATCH_NOMATCH);
4648: }
4649: }
4650: break;
4651:
4652: case OP_HSPACE:
4653: for (i = 1; i <= min; i++)
4654: {
4655: if (eptr >= md->end_subject)
4656: {
4657: SCHECK_PARTIAL();
4658: RRETURN(MATCH_NOMATCH);
4659: }
4660: switch(*eptr++)
4661: {
4662: default: RRETURN(MATCH_NOMATCH);
4663: case 0x09: /* HT */
4664: case 0x20: /* SPACE */
4665: case 0xa0: /* NBSP */
1.1.1.2 misho 4666: #ifdef COMPILE_PCRE16
4667: case 0x1680: /* OGHAM SPACE MARK */
4668: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4669: case 0x2000: /* EN QUAD */
4670: case 0x2001: /* EM QUAD */
4671: case 0x2002: /* EN SPACE */
4672: case 0x2003: /* EM SPACE */
4673: case 0x2004: /* THREE-PER-EM SPACE */
4674: case 0x2005: /* FOUR-PER-EM SPACE */
4675: case 0x2006: /* SIX-PER-EM SPACE */
4676: case 0x2007: /* FIGURE SPACE */
4677: case 0x2008: /* PUNCTUATION SPACE */
4678: case 0x2009: /* THIN SPACE */
4679: case 0x200A: /* HAIR SPACE */
4680: case 0x202f: /* NARROW NO-BREAK SPACE */
4681: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4682: case 0x3000: /* IDEOGRAPHIC SPACE */
4683: #endif
1.1 misho 4684: break;
4685: }
4686: }
4687: break;
4688:
4689: case OP_NOT_VSPACE:
4690: for (i = 1; i <= min; i++)
4691: {
4692: if (eptr >= md->end_subject)
4693: {
4694: SCHECK_PARTIAL();
4695: RRETURN(MATCH_NOMATCH);
4696: }
4697: switch(*eptr++)
4698: {
4699: default: break;
4700: case 0x0a: /* LF */
4701: case 0x0b: /* VT */
4702: case 0x0c: /* FF */
4703: case 0x0d: /* CR */
4704: case 0x85: /* NEL */
1.1.1.2 misho 4705: #ifdef COMPILE_PCRE16
4706: case 0x2028: /* LINE SEPARATOR */
4707: case 0x2029: /* PARAGRAPH SEPARATOR */
4708: #endif
1.1 misho 4709: RRETURN(MATCH_NOMATCH);
4710: }
4711: }
4712: break;
4713:
4714: case OP_VSPACE:
4715: for (i = 1; i <= min; i++)
4716: {
4717: if (eptr >= md->end_subject)
4718: {
4719: SCHECK_PARTIAL();
4720: RRETURN(MATCH_NOMATCH);
4721: }
4722: switch(*eptr++)
4723: {
4724: default: RRETURN(MATCH_NOMATCH);
4725: case 0x0a: /* LF */
4726: case 0x0b: /* VT */
4727: case 0x0c: /* FF */
4728: case 0x0d: /* CR */
4729: case 0x85: /* NEL */
1.1.1.2 misho 4730: #ifdef COMPILE_PCRE16
4731: case 0x2028: /* LINE SEPARATOR */
4732: case 0x2029: /* PARAGRAPH SEPARATOR */
4733: #endif
1.1 misho 4734: break;
4735: }
4736: }
4737: break;
4738:
4739: case OP_NOT_DIGIT:
4740: for (i = 1; i <= min; i++)
4741: {
4742: if (eptr >= md->end_subject)
4743: {
4744: SCHECK_PARTIAL();
4745: RRETURN(MATCH_NOMATCH);
4746: }
1.1.1.2 misho 4747: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4748: RRETURN(MATCH_NOMATCH);
4749: eptr++;
1.1 misho 4750: }
4751: break;
4752:
4753: case OP_DIGIT:
4754: for (i = 1; i <= min; i++)
4755: {
4756: if (eptr >= md->end_subject)
4757: {
4758: SCHECK_PARTIAL();
4759: RRETURN(MATCH_NOMATCH);
4760: }
1.1.1.2 misho 4761: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4762: RRETURN(MATCH_NOMATCH);
4763: eptr++;
1.1 misho 4764: }
4765: break;
4766:
4767: case OP_NOT_WHITESPACE:
4768: for (i = 1; i <= min; i++)
4769: {
4770: if (eptr >= md->end_subject)
4771: {
4772: SCHECK_PARTIAL();
4773: RRETURN(MATCH_NOMATCH);
4774: }
1.1.1.2 misho 4775: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4776: RRETURN(MATCH_NOMATCH);
4777: eptr++;
1.1 misho 4778: }
4779: break;
4780:
4781: case OP_WHITESPACE:
4782: for (i = 1; i <= min; i++)
4783: {
4784: if (eptr >= md->end_subject)
4785: {
4786: SCHECK_PARTIAL();
4787: RRETURN(MATCH_NOMATCH);
4788: }
1.1.1.2 misho 4789: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4790: RRETURN(MATCH_NOMATCH);
4791: eptr++;
1.1 misho 4792: }
4793: break;
4794:
4795: case OP_NOT_WORDCHAR:
4796: for (i = 1; i <= min; i++)
4797: {
4798: if (eptr >= md->end_subject)
4799: {
4800: SCHECK_PARTIAL();
4801: RRETURN(MATCH_NOMATCH);
4802: }
1.1.1.2 misho 4803: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
1.1 misho 4804: RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 4805: eptr++;
1.1 misho 4806: }
4807: break;
4808:
4809: case OP_WORDCHAR:
4810: for (i = 1; i <= min; i++)
4811: {
4812: if (eptr >= md->end_subject)
4813: {
4814: SCHECK_PARTIAL();
4815: RRETURN(MATCH_NOMATCH);
4816: }
1.1.1.2 misho 4817: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
1.1 misho 4818: RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 4819: eptr++;
1.1 misho 4820: }
4821: break;
4822:
4823: default:
4824: RRETURN(PCRE_ERROR_INTERNAL);
4825: }
4826: }
4827:
4828: /* If min = max, continue at the same level without recursing */
4829:
4830: if (min == max) continue;
4831:
4832: /* If minimizing, we have to test the rest of the pattern before each
4833: subsequent match. Again, separate the UTF-8 case for speed, and also
4834: separate the UCP cases. */
4835:
4836: if (minimize)
4837: {
4838: #ifdef SUPPORT_UCP
4839: if (prop_type >= 0)
4840: {
4841: switch(prop_type)
4842: {
4843: case PT_ANY:
4844: for (fi = min;; fi++)
4845: {
4846: RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4847: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4848: if (fi >= max) RRETURN(MATCH_NOMATCH);
4849: if (eptr >= md->end_subject)
4850: {
4851: SCHECK_PARTIAL();
4852: RRETURN(MATCH_NOMATCH);
4853: }
4854: GETCHARINCTEST(c, eptr);
4855: if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4856: }
4857: /* Control never gets here */
4858:
4859: case PT_LAMP:
4860: for (fi = min;; fi++)
4861: {
4862: int chartype;
4863: RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4864: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4865: if (fi >= max) RRETURN(MATCH_NOMATCH);
4866: if (eptr >= md->end_subject)
4867: {
4868: SCHECK_PARTIAL();
4869: RRETURN(MATCH_NOMATCH);
4870: }
4871: GETCHARINCTEST(c, eptr);
4872: chartype = UCD_CHARTYPE(c);
4873: if ((chartype == ucp_Lu ||
4874: chartype == ucp_Ll ||
4875: chartype == ucp_Lt) == prop_fail_result)
4876: RRETURN(MATCH_NOMATCH);
4877: }
4878: /* Control never gets here */
4879:
4880: case PT_GC:
4881: for (fi = min;; fi++)
4882: {
4883: RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4884: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4885: if (fi >= max) RRETURN(MATCH_NOMATCH);
4886: if (eptr >= md->end_subject)
4887: {
4888: SCHECK_PARTIAL();
4889: RRETURN(MATCH_NOMATCH);
4890: }
4891: GETCHARINCTEST(c, eptr);
4892: if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4893: RRETURN(MATCH_NOMATCH);
4894: }
4895: /* Control never gets here */
4896:
4897: case PT_PC:
4898: for (fi = min;; fi++)
4899: {
4900: RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4901: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4902: if (fi >= max) RRETURN(MATCH_NOMATCH);
4903: if (eptr >= md->end_subject)
4904: {
4905: SCHECK_PARTIAL();
4906: RRETURN(MATCH_NOMATCH);
4907: }
4908: GETCHARINCTEST(c, eptr);
4909: if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4910: RRETURN(MATCH_NOMATCH);
4911: }
4912: /* Control never gets here */
4913:
4914: case PT_SC:
4915: for (fi = min;; fi++)
4916: {
4917: RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4918: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4919: if (fi >= max) RRETURN(MATCH_NOMATCH);
4920: if (eptr >= md->end_subject)
4921: {
4922: SCHECK_PARTIAL();
4923: RRETURN(MATCH_NOMATCH);
4924: }
4925: GETCHARINCTEST(c, eptr);
4926: if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4927: RRETURN(MATCH_NOMATCH);
4928: }
4929: /* Control never gets here */
4930:
4931: case PT_ALNUM:
4932: for (fi = min;; fi++)
4933: {
4934: int category;
4935: RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4936: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4937: if (fi >= max) RRETURN(MATCH_NOMATCH);
4938: if (eptr >= md->end_subject)
4939: {
4940: SCHECK_PARTIAL();
4941: RRETURN(MATCH_NOMATCH);
4942: }
4943: GETCHARINCTEST(c, eptr);
4944: category = UCD_CATEGORY(c);
4945: if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4946: RRETURN(MATCH_NOMATCH);
4947: }
4948: /* Control never gets here */
4949:
4950: case PT_SPACE: /* Perl space */
4951: for (fi = min;; fi++)
4952: {
4953: RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4954: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4955: if (fi >= max) RRETURN(MATCH_NOMATCH);
4956: if (eptr >= md->end_subject)
4957: {
4958: SCHECK_PARTIAL();
4959: RRETURN(MATCH_NOMATCH);
4960: }
4961: GETCHARINCTEST(c, eptr);
4962: if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4963: c == CHAR_FF || c == CHAR_CR)
4964: == prop_fail_result)
4965: RRETURN(MATCH_NOMATCH);
4966: }
4967: /* Control never gets here */
4968:
4969: case PT_PXSPACE: /* POSIX space */
4970: for (fi = min;; fi++)
4971: {
4972: RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4973: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4974: if (fi >= max) RRETURN(MATCH_NOMATCH);
4975: if (eptr >= md->end_subject)
4976: {
4977: SCHECK_PARTIAL();
4978: RRETURN(MATCH_NOMATCH);
4979: }
4980: GETCHARINCTEST(c, eptr);
4981: if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4982: c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4983: == prop_fail_result)
4984: RRETURN(MATCH_NOMATCH);
4985: }
4986: /* Control never gets here */
4987:
4988: case PT_WORD:
4989: for (fi = min;; fi++)
4990: {
4991: int category;
4992: RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4993: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4994: if (fi >= max) RRETURN(MATCH_NOMATCH);
4995: if (eptr >= md->end_subject)
4996: {
4997: SCHECK_PARTIAL();
4998: RRETURN(MATCH_NOMATCH);
4999: }
5000: GETCHARINCTEST(c, eptr);
5001: category = UCD_CATEGORY(c);
5002: if ((category == ucp_L ||
5003: category == ucp_N ||
5004: c == CHAR_UNDERSCORE)
5005: == prop_fail_result)
5006: RRETURN(MATCH_NOMATCH);
5007: }
5008: /* Control never gets here */
5009:
5010: /* This should never occur */
5011:
5012: default:
5013: RRETURN(PCRE_ERROR_INTERNAL);
5014: }
5015: }
5016:
5017: /* Match extended Unicode sequences. We will get here only if the
5018: support is in the binary; otherwise a compile-time error occurs. */
5019:
5020: else if (ctype == OP_EXTUNI)
5021: {
5022: for (fi = min;; fi++)
5023: {
5024: RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5025: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5026: if (fi >= max) RRETURN(MATCH_NOMATCH);
5027: if (eptr >= md->end_subject)
5028: {
5029: SCHECK_PARTIAL();
5030: RRETURN(MATCH_NOMATCH);
5031: }
5032: GETCHARINCTEST(c, eptr);
5033: if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
5034: while (eptr < md->end_subject)
5035: {
5036: int len = 1;
1.1.1.2 misho 5037: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
1.1 misho 5038: if (UCD_CATEGORY(c) != ucp_M) break;
5039: eptr += len;
5040: }
1.1.1.3 ! misho 5041: CHECK_PARTIAL();
1.1 misho 5042: }
5043: }
5044: else
5045: #endif /* SUPPORT_UCP */
5046:
1.1.1.2 misho 5047: #ifdef SUPPORT_UTF
5048: if (utf)
1.1 misho 5049: {
5050: for (fi = min;; fi++)
5051: {
5052: RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5053: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5054: if (fi >= max) RRETURN(MATCH_NOMATCH);
5055: if (eptr >= md->end_subject)
5056: {
5057: SCHECK_PARTIAL();
5058: RRETURN(MATCH_NOMATCH);
5059: }
5060: if (ctype == OP_ANY && IS_NEWLINE(eptr))
5061: RRETURN(MATCH_NOMATCH);
5062: GETCHARINC(c, eptr);
5063: switch(ctype)
5064: {
1.1.1.3 ! misho 5065: case OP_ANY: /* This is the non-NL case */
! 5066: if (md->partial != 0 && /* Take care with CRLF partial */
! 5067: eptr >= md->end_subject &&
! 5068: NLBLOCK->nltype == NLTYPE_FIXED &&
! 5069: NLBLOCK->nllen == 2 &&
! 5070: c == NLBLOCK->nl[0])
! 5071: {
! 5072: md->hitend = TRUE;
! 5073: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
! 5074: }
! 5075: break;
! 5076:
1.1 misho 5077: case OP_ALLANY:
5078: case OP_ANYBYTE:
5079: break;
5080:
5081: case OP_ANYNL:
5082: switch(c)
5083: {
5084: default: RRETURN(MATCH_NOMATCH);
5085: case 0x000d:
5086: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5087: break;
5088: case 0x000a:
5089: break;
5090:
5091: case 0x000b:
5092: case 0x000c:
5093: case 0x0085:
5094: case 0x2028:
5095: case 0x2029:
5096: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5097: break;
5098: }
5099: break;
5100:
5101: case OP_NOT_HSPACE:
5102: switch(c)
5103: {
5104: default: break;
5105: case 0x09: /* HT */
5106: case 0x20: /* SPACE */
5107: case 0xa0: /* NBSP */
5108: case 0x1680: /* OGHAM SPACE MARK */
5109: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5110: case 0x2000: /* EN QUAD */
5111: case 0x2001: /* EM QUAD */
5112: case 0x2002: /* EN SPACE */
5113: case 0x2003: /* EM SPACE */
5114: case 0x2004: /* THREE-PER-EM SPACE */
5115: case 0x2005: /* FOUR-PER-EM SPACE */
5116: case 0x2006: /* SIX-PER-EM SPACE */
5117: case 0x2007: /* FIGURE SPACE */
5118: case 0x2008: /* PUNCTUATION SPACE */
5119: case 0x2009: /* THIN SPACE */
5120: case 0x200A: /* HAIR SPACE */
5121: case 0x202f: /* NARROW NO-BREAK SPACE */
5122: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5123: case 0x3000: /* IDEOGRAPHIC SPACE */
5124: RRETURN(MATCH_NOMATCH);
5125: }
5126: break;
5127:
5128: case OP_HSPACE:
5129: switch(c)
5130: {
5131: default: RRETURN(MATCH_NOMATCH);
5132: case 0x09: /* HT */
5133: case 0x20: /* SPACE */
5134: case 0xa0: /* NBSP */
5135: case 0x1680: /* OGHAM SPACE MARK */
5136: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5137: case 0x2000: /* EN QUAD */
5138: case 0x2001: /* EM QUAD */
5139: case 0x2002: /* EN SPACE */
5140: case 0x2003: /* EM SPACE */
5141: case 0x2004: /* THREE-PER-EM SPACE */
5142: case 0x2005: /* FOUR-PER-EM SPACE */
5143: case 0x2006: /* SIX-PER-EM SPACE */
5144: case 0x2007: /* FIGURE SPACE */
5145: case 0x2008: /* PUNCTUATION SPACE */
5146: case 0x2009: /* THIN SPACE */
5147: case 0x200A: /* HAIR SPACE */
5148: case 0x202f: /* NARROW NO-BREAK SPACE */
5149: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5150: case 0x3000: /* IDEOGRAPHIC SPACE */
5151: break;
5152: }
5153: break;
5154:
5155: case OP_NOT_VSPACE:
5156: switch(c)
5157: {
5158: default: break;
5159: case 0x0a: /* LF */
5160: case 0x0b: /* VT */
5161: case 0x0c: /* FF */
5162: case 0x0d: /* CR */
5163: case 0x85: /* NEL */
5164: case 0x2028: /* LINE SEPARATOR */
5165: case 0x2029: /* PARAGRAPH SEPARATOR */
5166: RRETURN(MATCH_NOMATCH);
5167: }
5168: break;
5169:
5170: case OP_VSPACE:
5171: switch(c)
5172: {
5173: default: RRETURN(MATCH_NOMATCH);
5174: case 0x0a: /* LF */
5175: case 0x0b: /* VT */
5176: case 0x0c: /* FF */
5177: case 0x0d: /* CR */
5178: case 0x85: /* NEL */
5179: case 0x2028: /* LINE SEPARATOR */
5180: case 0x2029: /* PARAGRAPH SEPARATOR */
5181: break;
5182: }
5183: break;
5184:
5185: case OP_NOT_DIGIT:
5186: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5187: RRETURN(MATCH_NOMATCH);
5188: break;
5189:
5190: case OP_DIGIT:
5191: if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5192: RRETURN(MATCH_NOMATCH);
5193: break;
5194:
5195: case OP_NOT_WHITESPACE:
5196: if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5197: RRETURN(MATCH_NOMATCH);
5198: break;
5199:
5200: case OP_WHITESPACE:
1.1.1.2 misho 5201: if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
1.1 misho 5202: RRETURN(MATCH_NOMATCH);
5203: break;
5204:
5205: case OP_NOT_WORDCHAR:
5206: if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5207: RRETURN(MATCH_NOMATCH);
5208: break;
5209:
5210: case OP_WORDCHAR:
5211: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5212: RRETURN(MATCH_NOMATCH);
5213: break;
5214:
5215: default:
5216: RRETURN(PCRE_ERROR_INTERNAL);
5217: }
5218: }
5219: }
5220: else
5221: #endif
1.1.1.2 misho 5222: /* Not UTF mode */
1.1 misho 5223: {
5224: for (fi = min;; fi++)
5225: {
5226: RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5227: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5228: if (fi >= max) RRETURN(MATCH_NOMATCH);
5229: if (eptr >= md->end_subject)
5230: {
5231: SCHECK_PARTIAL();
5232: RRETURN(MATCH_NOMATCH);
5233: }
5234: if (ctype == OP_ANY && IS_NEWLINE(eptr))
5235: RRETURN(MATCH_NOMATCH);
5236: c = *eptr++;
5237: switch(ctype)
5238: {
1.1.1.3 ! misho 5239: case OP_ANY: /* This is the non-NL case */
! 5240: if (md->partial != 0 && /* Take care with CRLF partial */
! 5241: eptr >= md->end_subject &&
! 5242: NLBLOCK->nltype == NLTYPE_FIXED &&
! 5243: NLBLOCK->nllen == 2 &&
! 5244: c == NLBLOCK->nl[0])
! 5245: {
! 5246: md->hitend = TRUE;
! 5247: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
! 5248: }
! 5249: break;
! 5250:
1.1 misho 5251: case OP_ALLANY:
5252: case OP_ANYBYTE:
5253: break;
5254:
5255: case OP_ANYNL:
5256: switch(c)
5257: {
5258: default: RRETURN(MATCH_NOMATCH);
5259: case 0x000d:
5260: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5261: break;
5262:
5263: case 0x000a:
5264: break;
5265:
5266: case 0x000b:
5267: case 0x000c:
5268: case 0x0085:
1.1.1.2 misho 5269: #ifdef COMPILE_PCRE16
5270: case 0x2028:
5271: case 0x2029:
5272: #endif
1.1 misho 5273: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5274: break;
5275: }
5276: break;
5277:
5278: case OP_NOT_HSPACE:
5279: switch(c)
5280: {
5281: default: break;
5282: case 0x09: /* HT */
5283: case 0x20: /* SPACE */
5284: case 0xa0: /* NBSP */
1.1.1.2 misho 5285: #ifdef COMPILE_PCRE16
5286: case 0x1680: /* OGHAM SPACE MARK */
5287: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5288: case 0x2000: /* EN QUAD */
5289: case 0x2001: /* EM QUAD */
5290: case 0x2002: /* EN SPACE */
5291: case 0x2003: /* EM SPACE */
5292: case 0x2004: /* THREE-PER-EM SPACE */
5293: case 0x2005: /* FOUR-PER-EM SPACE */
5294: case 0x2006: /* SIX-PER-EM SPACE */
5295: case 0x2007: /* FIGURE SPACE */
5296: case 0x2008: /* PUNCTUATION SPACE */
5297: case 0x2009: /* THIN SPACE */
5298: case 0x200A: /* HAIR SPACE */
5299: case 0x202f: /* NARROW NO-BREAK SPACE */
5300: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5301: case 0x3000: /* IDEOGRAPHIC SPACE */
5302: #endif
1.1 misho 5303: RRETURN(MATCH_NOMATCH);
5304: }
5305: break;
5306:
5307: case OP_HSPACE:
5308: switch(c)
5309: {
5310: default: RRETURN(MATCH_NOMATCH);
5311: case 0x09: /* HT */
5312: case 0x20: /* SPACE */
5313: case 0xa0: /* NBSP */
1.1.1.2 misho 5314: #ifdef COMPILE_PCRE16
5315: case 0x1680: /* OGHAM SPACE MARK */
5316: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5317: case 0x2000: /* EN QUAD */
5318: case 0x2001: /* EM QUAD */
5319: case 0x2002: /* EN SPACE */
5320: case 0x2003: /* EM SPACE */
5321: case 0x2004: /* THREE-PER-EM SPACE */
5322: case 0x2005: /* FOUR-PER-EM SPACE */
5323: case 0x2006: /* SIX-PER-EM SPACE */
5324: case 0x2007: /* FIGURE SPACE */
5325: case 0x2008: /* PUNCTUATION SPACE */
5326: case 0x2009: /* THIN SPACE */
5327: case 0x200A: /* HAIR SPACE */
5328: case 0x202f: /* NARROW NO-BREAK SPACE */
5329: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5330: case 0x3000: /* IDEOGRAPHIC SPACE */
5331: #endif
1.1 misho 5332: break;
5333: }
5334: break;
5335:
5336: case OP_NOT_VSPACE:
5337: switch(c)
5338: {
5339: default: break;
5340: case 0x0a: /* LF */
5341: case 0x0b: /* VT */
5342: case 0x0c: /* FF */
5343: case 0x0d: /* CR */
5344: case 0x85: /* NEL */
1.1.1.2 misho 5345: #ifdef COMPILE_PCRE16
5346: case 0x2028: /* LINE SEPARATOR */
5347: case 0x2029: /* PARAGRAPH SEPARATOR */
5348: #endif
1.1 misho 5349: RRETURN(MATCH_NOMATCH);
5350: }
5351: break;
5352:
5353: case OP_VSPACE:
5354: switch(c)
5355: {
5356: default: RRETURN(MATCH_NOMATCH);
5357: case 0x0a: /* LF */
5358: case 0x0b: /* VT */
5359: case 0x0c: /* FF */
5360: case 0x0d: /* CR */
5361: case 0x85: /* NEL */
1.1.1.2 misho 5362: #ifdef COMPILE_PCRE16
5363: case 0x2028: /* LINE SEPARATOR */
5364: case 0x2029: /* PARAGRAPH SEPARATOR */
5365: #endif
1.1 misho 5366: break;
5367: }
5368: break;
5369:
5370: case OP_NOT_DIGIT:
1.1.1.2 misho 5371: if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
1.1 misho 5372: break;
5373:
5374: case OP_DIGIT:
1.1.1.2 misho 5375: if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
1.1 misho 5376: break;
5377:
5378: case OP_NOT_WHITESPACE:
1.1.1.2 misho 5379: if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
1.1 misho 5380: break;
5381:
5382: case OP_WHITESPACE:
1.1.1.2 misho 5383: if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
1.1 misho 5384: break;
5385:
5386: case OP_NOT_WORDCHAR:
1.1.1.2 misho 5387: if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
1.1 misho 5388: break;
5389:
5390: case OP_WORDCHAR:
1.1.1.2 misho 5391: if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
1.1 misho 5392: break;
5393:
5394: default:
5395: RRETURN(PCRE_ERROR_INTERNAL);
5396: }
5397: }
5398: }
5399: /* Control never gets here */
5400: }
5401:
5402: /* If maximizing, it is worth using inline code for speed, doing the type
5403: test once at the start (i.e. keep it out of the loop). Again, keep the
5404: UTF-8 and UCP stuff separate. */
5405:
5406: else
5407: {
5408: pp = eptr; /* Remember where we started */
5409:
5410: #ifdef SUPPORT_UCP
5411: if (prop_type >= 0)
5412: {
5413: switch(prop_type)
5414: {
5415: case PT_ANY:
5416: for (i = min; i < max; i++)
5417: {
5418: int len = 1;
5419: if (eptr >= md->end_subject)
5420: {
5421: SCHECK_PARTIAL();
5422: break;
5423: }
5424: GETCHARLENTEST(c, eptr, len);
5425: if (prop_fail_result) break;
5426: eptr+= len;
5427: }
5428: break;
5429:
5430: case PT_LAMP:
5431: for (i = min; i < max; i++)
5432: {
5433: int chartype;
5434: int len = 1;
5435: if (eptr >= md->end_subject)
5436: {
5437: SCHECK_PARTIAL();
5438: break;
5439: }
5440: GETCHARLENTEST(c, eptr, len);
5441: chartype = UCD_CHARTYPE(c);
5442: if ((chartype == ucp_Lu ||
5443: chartype == ucp_Ll ||
5444: chartype == ucp_Lt) == prop_fail_result)
5445: break;
5446: eptr+= len;
5447: }
5448: break;
5449:
5450: case PT_GC:
5451: for (i = min; i < max; i++)
5452: {
5453: int len = 1;
5454: if (eptr >= md->end_subject)
5455: {
5456: SCHECK_PARTIAL();
5457: break;
5458: }
5459: GETCHARLENTEST(c, eptr, len);
5460: if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5461: eptr+= len;
5462: }
5463: break;
5464:
5465: case PT_PC:
5466: for (i = min; i < max; i++)
5467: {
5468: int len = 1;
5469: if (eptr >= md->end_subject)
5470: {
5471: SCHECK_PARTIAL();
5472: break;
5473: }
5474: GETCHARLENTEST(c, eptr, len);
5475: if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5476: eptr+= len;
5477: }
5478: break;
5479:
5480: case PT_SC:
5481: for (i = min; i < max; i++)
5482: {
5483: int len = 1;
5484: if (eptr >= md->end_subject)
5485: {
5486: SCHECK_PARTIAL();
5487: break;
5488: }
5489: GETCHARLENTEST(c, eptr, len);
5490: if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5491: eptr+= len;
5492: }
5493: break;
5494:
5495: case PT_ALNUM:
5496: for (i = min; i < max; i++)
5497: {
5498: int category;
5499: int len = 1;
5500: if (eptr >= md->end_subject)
5501: {
5502: SCHECK_PARTIAL();
5503: break;
5504: }
5505: GETCHARLENTEST(c, eptr, len);
5506: category = UCD_CATEGORY(c);
5507: if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5508: break;
5509: eptr+= len;
5510: }
5511: break;
5512:
5513: case PT_SPACE: /* Perl space */
5514: for (i = min; i < max; i++)
5515: {
5516: int len = 1;
5517: if (eptr >= md->end_subject)
5518: {
5519: SCHECK_PARTIAL();
5520: break;
5521: }
5522: GETCHARLENTEST(c, eptr, len);
5523: if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5524: c == CHAR_FF || c == CHAR_CR)
5525: == prop_fail_result)
5526: break;
5527: eptr+= len;
5528: }
5529: break;
5530:
5531: case PT_PXSPACE: /* POSIX space */
5532: for (i = min; i < max; i++)
5533: {
5534: int len = 1;
5535: if (eptr >= md->end_subject)
5536: {
5537: SCHECK_PARTIAL();
5538: break;
5539: }
5540: GETCHARLENTEST(c, eptr, len);
5541: if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5542: c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5543: == prop_fail_result)
5544: break;
5545: eptr+= len;
5546: }
5547: break;
5548:
5549: case PT_WORD:
5550: for (i = min; i < max; i++)
5551: {
5552: int category;
5553: int len = 1;
5554: if (eptr >= md->end_subject)
5555: {
5556: SCHECK_PARTIAL();
5557: break;
5558: }
5559: GETCHARLENTEST(c, eptr, len);
5560: category = UCD_CATEGORY(c);
5561: if ((category == ucp_L || category == ucp_N ||
5562: c == CHAR_UNDERSCORE) == prop_fail_result)
5563: break;
5564: eptr+= len;
5565: }
5566: break;
5567:
5568: default:
5569: RRETURN(PCRE_ERROR_INTERNAL);
5570: }
5571:
5572: /* eptr is now past the end of the maximum run */
5573:
5574: if (possessive) continue;
5575: for(;;)
5576: {
5577: RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5578: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5579: if (eptr-- == pp) break; /* Stop if tried at original pos */
1.1.1.2 misho 5580: if (utf) BACKCHAR(eptr);
1.1 misho 5581: }
5582: }
5583:
5584: /* Match extended Unicode sequences. We will get here only if the
5585: support is in the binary; otherwise a compile-time error occurs. */
5586:
5587: else if (ctype == OP_EXTUNI)
5588: {
5589: for (i = min; i < max; i++)
5590: {
5591: int len = 1;
5592: if (eptr >= md->end_subject)
5593: {
5594: SCHECK_PARTIAL();
5595: break;
5596: }
1.1.1.2 misho 5597: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
1.1 misho 5598: if (UCD_CATEGORY(c) == ucp_M) break;
5599: eptr += len;
5600: while (eptr < md->end_subject)
5601: {
5602: len = 1;
1.1.1.2 misho 5603: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
1.1 misho 5604: if (UCD_CATEGORY(c) != ucp_M) break;
5605: eptr += len;
5606: }
1.1.1.3 ! misho 5607: CHECK_PARTIAL();
1.1 misho 5608: }
5609:
5610: /* eptr is now past the end of the maximum run */
5611:
5612: if (possessive) continue;
5613:
5614: for(;;)
5615: {
5616: RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5617: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5618: if (eptr-- == pp) break; /* Stop if tried at original pos */
5619: for (;;) /* Move back over one extended */
5620: {
1.1.1.2 misho 5621: if (!utf) c = *eptr; else
1.1 misho 5622: {
5623: BACKCHAR(eptr);
5624: GETCHAR(c, eptr);
5625: }
5626: if (UCD_CATEGORY(c) != ucp_M) break;
5627: eptr--;
5628: }
5629: }
5630: }
5631:
5632: else
5633: #endif /* SUPPORT_UCP */
5634:
1.1.1.2 misho 5635: #ifdef SUPPORT_UTF
5636: if (utf)
1.1 misho 5637: {
5638: switch(ctype)
5639: {
5640: case OP_ANY:
5641: if (max < INT_MAX)
5642: {
5643: for (i = min; i < max; i++)
5644: {
5645: if (eptr >= md->end_subject)
5646: {
5647: SCHECK_PARTIAL();
5648: break;
5649: }
5650: if (IS_NEWLINE(eptr)) break;
1.1.1.3 ! misho 5651: if (md->partial != 0 && /* Take care with CRLF partial */
! 5652: eptr + 1 >= md->end_subject &&
! 5653: NLBLOCK->nltype == NLTYPE_FIXED &&
! 5654: NLBLOCK->nllen == 2 &&
! 5655: *eptr == NLBLOCK->nl[0])
! 5656: {
! 5657: md->hitend = TRUE;
! 5658: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
! 5659: }
1.1 misho 5660: eptr++;
1.1.1.2 misho 5661: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misho 5662: }
5663: }
5664:
5665: /* Handle unlimited UTF-8 repeat */
5666:
5667: else
5668: {
5669: for (i = min; i < max; i++)
5670: {
5671: if (eptr >= md->end_subject)
5672: {
5673: SCHECK_PARTIAL();
5674: break;
5675: }
5676: if (IS_NEWLINE(eptr)) break;
1.1.1.3 ! misho 5677: if (md->partial != 0 && /* Take care with CRLF partial */
! 5678: eptr + 1 >= md->end_subject &&
! 5679: NLBLOCK->nltype == NLTYPE_FIXED &&
! 5680: NLBLOCK->nllen == 2 &&
! 5681: *eptr == NLBLOCK->nl[0])
! 5682: {
! 5683: md->hitend = TRUE;
! 5684: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
! 5685: }
1.1 misho 5686: eptr++;
1.1.1.2 misho 5687: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misho 5688: }
5689: }
5690: break;
5691:
5692: case OP_ALLANY:
5693: if (max < INT_MAX)
5694: {
5695: for (i = min; i < max; i++)
5696: {
5697: if (eptr >= md->end_subject)
5698: {
5699: SCHECK_PARTIAL();
5700: break;
5701: }
5702: eptr++;
1.1.1.2 misho 5703: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misho 5704: }
5705: }
5706: else
5707: {
5708: eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5709: SCHECK_PARTIAL();
5710: }
5711: break;
5712:
5713: /* The byte case is the same as non-UTF8 */
5714:
5715: case OP_ANYBYTE:
5716: c = max - min;
5717: if (c > (unsigned int)(md->end_subject - eptr))
5718: {
5719: eptr = md->end_subject;
5720: SCHECK_PARTIAL();
5721: }
5722: else eptr += c;
5723: break;
5724:
5725: case OP_ANYNL:
5726: for (i = min; i < max; i++)
5727: {
5728: int len = 1;
5729: if (eptr >= md->end_subject)
5730: {
5731: SCHECK_PARTIAL();
5732: break;
5733: }
5734: GETCHARLEN(c, eptr, len);
5735: if (c == 0x000d)
5736: {
5737: if (++eptr >= md->end_subject) break;
5738: if (*eptr == 0x000a) eptr++;
5739: }
5740: else
5741: {
5742: if (c != 0x000a &&
5743: (md->bsr_anycrlf ||
5744: (c != 0x000b && c != 0x000c &&
5745: c != 0x0085 && c != 0x2028 && c != 0x2029)))
5746: break;
5747: eptr += len;
5748: }
5749: }
5750: break;
5751:
5752: case OP_NOT_HSPACE:
5753: case OP_HSPACE:
5754: for (i = min; i < max; i++)
5755: {
5756: BOOL gotspace;
5757: int len = 1;
5758: if (eptr >= md->end_subject)
5759: {
5760: SCHECK_PARTIAL();
5761: break;
5762: }
5763: GETCHARLEN(c, eptr, len);
5764: switch(c)
5765: {
5766: default: gotspace = FALSE; break;
5767: case 0x09: /* HT */
5768: case 0x20: /* SPACE */
5769: case 0xa0: /* NBSP */
5770: case 0x1680: /* OGHAM SPACE MARK */
5771: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5772: case 0x2000: /* EN QUAD */
5773: case 0x2001: /* EM QUAD */
5774: case 0x2002: /* EN SPACE */
5775: case 0x2003: /* EM SPACE */
5776: case 0x2004: /* THREE-PER-EM SPACE */
5777: case 0x2005: /* FOUR-PER-EM SPACE */
5778: case 0x2006: /* SIX-PER-EM SPACE */
5779: case 0x2007: /* FIGURE SPACE */
5780: case 0x2008: /* PUNCTUATION SPACE */
5781: case 0x2009: /* THIN SPACE */
5782: case 0x200A: /* HAIR SPACE */
5783: case 0x202f: /* NARROW NO-BREAK SPACE */
5784: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5785: case 0x3000: /* IDEOGRAPHIC SPACE */
5786: gotspace = TRUE;
5787: break;
5788: }
5789: if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5790: eptr += len;
5791: }
5792: break;
5793:
5794: case OP_NOT_VSPACE:
5795: case OP_VSPACE:
5796: for (i = min; i < max; i++)
5797: {
5798: BOOL gotspace;
5799: int len = 1;
5800: if (eptr >= md->end_subject)
5801: {
5802: SCHECK_PARTIAL();
5803: break;
5804: }
5805: GETCHARLEN(c, eptr, len);
5806: switch(c)
5807: {
5808: default: gotspace = FALSE; break;
5809: case 0x0a: /* LF */
5810: case 0x0b: /* VT */
5811: case 0x0c: /* FF */
5812: case 0x0d: /* CR */
5813: case 0x85: /* NEL */
5814: case 0x2028: /* LINE SEPARATOR */
5815: case 0x2029: /* PARAGRAPH SEPARATOR */
5816: gotspace = TRUE;
5817: break;
5818: }
5819: if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5820: eptr += len;
5821: }
5822: break;
5823:
5824: case OP_NOT_DIGIT:
5825: for (i = min; i < max; i++)
5826: {
5827: int len = 1;
5828: if (eptr >= md->end_subject)
5829: {
5830: SCHECK_PARTIAL();
5831: break;
5832: }
5833: GETCHARLEN(c, eptr, len);
5834: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5835: eptr+= len;
5836: }
5837: break;
5838:
5839: case OP_DIGIT:
5840: for (i = min; i < max; i++)
5841: {
5842: int len = 1;
5843: if (eptr >= md->end_subject)
5844: {
5845: SCHECK_PARTIAL();
5846: break;
5847: }
5848: GETCHARLEN(c, eptr, len);
5849: if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5850: eptr+= len;
5851: }
5852: break;
5853:
5854: case OP_NOT_WHITESPACE:
5855: for (i = min; i < max; i++)
5856: {
5857: int len = 1;
5858: if (eptr >= md->end_subject)
5859: {
5860: SCHECK_PARTIAL();
5861: break;
5862: }
5863: GETCHARLEN(c, eptr, len);
5864: if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5865: eptr+= len;
5866: }
5867: break;
5868:
5869: case OP_WHITESPACE:
5870: for (i = min; i < max; i++)
5871: {
5872: int len = 1;
5873: if (eptr >= md->end_subject)
5874: {
5875: SCHECK_PARTIAL();
5876: break;
5877: }
5878: GETCHARLEN(c, eptr, len);
5879: if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5880: eptr+= len;
5881: }
5882: break;
5883:
5884: case OP_NOT_WORDCHAR:
5885: for (i = min; i < max; i++)
5886: {
5887: int len = 1;
5888: if (eptr >= md->end_subject)
5889: {
5890: SCHECK_PARTIAL();
5891: break;
5892: }
5893: GETCHARLEN(c, eptr, len);
5894: if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5895: eptr+= len;
5896: }
5897: break;
5898:
5899: case OP_WORDCHAR:
5900: for (i = min; i < max; i++)
5901: {
5902: int len = 1;
5903: if (eptr >= md->end_subject)
5904: {
5905: SCHECK_PARTIAL();
5906: break;
5907: }
5908: GETCHARLEN(c, eptr, len);
5909: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5910: eptr+= len;
5911: }
5912: break;
5913:
5914: default:
5915: RRETURN(PCRE_ERROR_INTERNAL);
5916: }
5917:
5918: /* eptr is now past the end of the maximum run. If possessive, we are
5919: done (no backing up). Otherwise, match at this position; anything other
5920: than no match is immediately returned. For nomatch, back up one
5921: character, unless we are matching \R and the last thing matched was
5922: \r\n, in which case, back up two bytes. */
5923:
5924: if (possessive) continue;
5925: for(;;)
5926: {
5927: RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5928: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5929: if (eptr-- == pp) break; /* Stop if tried at original pos */
5930: BACKCHAR(eptr);
5931: if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5932: eptr[-1] == '\r') eptr--;
5933: }
5934: }
5935: else
1.1.1.2 misho 5936: #endif /* SUPPORT_UTF */
5937: /* Not UTF mode */
1.1 misho 5938: {
5939: switch(ctype)
5940: {
5941: case OP_ANY:
5942: for (i = min; i < max; i++)
5943: {
5944: if (eptr >= md->end_subject)
5945: {
5946: SCHECK_PARTIAL();
5947: break;
5948: }
5949: if (IS_NEWLINE(eptr)) break;
1.1.1.3 ! misho 5950: if (md->partial != 0 && /* Take care with CRLF partial */
! 5951: eptr + 1 >= md->end_subject &&
! 5952: NLBLOCK->nltype == NLTYPE_FIXED &&
! 5953: NLBLOCK->nllen == 2 &&
! 5954: *eptr == NLBLOCK->nl[0])
! 5955: {
! 5956: md->hitend = TRUE;
! 5957: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
! 5958: }
1.1 misho 5959: eptr++;
5960: }
5961: break;
5962:
5963: case OP_ALLANY:
5964: case OP_ANYBYTE:
5965: c = max - min;
5966: if (c > (unsigned int)(md->end_subject - eptr))
5967: {
5968: eptr = md->end_subject;
5969: SCHECK_PARTIAL();
5970: }
5971: else eptr += c;
5972: break;
5973:
5974: case OP_ANYNL:
5975: for (i = min; i < max; i++)
5976: {
5977: if (eptr >= md->end_subject)
5978: {
5979: SCHECK_PARTIAL();
5980: break;
5981: }
5982: c = *eptr;
5983: if (c == 0x000d)
5984: {
5985: if (++eptr >= md->end_subject) break;
5986: if (*eptr == 0x000a) eptr++;
5987: }
5988: else
5989: {
1.1.1.2 misho 5990: if (c != 0x000a && (md->bsr_anycrlf ||
5991: (c != 0x000b && c != 0x000c && c != 0x0085
5992: #ifdef COMPILE_PCRE16
5993: && c != 0x2028 && c != 0x2029
5994: #endif
5995: ))) break;
1.1 misho 5996: eptr++;
5997: }
5998: }
5999: break;
6000:
6001: case OP_NOT_HSPACE:
6002: for (i = min; i < max; i++)
6003: {
6004: if (eptr >= md->end_subject)
6005: {
6006: SCHECK_PARTIAL();
6007: break;
6008: }
6009: c = *eptr;
1.1.1.2 misho 6010: if (c == 0x09 || c == 0x20 || c == 0xa0
6011: #ifdef COMPILE_PCRE16
6012: || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A)
6013: || c == 0x202f || c == 0x205f || c == 0x3000
6014: #endif
6015: ) break;
1.1 misho 6016: eptr++;
6017: }
6018: break;
6019:
6020: case OP_HSPACE:
6021: for (i = min; i < max; i++)
6022: {
6023: if (eptr >= md->end_subject)
6024: {
6025: SCHECK_PARTIAL();
6026: break;
6027: }
6028: c = *eptr;
1.1.1.2 misho 6029: if (c != 0x09 && c != 0x20 && c != 0xa0
6030: #ifdef COMPILE_PCRE16
6031: && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A)
6032: && c != 0x202f && c != 0x205f && c != 0x3000
6033: #endif
6034: ) break;
1.1 misho 6035: eptr++;
6036: }
6037: break;
6038:
6039: case OP_NOT_VSPACE:
6040: for (i = min; i < max; i++)
6041: {
6042: if (eptr >= md->end_subject)
6043: {
6044: SCHECK_PARTIAL();
6045: break;
6046: }
6047: c = *eptr;
1.1.1.2 misho 6048: if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85
6049: #ifdef COMPILE_PCRE16
6050: || c == 0x2028 || c == 0x2029
6051: #endif
6052: ) break;
1.1 misho 6053: eptr++;
6054: }
6055: break;
6056:
6057: case OP_VSPACE:
6058: for (i = min; i < max; i++)
6059: {
6060: if (eptr >= md->end_subject)
6061: {
6062: SCHECK_PARTIAL();
6063: break;
6064: }
6065: c = *eptr;
1.1.1.2 misho 6066: if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85
6067: #ifdef COMPILE_PCRE16
6068: && c != 0x2028 && c != 0x2029
6069: #endif
6070: ) break;
1.1 misho 6071: eptr++;
6072: }
6073: break;
6074:
6075: case OP_NOT_DIGIT:
6076: for (i = min; i < max; i++)
6077: {
6078: if (eptr >= md->end_subject)
6079: {
6080: SCHECK_PARTIAL();
6081: break;
6082: }
1.1.1.2 misho 6083: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
1.1 misho 6084: eptr++;
6085: }
6086: break;
6087:
6088: case OP_DIGIT:
6089: for (i = min; i < max; i++)
6090: {
6091: if (eptr >= md->end_subject)
6092: {
6093: SCHECK_PARTIAL();
6094: break;
6095: }
1.1.1.2 misho 6096: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
1.1 misho 6097: eptr++;
6098: }
6099: break;
6100:
6101: case OP_NOT_WHITESPACE:
6102: for (i = min; i < max; i++)
6103: {
6104: if (eptr >= md->end_subject)
6105: {
6106: SCHECK_PARTIAL();
6107: break;
6108: }
1.1.1.2 misho 6109: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
1.1 misho 6110: eptr++;
6111: }
6112: break;
6113:
6114: case OP_WHITESPACE:
6115: for (i = min; i < max; i++)
6116: {
6117: if (eptr >= md->end_subject)
6118: {
6119: SCHECK_PARTIAL();
6120: break;
6121: }
1.1.1.2 misho 6122: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
1.1 misho 6123: eptr++;
6124: }
6125: break;
6126:
6127: case OP_NOT_WORDCHAR:
6128: for (i = min; i < max; i++)
6129: {
6130: if (eptr >= md->end_subject)
6131: {
6132: SCHECK_PARTIAL();
6133: break;
6134: }
1.1.1.2 misho 6135: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
1.1 misho 6136: eptr++;
6137: }
6138: break;
6139:
6140: case OP_WORDCHAR:
6141: for (i = min; i < max; i++)
6142: {
6143: if (eptr >= md->end_subject)
6144: {
6145: SCHECK_PARTIAL();
6146: break;
6147: }
1.1.1.2 misho 6148: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
1.1 misho 6149: eptr++;
6150: }
6151: break;
6152:
6153: default:
6154: RRETURN(PCRE_ERROR_INTERNAL);
6155: }
6156:
6157: /* eptr is now past the end of the maximum run. If possessive, we are
6158: done (no backing up). Otherwise, match at this position; anything other
6159: than no match is immediately returned. For nomatch, back up one
6160: character (byte), unless we are matching \R and the last thing matched
6161: was \r\n, in which case, back up two bytes. */
6162:
6163: if (possessive) continue;
6164: while (eptr >= pp)
6165: {
6166: RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6167: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6168: eptr--;
6169: if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
6170: eptr[-1] == '\r') eptr--;
6171: }
6172: }
6173:
6174: /* Get here if we can't make it match with any permitted repetitions */
6175:
6176: RRETURN(MATCH_NOMATCH);
6177: }
6178: /* Control never gets here */
6179:
6180: /* There's been some horrible disaster. Arrival here can only mean there is
6181: something seriously wrong in the code above or the OP_xxx definitions. */
6182:
6183: default:
6184: DPRINTF(("Unknown opcode %d\n", *ecode));
6185: RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6186: }
6187:
6188: /* Do not stick any code in here without much thought; it is assumed
6189: that "continue" in the code above comes out to here to repeat the main
6190: loop. */
6191:
6192: } /* End of main loop */
6193: /* Control never reaches here */
6194:
6195:
6196: /* When compiling to use the heap rather than the stack for recursive calls to
6197: match(), the RRETURN() macro jumps here. The number that is saved in
6198: frame->Xwhere indicates which label we actually want to return to. */
6199:
6200: #ifdef NO_RECURSE
6201: #define LBL(val) case val: goto L_RM##val;
6202: HEAP_RETURN:
6203: switch (frame->Xwhere)
6204: {
6205: LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6206: LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6207: LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6208: LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6209: LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6210: LBL(65) LBL(66)
1.1.1.2 misho 6211: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6212: LBL(21)
6213: #endif
6214: #ifdef SUPPORT_UTF
6215: LBL(16) LBL(18) LBL(20)
6216: LBL(22) LBL(23) LBL(28) LBL(30)
1.1 misho 6217: LBL(32) LBL(34) LBL(42) LBL(46)
6218: #ifdef SUPPORT_UCP
6219: LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6220: LBL(59) LBL(60) LBL(61) LBL(62)
6221: #endif /* SUPPORT_UCP */
1.1.1.2 misho 6222: #endif /* SUPPORT_UTF */
1.1 misho 6223: default:
6224: DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
1.1.1.2 misho 6225:
6226: printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere);
6227:
1.1 misho 6228: return PCRE_ERROR_INTERNAL;
6229: }
6230: #undef LBL
6231: #endif /* NO_RECURSE */
6232: }
6233:
6234:
6235: /***************************************************************************
6236: ****************************************************************************
6237: RECURSION IN THE match() FUNCTION
6238:
6239: Undefine all the macros that were defined above to handle this. */
6240:
6241: #ifdef NO_RECURSE
6242: #undef eptr
6243: #undef ecode
6244: #undef mstart
6245: #undef offset_top
6246: #undef eptrb
6247: #undef flags
6248:
6249: #undef callpat
6250: #undef charptr
6251: #undef data
6252: #undef next
6253: #undef pp
6254: #undef prev
6255: #undef saved_eptr
6256:
6257: #undef new_recursive
6258:
6259: #undef cur_is_word
6260: #undef condition
6261: #undef prev_is_word
6262:
6263: #undef ctype
6264: #undef length
6265: #undef max
6266: #undef min
6267: #undef number
6268: #undef offset
6269: #undef op
6270: #undef save_capture_last
6271: #undef save_offset1
6272: #undef save_offset2
6273: #undef save_offset3
6274: #undef stacksave
6275:
6276: #undef newptrb
6277:
6278: #endif
6279:
6280: /* These two are defined as macros in both cases */
6281:
6282: #undef fc
6283: #undef fi
6284:
6285: /***************************************************************************
6286: ***************************************************************************/
6287:
6288:
1.1.1.3 ! misho 6289: #ifdef NO_RECURSE
! 6290: /*************************************************
! 6291: * Release allocated heap frames *
! 6292: *************************************************/
! 6293:
! 6294: /* This function releases all the allocated frames. The base frame is on the
! 6295: machine stack, and so must not be freed.
! 6296:
! 6297: Argument: the address of the base frame
! 6298: Returns: nothing
! 6299: */
! 6300:
! 6301: static void
! 6302: release_match_heapframes (heapframe *frame_base)
! 6303: {
! 6304: heapframe *nextframe = frame_base->Xnextframe;
! 6305: while (nextframe != NULL)
! 6306: {
! 6307: heapframe *oldframe = nextframe;
! 6308: nextframe = nextframe->Xnextframe;
! 6309: (PUBL(stack_free))(oldframe);
! 6310: }
! 6311: }
! 6312: #endif
! 6313:
1.1 misho 6314:
6315: /*************************************************
6316: * Execute a Regular Expression *
6317: *************************************************/
6318:
6319: /* This function applies a compiled re to a subject string and picks out
6320: portions of the string if it matches. Two elements in the vector are set for
6321: each substring: the offsets to the start and end of the substring.
6322:
6323: Arguments:
6324: argument_re points to the compiled expression
6325: extra_data points to extra data or is NULL
6326: subject points to the subject string
6327: length length of subject string (may contain binary zeros)
6328: start_offset where to start in the subject string
6329: options option bits
6330: offsets points to a vector of ints to be filled in with offsets
6331: offsetcount the number of elements in the vector
6332:
6333: Returns: > 0 => success; value is the number of elements filled in
6334: = 0 => success, but offsets is not big enough
6335: -1 => failed to match
6336: < -1 => some kind of unexpected problem
6337: */
6338:
1.1.1.2 misho 6339: #ifdef COMPILE_PCRE8
1.1 misho 6340: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6341: pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6342: PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6343: int offsetcount)
1.1.1.2 misho 6344: #else
6345: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6346: pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6347: PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6348: int offsetcount)
6349: #endif
1.1 misho 6350: {
6351: int rc, ocount, arg_offset_max;
6352: int newline;
6353: BOOL using_temporary_offsets = FALSE;
6354: BOOL anchored;
6355: BOOL startline;
6356: BOOL firstline;
1.1.1.2 misho 6357: BOOL utf;
6358: BOOL has_first_char = FALSE;
6359: BOOL has_req_char = FALSE;
6360: pcre_uchar first_char = 0;
6361: pcre_uchar first_char2 = 0;
6362: pcre_uchar req_char = 0;
6363: pcre_uchar req_char2 = 0;
1.1 misho 6364: match_data match_block;
6365: match_data *md = &match_block;
1.1.1.2 misho 6366: const pcre_uint8 *tables;
6367: const pcre_uint8 *start_bits = NULL;
6368: PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6369: PCRE_PUCHAR end_subject;
6370: PCRE_PUCHAR start_partial = NULL;
6371: PCRE_PUCHAR req_char_ptr = start_match - 1;
1.1 misho 6372:
6373: const pcre_study_data *study;
1.1.1.2 misho 6374: const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
1.1 misho 6375:
1.1.1.3 ! misho 6376: #ifdef NO_RECURSE
! 6377: heapframe frame_zero;
! 6378: frame_zero.Xprevframe = NULL; /* Marks the top level */
! 6379: frame_zero.Xnextframe = NULL; /* None are allocated yet */
! 6380: md->match_frames_base = &frame_zero;
! 6381: #endif
! 6382:
1.1.1.2 misho 6383: /* Check for the special magic call that measures the size of the stack used
1.1.1.3 ! misho 6384: per recursive call of match(). Without the funny casting for sizeof, a Windows
! 6385: compiler gave this error: "unary minus operator applied to unsigned type,
! 6386: result still unsigned". Hopefully the cast fixes that. */
1.1.1.2 misho 6387:
6388: if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6389: start_offset == -999)
6390: #ifdef NO_RECURSE
1.1.1.3 ! misho 6391: return -((int)sizeof(heapframe));
1.1.1.2 misho 6392: #else
6393: return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6394: #endif
1.1 misho 6395:
6396: /* Plausibility checks */
6397:
6398: if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
1.1.1.2 misho 6399: if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6400: return PCRE_ERROR_NULL;
1.1 misho 6401: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6402: if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6403:
1.1.1.2 misho 6404: /* Check that the first field in the block is the magic number. If it is not,
6405: return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6406: REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6407: means that the pattern is likely compiled with different endianness. */
6408:
6409: if (re->magic_number != MAGIC_NUMBER)
6410: return re->magic_number == REVERSED_MAGIC_NUMBER?
6411: PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6412: if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6413:
1.1 misho 6414: /* These two settings are used in the code for checking a UTF-8 string that
6415: follows immediately afterwards. Other values in the md block are used only
6416: during "normal" pcre_exec() processing, not when the JIT support is in use,
6417: so they are set up later. */
6418:
1.1.1.2 misho 6419: /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6420: utf = md->utf = (re->options & PCRE_UTF8) != 0;
1.1 misho 6421: md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6422: ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6423:
6424: /* Check a UTF-8 string if required. Pass back the character offset and error
6425: code for an invalid string if a results vector is available. */
6426:
1.1.1.2 misho 6427: #ifdef SUPPORT_UTF
6428: if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
1.1 misho 6429: {
6430: int erroroffset;
1.1.1.2 misho 6431: int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
1.1 misho 6432: if (errorcode != 0)
6433: {
6434: if (offsetcount >= 2)
6435: {
6436: offsets[0] = erroroffset;
6437: offsets[1] = errorcode;
6438: }
1.1.1.2 misho 6439: #ifdef COMPILE_PCRE16
6440: return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6441: PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6442: #else
1.1 misho 6443: return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6444: PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
1.1.1.2 misho 6445: #endif
1.1 misho 6446: }
6447:
1.1.1.2 misho 6448: /* Check that a start_offset points to the start of a UTF character. */
1.1 misho 6449: if (start_offset > 0 && start_offset < length &&
1.1.1.2 misho 6450: NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
1.1 misho 6451: return PCRE_ERROR_BADUTF8_OFFSET;
6452: }
6453: #endif
6454:
6455: /* If the pattern was successfully studied with JIT support, run the JIT
6456: executable instead of the rest of this function. Most options must be set at
6457: compile time for the JIT code to be usable. Fall back to the normal code path if
1.1.1.3 ! misho 6458: an unsupported flag is set. */
1.1 misho 6459:
6460: #ifdef SUPPORT_JIT
6461: if (extra_data != NULL
1.1.1.3 ! misho 6462: && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
! 6463: PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
1.1 misho 6464: && extra_data->executable_jit != NULL
6465: && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
1.1.1.3 ! misho 6466: PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART |
! 6467: PCRE_PARTIAL_SOFT | PCRE_PARTIAL_HARD)) == 0)
! 6468: {
! 6469: rc = PRIV(jit_exec)(re, extra_data, (const pcre_uchar *)subject, length,
! 6470: start_offset, options, offsets, offsetcount);
! 6471:
! 6472: /* PCRE_ERROR_NULL means that the selected normal or partial matching
! 6473: mode is not compiled. In this case we simply fall back to the interpreter. */
! 6474:
! 6475: if (rc != PCRE_ERROR_NULL) return rc;
! 6476: }
1.1 misho 6477: #endif
6478:
6479: /* Carry on with non-JIT matching. This information is for finding all the
6480: numbers associated with a given name, for condition testing. */
6481:
1.1.1.2 misho 6482: md->name_table = (pcre_uchar *)re + re->name_table_offset;
1.1 misho 6483: md->name_count = re->name_count;
6484: md->name_entry_size = re->name_entry_size;
6485:
6486: /* Fish out the optional data from the extra_data structure, first setting
6487: the default values. */
6488:
6489: study = NULL;
6490: md->match_limit = MATCH_LIMIT;
6491: md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6492: md->callout_data = NULL;
6493:
6494: /* The table pointer is always in native byte order. */
6495:
1.1.1.2 misho 6496: tables = re->tables;
1.1 misho 6497:
6498: if (extra_data != NULL)
6499: {
6500: register unsigned int flags = extra_data->flags;
6501: if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6502: study = (const pcre_study_data *)extra_data->study_data;
6503: if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6504: md->match_limit = extra_data->match_limit;
6505: if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6506: md->match_limit_recursion = extra_data->match_limit_recursion;
6507: if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6508: md->callout_data = extra_data->callout_data;
6509: if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6510: }
6511:
6512: /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6513: is a feature that makes it possible to save compiled regex and re-use them
6514: in other programs later. */
6515:
1.1.1.2 misho 6516: if (tables == NULL) tables = PRIV(default_tables);
1.1 misho 6517:
6518: /* Set up other data */
6519:
6520: anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6521: startline = (re->flags & PCRE_STARTLINE) != 0;
6522: firstline = (re->options & PCRE_FIRSTLINE) != 0;
6523:
6524: /* The code starts after the real_pcre block and the capture name table. */
6525:
1.1.1.2 misho 6526: md->start_code = (const pcre_uchar *)re + re->name_table_offset +
1.1 misho 6527: re->name_count * re->name_entry_size;
6528:
1.1.1.2 misho 6529: md->start_subject = (PCRE_PUCHAR)subject;
1.1 misho 6530: md->start_offset = start_offset;
6531: md->end_subject = md->start_subject + length;
6532: end_subject = md->end_subject;
6533:
6534: md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6535: md->use_ucp = (re->options & PCRE_UCP) != 0;
6536: md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6537: md->ignore_skip_arg = FALSE;
6538:
6539: /* Some options are unpacked into BOOL variables in the hope that testing
6540: them will be faster than individual option bits. */
6541:
6542: md->notbol = (options & PCRE_NOTBOL) != 0;
6543: md->noteol = (options & PCRE_NOTEOL) != 0;
6544: md->notempty = (options & PCRE_NOTEMPTY) != 0;
6545: md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6546:
6547: md->hitend = FALSE;
6548: md->mark = md->nomatch_mark = NULL; /* In case never set */
6549:
6550: md->recursive = NULL; /* No recursion at top level */
6551: md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6552:
6553: md->lcc = tables + lcc_offset;
1.1.1.2 misho 6554: md->fcc = tables + fcc_offset;
1.1 misho 6555: md->ctypes = tables + ctypes_offset;
6556:
6557: /* Handle different \R options. */
6558:
6559: switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6560: {
6561: case 0:
6562: if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6563: md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6564: else
6565: #ifdef BSR_ANYCRLF
6566: md->bsr_anycrlf = TRUE;
6567: #else
6568: md->bsr_anycrlf = FALSE;
6569: #endif
6570: break;
6571:
6572: case PCRE_BSR_ANYCRLF:
6573: md->bsr_anycrlf = TRUE;
6574: break;
6575:
6576: case PCRE_BSR_UNICODE:
6577: md->bsr_anycrlf = FALSE;
6578: break;
6579:
6580: default: return PCRE_ERROR_BADNEWLINE;
6581: }
6582:
6583: /* Handle different types of newline. The three bits give eight cases. If
6584: nothing is set at run time, whatever was used at compile time applies. */
6585:
6586: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6587: (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6588: {
6589: case 0: newline = NEWLINE; break; /* Compile-time default */
6590: case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6591: case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6592: case PCRE_NEWLINE_CR+
6593: PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6594: case PCRE_NEWLINE_ANY: newline = -1; break;
6595: case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6596: default: return PCRE_ERROR_BADNEWLINE;
6597: }
6598:
6599: if (newline == -2)
6600: {
6601: md->nltype = NLTYPE_ANYCRLF;
6602: }
6603: else if (newline < 0)
6604: {
6605: md->nltype = NLTYPE_ANY;
6606: }
6607: else
6608: {
6609: md->nltype = NLTYPE_FIXED;
6610: if (newline > 255)
6611: {
6612: md->nllen = 2;
6613: md->nl[0] = (newline >> 8) & 255;
6614: md->nl[1] = newline & 255;
6615: }
6616: else
6617: {
6618: md->nllen = 1;
6619: md->nl[0] = newline;
6620: }
6621: }
6622:
6623: /* Partial matching was originally supported only for a restricted set of
6624: regexes; from release 8.00 there are no restrictions, but the bits are still
6625: defined (though never set). So there's no harm in leaving this code. */
6626:
6627: if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6628: return PCRE_ERROR_BADPARTIAL;
6629:
6630: /* If the expression has got more back references than the offsets supplied can
6631: hold, we get a temporary chunk of working store to use during the matching.
6632: Otherwise, we can use the vector supplied, rounding down its size to a multiple
6633: of 3. */
6634:
6635: ocount = offsetcount - (offsetcount % 3);
6636: arg_offset_max = (2*ocount)/3;
6637:
6638: if (re->top_backref > 0 && re->top_backref >= ocount/3)
6639: {
6640: ocount = re->top_backref * 3 + 3;
1.1.1.2 misho 6641: md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
1.1 misho 6642: if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6643: using_temporary_offsets = TRUE;
6644: DPRINTF(("Got memory to hold back references\n"));
6645: }
6646: else md->offset_vector = offsets;
6647:
6648: md->offset_end = ocount;
6649: md->offset_max = (2*ocount)/3;
6650: md->offset_overflow = FALSE;
6651: md->capture_last = -1;
6652:
6653: /* Reset the working variable associated with each extraction. These should
6654: never be used unless previously set, but they get saved and restored, and so we
6655: initialize them to avoid reading uninitialized locations. Also, unset the
6656: offsets for the matched string. This is really just for tidiness with callouts,
6657: in case they inspect these fields. */
6658:
6659: if (md->offset_vector != NULL)
6660: {
6661: register int *iptr = md->offset_vector + ocount;
6662: register int *iend = iptr - re->top_bracket;
6663: if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6664: while (--iptr >= iend) *iptr = -1;
6665: md->offset_vector[0] = md->offset_vector[1] = -1;
6666: }
6667:
1.1.1.2 misho 6668: /* Set up the first character to match, if available. The first_char value is
1.1 misho 6669: never set for an anchored regular expression, but the anchoring may be forced
6670: at run time, so we have to test for anchoring. The first char may be unset for
6671: an unanchored pattern, of course. If there's no first char and the pattern was
6672: studied, there may be a bitmap of possible first characters. */
6673:
6674: if (!anchored)
6675: {
6676: if ((re->flags & PCRE_FIRSTSET) != 0)
6677: {
1.1.1.2 misho 6678: has_first_char = TRUE;
6679: first_char = first_char2 = (pcre_uchar)(re->first_char);
6680: if ((re->flags & PCRE_FCH_CASELESS) != 0)
6681: {
6682: first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6683: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6684: if (utf && first_char > 127)
6685: first_char2 = UCD_OTHERCASE(first_char);
6686: #endif
6687: }
1.1 misho 6688: }
6689: else
6690: if (!startline && study != NULL &&
6691: (study->flags & PCRE_STUDY_MAPPED) != 0)
6692: start_bits = study->start_bits;
6693: }
6694:
6695: /* For anchored or unanchored matches, there may be a "last known required
6696: character" set. */
6697:
6698: if ((re->flags & PCRE_REQCHSET) != 0)
6699: {
1.1.1.2 misho 6700: has_req_char = TRUE;
6701: req_char = req_char2 = (pcre_uchar)(re->req_char);
6702: if ((re->flags & PCRE_RCH_CASELESS) != 0)
6703: {
6704: req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6705: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6706: if (utf && req_char > 127)
6707: req_char2 = UCD_OTHERCASE(req_char);
6708: #endif
6709: }
1.1 misho 6710: }
6711:
6712:
6713: /* ==========================================================================*/
6714:
6715: /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6716: the loop runs just once. */
6717:
6718: for(;;)
6719: {
1.1.1.2 misho 6720: PCRE_PUCHAR save_end_subject = end_subject;
6721: PCRE_PUCHAR new_start_match;
1.1 misho 6722:
6723: /* If firstline is TRUE, the start of the match is constrained to the first
6724: line of a multiline string. That is, the match must be before or at the first
6725: newline. Implement this by temporarily adjusting end_subject so that we stop
6726: scanning at a newline. If the match fails at the newline, later code breaks
6727: this loop. */
6728:
6729: if (firstline)
6730: {
1.1.1.2 misho 6731: PCRE_PUCHAR t = start_match;
6732: #ifdef SUPPORT_UTF
6733: if (utf)
1.1 misho 6734: {
6735: while (t < md->end_subject && !IS_NEWLINE(t))
6736: {
6737: t++;
1.1.1.2 misho 6738: ACROSSCHAR(t < end_subject, *t, t++);
1.1 misho 6739: }
6740: }
6741: else
6742: #endif
6743: while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6744: end_subject = t;
6745: }
6746:
6747: /* There are some optimizations that avoid running the match if a known
6748: starting point is not found, or if a known later character is not present.
6749: However, there is an option that disables these, for testing and for ensuring
6750: that all callouts do actually occur. The option can be set in the regex by
6751: (*NO_START_OPT) or passed in match-time options. */
6752:
6753: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6754: {
1.1.1.2 misho 6755: /* Advance to a unique first char if there is one. */
1.1 misho 6756:
1.1.1.2 misho 6757: if (has_first_char)
1.1 misho 6758: {
1.1.1.2 misho 6759: if (first_char != first_char2)
6760: while (start_match < end_subject &&
6761: *start_match != first_char && *start_match != first_char2)
1.1 misho 6762: start_match++;
6763: else
1.1.1.2 misho 6764: while (start_match < end_subject && *start_match != first_char)
1.1 misho 6765: start_match++;
6766: }
6767:
6768: /* Or to just after a linebreak for a multiline match */
6769:
6770: else if (startline)
6771: {
6772: if (start_match > md->start_subject + start_offset)
6773: {
1.1.1.2 misho 6774: #ifdef SUPPORT_UTF
6775: if (utf)
1.1 misho 6776: {
6777: while (start_match < end_subject && !WAS_NEWLINE(start_match))
6778: {
6779: start_match++;
1.1.1.2 misho 6780: ACROSSCHAR(start_match < end_subject, *start_match,
6781: start_match++);
1.1 misho 6782: }
6783: }
6784: else
6785: #endif
6786: while (start_match < end_subject && !WAS_NEWLINE(start_match))
6787: start_match++;
6788:
6789: /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6790: and we are now at a LF, advance the match position by one more character.
6791: */
6792:
6793: if (start_match[-1] == CHAR_CR &&
6794: (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6795: start_match < end_subject &&
6796: *start_match == CHAR_NL)
6797: start_match++;
6798: }
6799: }
6800:
6801: /* Or to a non-unique first byte after study */
6802:
6803: else if (start_bits != NULL)
6804: {
6805: while (start_match < end_subject)
6806: {
6807: register unsigned int c = *start_match;
1.1.1.2 misho 6808: #ifndef COMPILE_PCRE8
6809: if (c > 255) c = 255;
6810: #endif
1.1 misho 6811: if ((start_bits[c/8] & (1 << (c&7))) == 0)
6812: {
6813: start_match++;
1.1.1.2 misho 6814: #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6815: /* In non 8-bit mode, the iteration will stop for
6816: characters > 255 at the beginning or not stop at all. */
6817: if (utf)
6818: ACROSSCHAR(start_match < end_subject, *start_match,
6819: start_match++);
1.1 misho 6820: #endif
6821: }
6822: else break;
6823: }
6824: }
6825: } /* Starting optimizations */
6826:
6827: /* Restore fudged end_subject */
6828:
6829: end_subject = save_end_subject;
6830:
6831: /* The following two optimizations are disabled for partial matching or if
6832: disabling is explicitly requested. */
6833:
6834: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6835: {
6836: /* If the pattern was studied, a minimum subject length may be set. This is
6837: a lower bound; no actual string of that length may actually match the
6838: pattern. Although the value is, strictly, in characters, we treat it as
6839: bytes to avoid spending too much time in this optimization. */
6840:
6841: if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6842: (pcre_uint32)(end_subject - start_match) < study->minlength)
6843: {
6844: rc = MATCH_NOMATCH;
6845: break;
6846: }
6847:
1.1.1.2 misho 6848: /* If req_char is set, we know that that character must appear in the
6849: subject for the match to succeed. If the first character is set, req_char
1.1 misho 6850: must be later in the subject; otherwise the test starts at the match point.
6851: This optimization can save a huge amount of backtracking in patterns with
6852: nested unlimited repeats that aren't going to match. Writing separate code
6853: for cased/caseless versions makes it go faster, as does using an
6854: autoincrement and backing off on a match.
6855:
6856: HOWEVER: when the subject string is very, very long, searching to its end
6857: can take a long time, and give bad performance on quite ordinary patterns.
6858: This showed up when somebody was matching something like /^\d+C/ on a
6859: 32-megabyte string... so we don't do this when the string is sufficiently
6860: long. */
6861:
1.1.1.2 misho 6862: if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
1.1 misho 6863: {
1.1.1.2 misho 6864: register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
1.1 misho 6865:
6866: /* We don't need to repeat the search if we haven't yet reached the
6867: place we found it at last time. */
6868:
1.1.1.2 misho 6869: if (p > req_char_ptr)
1.1 misho 6870: {
1.1.1.2 misho 6871: if (req_char != req_char2)
1.1 misho 6872: {
6873: while (p < end_subject)
6874: {
6875: register int pp = *p++;
1.1.1.2 misho 6876: if (pp == req_char || pp == req_char2) { p--; break; }
1.1 misho 6877: }
6878: }
6879: else
6880: {
6881: while (p < end_subject)
6882: {
1.1.1.2 misho 6883: if (*p++ == req_char) { p--; break; }
1.1 misho 6884: }
6885: }
6886:
6887: /* If we can't find the required character, break the matching loop,
6888: forcing a match failure. */
6889:
6890: if (p >= end_subject)
6891: {
6892: rc = MATCH_NOMATCH;
6893: break;
6894: }
6895:
6896: /* If we have found the required character, save the point where we
6897: found it, so that we don't search again next time round the loop if
6898: the start hasn't passed this character yet. */
6899:
1.1.1.2 misho 6900: req_char_ptr = p;
1.1 misho 6901: }
6902: }
6903: }
6904:
6905: #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6906: printf(">>>> Match against: ");
6907: pchars(start_match, end_subject - start_match, TRUE, md);
6908: printf("\n");
6909: #endif
6910:
6911: /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6912: first starting point for which a partial match was found. */
6913:
6914: md->start_match_ptr = start_match;
6915: md->start_used_ptr = start_match;
6916: md->match_call_count = 0;
6917: md->match_function_type = 0;
6918: md->end_offset_top = 0;
6919: rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
6920: if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6921:
6922: switch(rc)
6923: {
6924: /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6925: the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6926: entirely. The only way we can do that is to re-do the match at the same
6927: point, with a flag to force SKIP with an argument to be ignored. Just
6928: treating this case as NOMATCH does not work because it does not check other
6929: alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6930:
6931: case MATCH_SKIP_ARG:
6932: new_start_match = start_match;
6933: md->ignore_skip_arg = TRUE;
6934: break;
6935:
6936: /* SKIP passes back the next starting point explicitly, but if it is the
6937: same as the match we have just done, treat it as NOMATCH. */
6938:
6939: case MATCH_SKIP:
6940: if (md->start_match_ptr != start_match)
6941: {
6942: new_start_match = md->start_match_ptr;
6943: break;
6944: }
6945: /* Fall through */
6946:
6947: /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6948: exactly like PRUNE. Unset the ignore SKIP-with-argument flag. */
6949:
6950: case MATCH_NOMATCH:
6951: case MATCH_PRUNE:
6952: case MATCH_THEN:
6953: md->ignore_skip_arg = FALSE;
6954: new_start_match = start_match + 1;
1.1.1.2 misho 6955: #ifdef SUPPORT_UTF
6956: if (utf)
6957: ACROSSCHAR(new_start_match < end_subject, *new_start_match,
6958: new_start_match++);
1.1 misho 6959: #endif
6960: break;
6961:
6962: /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6963:
6964: case MATCH_COMMIT:
6965: rc = MATCH_NOMATCH;
6966: goto ENDLOOP;
6967:
6968: /* Any other return is either a match, or some kind of error. */
6969:
6970: default:
6971: goto ENDLOOP;
6972: }
6973:
6974: /* Control reaches here for the various types of "no match at this point"
6975: result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6976:
6977: rc = MATCH_NOMATCH;
6978:
6979: /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6980: newline in the subject (though it may continue over the newline). Therefore,
6981: if we have just failed to match, starting at a newline, do not continue. */
6982:
6983: if (firstline && IS_NEWLINE(start_match)) break;
6984:
6985: /* Advance to new matching position */
6986:
6987: start_match = new_start_match;
6988:
6989: /* Break the loop if the pattern is anchored or if we have passed the end of
6990: the subject. */
6991:
6992: if (anchored || start_match > end_subject) break;
6993:
6994: /* If we have just passed a CR and we are now at a LF, and the pattern does
6995: not contain any explicit matches for \r or \n, and the newline option is CRLF
1.1.1.2 misho 6996: or ANY or ANYCRLF, advance the match position by one more character. In
6997: normal matching start_match will always be greater than the first position at
6998: this stage, but a failed *SKIP can cause a return at the same point, which is
6999: why the first test exists. */
1.1 misho 7000:
1.1.1.2 misho 7001: if (start_match > (PCRE_PUCHAR)subject + start_offset &&
7002: start_match[-1] == CHAR_CR &&
1.1 misho 7003: start_match < end_subject &&
7004: *start_match == CHAR_NL &&
7005: (re->flags & PCRE_HASCRORLF) == 0 &&
7006: (md->nltype == NLTYPE_ANY ||
7007: md->nltype == NLTYPE_ANYCRLF ||
7008: md->nllen == 2))
7009: start_match++;
7010:
7011: md->mark = NULL; /* Reset for start of next match attempt */
7012: } /* End of for(;;) "bumpalong" loop */
7013:
7014: /* ==========================================================================*/
7015:
7016: /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
7017: conditions is true:
7018:
7019: (1) The pattern is anchored or the match was failed by (*COMMIT);
7020:
7021: (2) We are past the end of the subject;
7022:
7023: (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
7024: this option requests that a match occur at or before the first newline in
7025: the subject.
7026:
7027: When we have a match and the offset vector is big enough to deal with any
7028: backreferences, captured substring offsets will already be set up. In the case
7029: where we had to get some local store to hold offsets for backreference
7030: processing, copy those that we can. In this case there need not be overflow if
7031: certain parts of the pattern were not used, even though there are more
7032: capturing parentheses than vector slots. */
7033:
7034: ENDLOOP:
7035:
7036: if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
7037: {
7038: if (using_temporary_offsets)
7039: {
7040: if (arg_offset_max >= 4)
7041: {
7042: memcpy(offsets + 2, md->offset_vector + 2,
7043: (arg_offset_max - 2) * sizeof(int));
7044: DPRINTF(("Copied offsets from temporary memory\n"));
7045: }
7046: if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
7047: DPRINTF(("Freeing temporary memory\n"));
1.1.1.2 misho 7048: (PUBL(free))(md->offset_vector);
1.1 misho 7049: }
7050:
7051: /* Set the return code to the number of captured strings, or 0 if there were
7052: too many to fit into the vector. */
7053:
7054: rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
7055: 0 : md->end_offset_top/2;
7056:
7057: /* If there is space in the offset vector, set any unused pairs at the end of
7058: the pattern to -1 for backwards compatibility. It is documented that this
7059: happens. In earlier versions, the whole set of potential capturing offsets
7060: was set to -1 each time round the loop, but this is handled differently now.
7061: "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
7062: those at the end that need unsetting here. We can't just unset them all at
7063: the start of the whole thing because they may get set in one branch that is
7064: not the final matching branch. */
7065:
7066: if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
7067: {
7068: register int *iptr, *iend;
7069: int resetcount = 2 + re->top_bracket * 2;
1.1.1.3 ! misho 7070: if (resetcount > offsetcount) resetcount = offsetcount;
1.1 misho 7071: iptr = offsets + md->end_offset_top;
7072: iend = offsets + resetcount;
7073: while (iptr < iend) *iptr++ = -1;
7074: }
7075:
7076: /* If there is space, set up the whole thing as substring 0. The value of
7077: md->start_match_ptr might be modified if \K was encountered on the successful
7078: matching path. */
7079:
7080: if (offsetcount < 2) rc = 0; else
7081: {
7082: offsets[0] = (int)(md->start_match_ptr - md->start_subject);
7083: offsets[1] = (int)(md->end_match_ptr - md->start_subject);
7084: }
7085:
7086: /* Return MARK data if requested */
7087:
7088: if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
1.1.1.2 misho 7089: *(extra_data->mark) = (pcre_uchar *)md->mark;
1.1 misho 7090: DPRINTF((">>>> returning %d\n", rc));
1.1.1.3 ! misho 7091: #ifdef NO_RECURSE
! 7092: release_match_heapframes(&frame_zero);
! 7093: #endif
1.1 misho 7094: return rc;
7095: }
7096:
7097: /* Control gets here if there has been an error, or if the overall match
7098: attempt has failed at all permitted starting positions. */
7099:
7100: if (using_temporary_offsets)
7101: {
7102: DPRINTF(("Freeing temporary memory\n"));
1.1.1.2 misho 7103: (PUBL(free))(md->offset_vector);
1.1 misho 7104: }
7105:
7106: /* For anything other than nomatch or partial match, just return the code. */
7107:
7108: if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
7109: {
7110: DPRINTF((">>>> error: returning %d\n", rc));
1.1.1.3 ! misho 7111: #ifdef NO_RECURSE
! 7112: release_match_heapframes(&frame_zero);
! 7113: #endif
1.1 misho 7114: return rc;
7115: }
7116:
7117: /* Handle partial matches - disable any mark data */
7118:
7119: if (start_partial != NULL)
7120: {
7121: DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
7122: md->mark = NULL;
7123: if (offsetcount > 1)
7124: {
1.1.1.2 misho 7125: offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject);
7126: offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
1.1 misho 7127: }
7128: rc = PCRE_ERROR_PARTIAL;
7129: }
7130:
7131: /* This is the classic nomatch case */
7132:
7133: else
7134: {
7135: DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
7136: rc = PCRE_ERROR_NOMATCH;
7137: }
7138:
7139: /* Return the MARK data if it has been requested. */
7140:
7141: if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
1.1.1.2 misho 7142: *(extra_data->mark) = (pcre_uchar *)md->nomatch_mark;
1.1.1.3 ! misho 7143: #ifdef NO_RECURSE
! 7144: release_match_heapframes(&frame_zero);
! 7145: #endif
1.1 misho 7146: return rc;
7147: }
7148:
7149: /* End of pcre_exec.c */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>