Annotation of embedaddon/pcre/pcre_exec.c, revision 1.1.1.5
1.1 misho 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /* PCRE is a library of functions to support regular expressions whose syntax
6: and semantics are as close as possible to those of the Perl 5 language.
7:
8: Written by Philip Hazel
1.1.1.4 misho 9: Copyright (c) 1997-2013 University of Cambridge
1.1 misho 10:
11: -----------------------------------------------------------------------------
12: Redistribution and use in source and binary forms, with or without
13: modification, are permitted provided that the following conditions are met:
14:
15: * Redistributions of source code must retain the above copyright notice,
16: this list of conditions and the following disclaimer.
17:
18: * Redistributions in binary form must reproduce the above copyright
19: notice, this list of conditions and the following disclaimer in the
20: documentation and/or other materials provided with the distribution.
21:
22: * Neither the name of the University of Cambridge nor the names of its
23: contributors may be used to endorse or promote products derived from
24: this software without specific prior written permission.
25:
26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36: POSSIBILITY OF SUCH DAMAGE.
37: -----------------------------------------------------------------------------
38: */
39:
40: /* This module contains pcre_exec(), the externally visible function that does
41: pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42: possible. There are also some static supporting functions. */
43:
44: #ifdef HAVE_CONFIG_H
45: #include "config.h"
46: #endif
47:
48: #define NLBLOCK md /* Block containing newline information */
49: #define PSSTART start_subject /* Field containing processed string start */
50: #define PSEND end_subject /* Field containing processed string end */
51:
52: #include "pcre_internal.h"
53:
54: /* Undefine some potentially clashing cpp symbols */
55:
56: #undef min
57: #undef max
58:
1.1.1.4 misho 59: /* The md->capture_last field uses the lower 16 bits for the last captured
60: substring (which can never be greater than 65535) and a bit in the top half
61: to mean "capture vector overflowed". This odd way of doing things was
62: implemented when it was realized that preserving and restoring the overflow bit
63: whenever the last capture number was saved/restored made for a neater
64: interface, and doing it this way saved on (a) another variable, which would
65: have increased the stack frame size (a big NO-NO in PCRE) and (b) another
66: separate set of save/restore instructions. The following defines are used in
67: implementing this. */
68:
69: #define CAPLMASK 0x0000ffff /* The bits used for last_capture */
70: #define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */
71: #define OVFLBIT 0x00010000 /* The bit that is set for overflow */
72:
1.1 misho 73: /* Values for setting in md->match_function_type to indicate two special types
74: of call to match(). We do it this way to save on using another stack variable,
75: as stack usage is to be discouraged. */
76:
77: #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
78: #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
79:
80: /* Non-error returns from the match() function. Error returns are externally
81: defined PCRE_ERROR_xxx codes, which are all negative. */
82:
83: #define MATCH_MATCH 1 /* Matched; one of the two non-negative returns */
84: #define MATCH_NOMATCH 0 /* Failed to match; the other non-negative return */
85:
86: /* Special internal returns from the match() function. Make them sufficiently
87: negative to avoid the external error codes. */
88:
89: #define MATCH_ACCEPT (-999)
1.1.1.4 misho 90: #define MATCH_KETRPOS (-998)
91: #define MATCH_ONCE (-997)
92: /* The next 5 must be kept together and in sequence so that a test that checks
93: for any one of them can use a range. */
94: #define MATCH_COMMIT (-996) /* Given by OP_COMMIT when what follows it returns MATCH_NOMATCH */
1.1 misho 95: #define MATCH_PRUNE (-995) /* Given by OP_PRUNE and OP_PRUNE_ARG in the same situation */
96: #define MATCH_SKIP (-994) /* Given by OP_SKIP; md->start_match_ptr holds the subject position */
97: #define MATCH_SKIP_ARG (-993) /* Given by OP_SKIP_ARG; md->start_match_ptr holds the skip name */
98: #define MATCH_THEN (-992) /* Given by OP_THEN and OP_THEN_ARG; md->start_match_ptr holds the opcode address */
1.1.1.4 misho 99: #define MATCH_BACKTRACK_MAX MATCH_THEN /* Upper end of the range of the 5 values above */
100: #define MATCH_BACKTRACK_MIN MATCH_COMMIT /* Lower end of the range of the 5 values above */
1.1 misho 101:
102: /* Maximum number of ints of offset to save on the stack for recursive calls.
103: If the offset vector is bigger, malloc is used. This should be a multiple of 3,
104: because the offset vector is always a multiple of 3 long. */
105:
106: #define REC_STACK_SAVE_MAX 30 /* Dimension of stacksave[] in match() and of Xstacksave[] in heapframe */
107:
108: /* Min and max values for the common repeats; for the maxima, 0 => infinity. NOTE(review): the two tables are parallel (11 entries each); how they are indexed is not visible in this part of the file - presumably by repeat opcode, confirm at the use site. */
109:
1.1.1.5 ! misho 110: static const char rep_min[] = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, };
! 111: static const char rep_max[] = { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, };
1.1 misho 112:
113: #ifdef PCRE_DEBUG
114: /*************************************************
115: * Debugging function to print chars *
116: *************************************************/
117:
118: /* Print a sequence of chars in printable format, stopping at the end of the
119: subject if the requested.
120:
121: Arguments:
122: p points to characters
123: length number to print
124: is_subject TRUE if printing from within md->start_subject
125: md pointer to matching data block, if is_subject is TRUE
126:
127: Returns: nothing
128: */
129:
130: static void
1.1.1.2 misho 131: pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
1.1 misho 132: {
1.1.1.4 misho 133: pcre_uint32 c;
134: BOOL utf = md->utf;
1.1 misho 135: if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
136: while (length-- > 0)
1.1.1.4 misho 137: if (isprint(c = RAWUCHARINCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
1.1 misho 138: }
139: #endif
140:
141:
142:
143: /*************************************************
144: * Match a back-reference *
145: *************************************************/
146:
147: /* Normally, if a back reference hasn't been set, the length that is passed is
148: negative, so the match always fails. However, in JavaScript compatibility mode,
149: the length passed is zero. Note that in caseless UTF-8 mode, the number of
150: subject bytes matched may be different to the number of reference bytes.
151:
152: Arguments:
153: offset index into the offset vector
154: eptr pointer into the subject
155: length length of reference to be matched (number of bytes)
156: md points to match data block
157: caseless TRUE if caseless
158:
1.1.1.3 misho 159: Returns: >= 0 the number of subject bytes matched
160: -1 no match
161: -2 partial match; always given if at end subject
1.1 misho 162: */
163:
164: static int
1.1.1.2 misho 165: match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
1.1 misho 166: BOOL caseless)
167: {
1.1.1.2 misho 168: PCRE_PUCHAR eptr_start = eptr;
169: register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
1.1.1.5 ! misho 170: #if defined SUPPORT_UTF && defined SUPPORT_UCP
1.1.1.4 misho 171: BOOL utf = md->utf;
172: #endif
1.1 misho 173:
174: #ifdef PCRE_DEBUG
175: if (eptr >= md->end_subject)
176: printf("matching subject <null>");
177: else
178: {
179: printf("matching subject ");
180: pchars(eptr, length, TRUE, md);
181: }
182: printf(" against backref ");
183: pchars(p, length, FALSE, md);
184: printf("\n");
185: #endif
186:
1.1.1.3 misho 187: /* Always fail if reference not set (and not JavaScript compatible - in that
188: case the length is passed as zero). */
1.1 misho 189:
190: if (length < 0) return -1;
191:
192: /* Separate the caseless case for speed. In UTF-8 mode we can only do this
193: properly if Unicode properties are supported. Otherwise, we can check only
194: ASCII characters. */
195:
196: if (caseless)
197: {
1.1.1.5 ! misho 198: #if defined SUPPORT_UTF && defined SUPPORT_UCP
1.1.1.4 misho 199: if (utf)
1.1 misho 200: {
201: /* Match characters up to the end of the reference. NOTE: the number of
1.1.1.4 misho 202: data units matched may differ, because in UTF-8 there are some characters
203: whose upper and lower case versions code have different numbers of bytes.
204: For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
205: (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
206: sequence of two of the latter. It is important, therefore, to check the
207: length along the reference, not along the subject (earlier code did this
208: wrong). */
1.1 misho 209:
1.1.1.2 misho 210: PCRE_PUCHAR endptr = p + length;
1.1 misho 211: while (p < endptr)
212: {
1.1.1.4 misho 213: pcre_uint32 c, d;
214: const ucd_record *ur;
1.1.1.3 misho 215: if (eptr >= md->end_subject) return -2; /* Partial match */
1.1 misho 216: GETCHARINC(c, eptr);
217: GETCHARINC(d, p);
1.1.1.4 misho 218: ur = GET_UCD(d);
219: if (c != d && c != d + ur->other_case)
220: {
221: const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
222: for (;;)
223: {
224: if (c < *pp) return -1;
225: if (c == *pp++) break;
226: }
227: }
1.1 misho 228: }
229: }
230: else
231: #endif
232:
233: /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
234: is no UCP support. */
235: {
236: while (length-- > 0)
1.1.1.2 misho 237: {
1.1.1.4 misho 238: pcre_uint32 cc, cp;
1.1.1.3 misho 239: if (eptr >= md->end_subject) return -2; /* Partial match */
1.1.1.4 misho 240: cc = RAWUCHARTEST(eptr);
241: cp = RAWUCHARTEST(p);
242: if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
1.1.1.2 misho 243: p++;
244: eptr++;
245: }
1.1 misho 246: }
247: }
248:
249: /* In the caseful case, we can just compare the bytes, whether or not we
250: are in UTF-8 mode. */
251:
252: else
253: {
1.1.1.3 misho 254: while (length-- > 0)
255: {
256: if (eptr >= md->end_subject) return -2; /* Partial match */
1.1.1.4 misho 257: if (RAWUCHARINCTEST(p) != RAWUCHARINCTEST(eptr)) return -1;
1.1.1.3 misho 258: }
1.1 misho 259: }
260:
261: return (int)(eptr - eptr_start);
262: }
263:
264:
265:
266: /***************************************************************************
267: ****************************************************************************
268: RECURSION IN THE match() FUNCTION
269:
270: The match() function is highly recursive, though not every recursive call
271: increases the recursive depth. Nevertheless, some regular expressions can cause
272: it to recurse to a great depth. I was writing for Unix, so I just let it call
273: itself recursively. This uses the stack for saving everything that has to be
274: saved for a recursive call. On Unix, the stack can be large, and this works
275: fine.
276:
277: It turns out that on some non-Unix-like systems there are problems with
278: programs that use a lot of stack. (This despite the fact that every last chip
279: has oodles of memory these days, and techniques for extending the stack have
280: been known for decades.) So....
281:
282: There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
283: calls by keeping local variables that need to be preserved in blocks of memory
284: obtained from malloc() instead of on the stack. Macros are used to
285: achieve this so that the actual code doesn't look very different to what it
286: always used to.
287:
288: The original heap-recursive code used longjmp(). However, it seems that this
289: can be very slow on some operating systems. Following a suggestion from Stan
290: Switzer, the use of longjmp() has been abolished, at the cost of having to
291: provide a unique number for each call to RMATCH. There is no way of generating
292: a sequence of numbers at compile time in C. I have given them names, to make
293: them stand out more clearly.
294:
295: Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
296: FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
297: tests. Furthermore, not using longjmp() means that local dynamic variables
298: don't have indeterminate values; this has meant that the frame size can be
299: reduced because the result can be "passed back" by straight setting of the
300: variable instead of being passed in the frame.
301: ****************************************************************************
302: ***************************************************************************/
303:
304: /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
305: below must be updated in sync. */
306:
307: enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
308: RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
309: RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
310: RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
311: RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
312: RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
1.1.1.5 ! misho 313: RM61, RM62, RM63, RM64, RM65, RM66, RM67 }; /* Numbering starts at 1. The heap version of RMATCH stores the value in frame->Xwhere and pastes the name into a label L_<name>; the stack version ignores it. */
1.1 misho 314:
315: /* These versions of the macros use the stack, as normal. There are debugging
316: versions and production versions. Note that the "rw" argument of RMATCH isn't
317: actually used in this definition. */
318:
319: #ifndef NO_RECURSE
320: #define REGISTER register /* Applied to the eptr and ecode parameters of match() */
321:
322: #ifdef PCRE_DEBUG /* Debugging versions: report every call and return with printf */
323: #define RMATCH(ra,rb,rc,rd,re,rw) \
324: { \
325: printf("match() called in line %d\n", __LINE__); \
326: rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); /* mstart and rdepth are taken from the caller's scope, not from the argument list */ \
327: printf("to line %d\n", __LINE__); \
328: }
329: #define RRETURN(ra) /* NB: this debugging form expands ra twice */ \
330: { \
1.1.1.4 misho 331: printf("match() returned %d from line %d\n", ra, __LINE__); \
1.1 misho 332: return ra; \
333: }
334: #else /* Production versions: a plain recursive call and a plain return */
335: #define RMATCH(ra,rb,rc,rd,re,rw) \
336: rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
337: #define RRETURN(ra) return ra
338: #endif /* PCRE_DEBUG */
339:
340: #else
341:
342:
343: /* These versions of the macros manage a private stack on the heap. Note that
344: the "rd" argument of RMATCH isn't actually used in this definition. It's the md
345: argument of match(), which never changes. */
346:
347: #define REGISTER /* Empty here: match() later redefines eptr and ecode as frame-> members */
348:
349: #define RMATCH(ra,rb,rc,rd,re,rw)\
350: {\
1.1.1.3 misho 351: heapframe *newframe = frame->Xnextframe; /* Reuse the successor frame if one was allocated earlier */\
352: if (newframe == NULL)\
353: {\
354: newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
355: if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
356: newframe->Xnextframe = NULL;\
357: frame->Xnextframe = newframe; /* Chain it on so that it can be reused */\
358: }\
359: frame->Xwhere = rw; /* Remember which RMATCH this is, for the jump back */\
1.1 misho 360: newframe->Xeptr = ra;\
361: newframe->Xecode = rb;\
362: newframe->Xmstart = mstart;\
363: newframe->Xoffset_top = rc;\
364: newframe->Xeptrb = re;\
365: newframe->Xrdepth = frame->Xrdepth + 1;\
366: newframe->Xprevframe = frame;\
367: frame = newframe; /* The new frame becomes the current one */\
368: DPRINTF(("restarting from line %d\n", __LINE__));\
369: goto HEAP_RECURSE;\
370: L_##rw: /* Label built from the rw argument; execution continues here after the "call" */\
371: DPRINTF(("jumped back to line %d\n", __LINE__));\
372: }
373:
374: #define RRETURN(ra)\
375: {\
376: heapframe *oldframe = frame;\
377: frame = oldframe->Xprevframe; /* Step back to the calling frame */\
378: if (frame != NULL)\
379: {\
380: rrc = ra; /* The result is handed back through rrc */\
381: goto HEAP_RETURN;\
382: }\
383: return ra; /* No previous frame: this is a real return from match() */\
384: }
385:
386:
387: /* Structure for remembering the local variables in a private frame */
388:
389: typedef struct heapframe {
390: struct heapframe *Xprevframe;
1.1.1.3 misho 391: struct heapframe *Xnextframe;
1.1 misho 392:
393: /* Function arguments that may change */
394:
1.1.1.2 misho 395: PCRE_PUCHAR Xeptr;
396: const pcre_uchar *Xecode;
397: PCRE_PUCHAR Xmstart;
1.1 misho 398: int Xoffset_top;
399: eptrblock *Xeptrb;
400: unsigned int Xrdepth;
401:
402: /* Function local variables */
403:
1.1.1.2 misho 404: PCRE_PUCHAR Xcallpat;
405: #ifdef SUPPORT_UTF
406: PCRE_PUCHAR Xcharptr;
407: #endif
408: PCRE_PUCHAR Xdata;
409: PCRE_PUCHAR Xnext;
410: PCRE_PUCHAR Xpp;
411: PCRE_PUCHAR Xprev;
412: PCRE_PUCHAR Xsaved_eptr;
1.1 misho 413:
414: recursion_info Xnew_recursive;
415:
416: BOOL Xcur_is_word;
417: BOOL Xcondition;
418: BOOL Xprev_is_word;
419:
420: #ifdef SUPPORT_UCP
421: int Xprop_type;
1.1.1.4 misho 422: unsigned int Xprop_value;
1.1 misho 423: int Xprop_fail_result;
424: int Xoclength;
1.1.1.2 misho 425: pcre_uchar Xocchars[6];
1.1 misho 426: #endif
427:
428: int Xcodelink;
429: int Xctype;
430: unsigned int Xfc;
431: int Xfi;
432: int Xlength;
433: int Xmax;
434: int Xmin;
1.1.1.4 misho 435: unsigned int Xnumber;
1.1 misho 436: int Xoffset;
1.1.1.4 misho 437: unsigned int Xop;
438: pcre_int32 Xsave_capture_last;
1.1 misho 439: int Xsave_offset1, Xsave_offset2, Xsave_offset3;
440: int Xstacksave[REC_STACK_SAVE_MAX];
441:
442: eptrblock Xnewptrb;
443:
444: /* Where to jump back to */
445:
446: int Xwhere;
447:
448: } heapframe;
449:
450: #endif
451:
452:
453: /***************************************************************************
454: ***************************************************************************/
455:
456:
457:
458: /*************************************************
459: * Match from current position *
460: *************************************************/
461:
462: /* This function is called recursively in many circumstances. Whenever it
463: returns a negative (error) response, the outer incarnation must also return the
464: same response. */
465:
466: /* These macros pack up tests that are used for partial matching, and which
467: appear several times in the code. We set the "hit end" flag if the pointer is
468: at the end of the subject and also past the start of the subject (i.e.
469: something has been matched). For hard partial matching, we then return
470: immediately. The second one is used when we already know we are past the end of
471: the subject. */
472:
473: #define CHECK_PARTIAL() /* Expands to a bare "if" statement with no wrapper, so never follow a use with "else" */\
474: if (md->partial != 0 && eptr >= md->end_subject && \
475: eptr > md->start_used_ptr) \
476: { \
477: md->hitend = TRUE; \
478: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); /* Hard partial matching: return at once */ \
479: }
480:
481: #define SCHECK_PARTIAL() /* As CHECK_PARTIAL, but without the end-of-subject test; also a bare "if" */\
482: if (md->partial != 0 && eptr > md->start_used_ptr) \
483: { \
484: md->hitend = TRUE; \
485: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); /* Hard partial matching: return at once */ \
486: }
487:
488:
489: /* Performance note: It might be tempting to extract commonly used fields from
1.1.1.2 misho 490: the md structure (e.g. utf, end_subject) into individual variables to improve
1.1 misho 491: performance. Tests using gcc on a SPARC disproved this; in the first case, it
492: made performance worse.
493:
494: Arguments:
495: eptr pointer to current character in subject
496: ecode pointer to current position in compiled code
497: mstart pointer to the current match start position (can be modified
498: by encountering \K)
499: offset_top current top pointer
500: md pointer to "static" info for the match
501: eptrb pointer to chain of blocks containing eptr at start of
502: brackets - for testing for empty matches
503: rdepth the recursion depth
504:
505: Returns: MATCH_MATCH if matched ) these values are >= 0
506: MATCH_NOMATCH if failed to match )
507: a negative MATCH_xxx value for PRUNE, SKIP, etc
508: a negative PCRE_ERROR_xxx value if aborted by an error condition
509: (e.g. stopped by repeated call or recursion limit)
510: */
511:
512: static int
1.1.1.2 misho 513: match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
514: PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
515: unsigned int rdepth)
1.1 misho 516: {
517: /* These variables do not need to be preserved over recursion in this function,
518: so they can be ordinary variables in all cases. Mark some of them with
519: "register" because they are used a lot in loops. */
520:
521: register int rrc; /* Returns from recursive calls */
522: register int i; /* Used for loops not involving calls to RMATCH() */
1.1.1.4 misho 523: register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
1.1.1.2 misho 524: register BOOL utf; /* Local copy of UTF flag for speed */
1.1 misho 525:
526: BOOL minimize, possessive; /* Quantifier options */
527: BOOL caseless;
528: int condcode;
529:
530: /* When recursion is not being used, all "local" variables that have to be
1.1.1.2 misho 531: preserved over calls to RMATCH() are part of a "frame". We set up the top-level
532: frame on the stack here; subsequent instantiations are obtained from the heap
533: whenever RMATCH() does a "recursion". See the macro definitions above. Putting
534: the top-level on the stack rather than malloc-ing them all gives a performance
535: boost in many cases where there is not much "recursion". */
1.1 misho 536:
537: #ifdef NO_RECURSE
1.1.1.3 misho 538: heapframe *frame = (heapframe *)md->match_frames_base;
1.1 misho 539:
540: /* Copy in the original argument variables */
541:
542: frame->Xeptr = eptr;
543: frame->Xecode = ecode;
544: frame->Xmstart = mstart;
545: frame->Xoffset_top = offset_top;
546: frame->Xeptrb = eptrb;
547: frame->Xrdepth = rdepth;
548:
549: /* This is where control jumps back to in order to effect "recursion" */
550:
551: HEAP_RECURSE:
552:
553: /* Macros make the argument variables come from the current frame */
554:
555: #define eptr frame->Xeptr
556: #define ecode frame->Xecode
557: #define mstart frame->Xmstart
558: #define offset_top frame->Xoffset_top
559: #define eptrb frame->Xeptrb
560: #define rdepth frame->Xrdepth
561:
562: /* Ditto for the local variables */
563:
1.1.1.2 misho 564: #ifdef SUPPORT_UTF
1.1 misho 565: #define charptr frame->Xcharptr
566: #endif
567: #define callpat frame->Xcallpat
568: #define codelink frame->Xcodelink
569: #define data frame->Xdata
570: #define next frame->Xnext
571: #define pp frame->Xpp
572: #define prev frame->Xprev
573: #define saved_eptr frame->Xsaved_eptr
574:
575: #define new_recursive frame->Xnew_recursive
576:
577: #define cur_is_word frame->Xcur_is_word
578: #define condition frame->Xcondition
579: #define prev_is_word frame->Xprev_is_word
580:
581: #ifdef SUPPORT_UCP
582: #define prop_type frame->Xprop_type
583: #define prop_value frame->Xprop_value
584: #define prop_fail_result frame->Xprop_fail_result
585: #define oclength frame->Xoclength
586: #define occhars frame->Xocchars
587: #endif
588:
589: #define ctype frame->Xctype
590: #define fc frame->Xfc
591: #define fi frame->Xfi
592: #define length frame->Xlength
593: #define max frame->Xmax
594: #define min frame->Xmin
595: #define number frame->Xnumber
596: #define offset frame->Xoffset
597: #define op frame->Xop
598: #define save_capture_last frame->Xsave_capture_last
599: #define save_offset1 frame->Xsave_offset1
600: #define save_offset2 frame->Xsave_offset2
601: #define save_offset3 frame->Xsave_offset3
602: #define stacksave frame->Xstacksave
603:
604: #define newptrb frame->Xnewptrb
605:
606: /* When recursion is being used, local variables are allocated on the stack and
607: get preserved during recursion in the normal way. In this environment, fi and
608: i, and fc and c, can be the same variables. */
609:
610: #else /* NO_RECURSE not defined */
611: #define fi i
612: #define fc c
613:
614: /* Many of the following variables are used only in small blocks of the code.
615: My normal style of coding would have declared them within each of those blocks.
616: However, in order to accommodate the version of this code that uses an external
617: "stack" implemented on the heap, it is easier to declare them all here, so the
618: declarations can be cut out in a block. The only declarations within blocks
619: below are for variables that do not have to be preserved over a recursive call
620: to RMATCH(). */
621:
1.1.1.2 misho 622: #ifdef SUPPORT_UTF
623: const pcre_uchar *charptr;
1.1 misho 624: #endif
1.1.1.2 misho 625: const pcre_uchar *callpat;
626: const pcre_uchar *data;
627: const pcre_uchar *next;
628: PCRE_PUCHAR pp;
629: const pcre_uchar *prev;
630: PCRE_PUCHAR saved_eptr;
1.1 misho 631:
632: recursion_info new_recursive;
633:
634: BOOL cur_is_word;
635: BOOL condition;
636: BOOL prev_is_word;
637:
638: #ifdef SUPPORT_UCP
639: int prop_type;
1.1.1.4 misho 640: unsigned int prop_value;
1.1 misho 641: int prop_fail_result;
642: int oclength;
1.1.1.2 misho 643: pcre_uchar occhars[6];
1.1 misho 644: #endif
645:
646: int codelink;
647: int ctype;
648: int length;
649: int max;
650: int min;
1.1.1.4 misho 651: unsigned int number;
1.1 misho 652: int offset;
1.1.1.4 misho 653: unsigned int op;
654: pcre_int32 save_capture_last;
1.1 misho 655: int save_offset1, save_offset2, save_offset3;
656: int stacksave[REC_STACK_SAVE_MAX];
657:
658: eptrblock newptrb;
1.1.1.2 misho 659:
660: /* There is a special fudge for calling match() in a way that causes it to
661: measure the size of its basic stack frame when the stack is being used for
662: recursion. The second argument (ecode) being NULL triggers this behaviour. It
663: cannot normally ever be NULL. The return is the negated value of the frame
664: size. */
665:
666: if (ecode == NULL)
667: {
668: if (rdepth == 0)
669: return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
670: else
671: {
672: int len = (char *)&rdepth - (char *)eptr;
673: return (len > 0)? -len : len;
674: }
675: }
1.1 misho 676: #endif /* NO_RECURSE */
677:
678: /* To save space on the stack and in the heap frame, I have doubled up on some
679: of the local variables that are used only in localised parts of the code, but
680: still need to be preserved over recursive calls of match(). These macros define
681: the alternative names that are used. */
682:
683: #define allow_zero cur_is_word
684: #define cbegroup condition
685: #define code_offset codelink
686: #define condassert condition
687: #define matched_once prev_is_word
1.1.1.2 misho 688: #define foc number
689: #define save_mark data
1.1 misho 690:
691: /* These statements are here to stop the compiler complaining about uninitialized
692: variables. */
693:
694: #ifdef SUPPORT_UCP
695: prop_value = 0;
696: prop_fail_result = 0;
697: #endif
698:
699:
700: /* This label is used for tail recursion, which is used in a few cases even
701: when NO_RECURSE is not defined, in order to reduce the amount of stack that is
702: used. Thanks to Ian Taylor for noticing this possibility and sending the
703: original patch. */
704:
705: TAIL_RECURSE:
706:
707: /* OK, now we can get on with the real code of the function. Recursive calls
708: are specified by the macro RMATCH and RRETURN is used to return. When
709: NO_RECURSE is *not* defined, these just turn into a recursive call to match()
710: and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
711: defined). However, RMATCH isn't like a function call because it's quite a
712: complicated macro. It has to be used in one particular way. This shouldn't,
713: however, impact performance when true recursion is being used. */
714:
1.1.1.2 misho 715: #ifdef SUPPORT_UTF
716: utf = md->utf; /* Local copy of the flag */
1.1 misho 717: #else
1.1.1.2 misho 718: utf = FALSE;
1.1 misho 719: #endif
720:
721: /* First check that we haven't called match() too many times, or that we
722: haven't exceeded the recursive call limit. */
723:
724: if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
725: if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
726:
727: /* At the start of a group with an unlimited repeat that may match an empty
728: string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
729: done this way to save having to use another function argument, which would take
730: up space on the stack. See also MATCH_CONDASSERT below.
731:
732: When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
733: such remembered pointers, to be checked when we hit the closing ket, in order
734: to break infinite loops that match no characters. When match() is called in
735: other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
736: NOT be used with tail recursion, because the memory block that is used is on
737: the stack, so a new one may be required for each match(). */
738:
739: if (md->match_function_type == MATCH_CBEGROUP)
740: {
741: newptrb.epb_saved_eptr = eptr;
742: newptrb.epb_prev = eptrb;
743: eptrb = &newptrb;
744: md->match_function_type = 0;
745: }
746:
747: /* Now start processing the opcodes. */
748:
749: for (;;)
750: {
751: minimize = possessive = FALSE;
752: op = *ecode;
753:
754: switch(op)
755: {
756: case OP_MARK:
757: md->nomatch_mark = ecode + 2;
758: md->mark = NULL; /* In case previously set by assertion */
1.1.1.2 misho 759: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
1.1 misho 760: eptrb, RM55);
761: if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
762: md->mark == NULL) md->mark = ecode + 2;
763:
764: /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
765: argument, and we must check whether that argument matches this MARK's
766: argument. It is passed back in md->start_match_ptr (an overloading of that
767: variable). If it does match, we reset that variable to the current subject
768: position and return MATCH_SKIP. Otherwise, pass back the return code
769: unaltered. */
770:
771: else if (rrc == MATCH_SKIP_ARG &&
1.1.1.4 misho 772: STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
1.1 misho 773: {
774: md->start_match_ptr = eptr;
775: RRETURN(MATCH_SKIP);
776: }
777: RRETURN(rrc);
778:
779: case OP_FAIL:
780: RRETURN(MATCH_NOMATCH);
781:
782: case OP_COMMIT:
1.1.1.2 misho 783: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1.1 misho 784: eptrb, RM52);
1.1.1.4 misho 785: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1 misho 786: RRETURN(MATCH_COMMIT);
787:
788: case OP_PRUNE:
1.1.1.2 misho 789: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1.1 misho 790: eptrb, RM51);
1.1.1.4 misho 791: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1 misho 792: RRETURN(MATCH_PRUNE);
793:
794: case OP_PRUNE_ARG:
795: md->nomatch_mark = ecode + 2;
796: md->mark = NULL; /* In case previously set by assertion */
1.1.1.2 misho 797: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
1.1 misho 798: eptrb, RM56);
799: if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
800: md->mark == NULL) md->mark = ecode + 2;
1.1.1.4 misho 801: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1 misho 802: RRETURN(MATCH_PRUNE);
803:
804: case OP_SKIP:
1.1.1.2 misho 805: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1.1 misho 806: eptrb, RM53);
1.1.1.4 misho 807: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1 misho 808: md->start_match_ptr = eptr; /* Pass back current position */
809: RRETURN(MATCH_SKIP);
810:
811: /* Note that, for Perl compatibility, SKIP with an argument does NOT set
1.1.1.4 misho 812: nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
813: not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
814: that failed and any that precede it (either they also failed, or were not
815: triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
816: SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg
817: set to the count of the one that failed. */
1.1 misho 818:
819: case OP_SKIP_ARG:
1.1.1.4 misho 820: md->skip_arg_count++;
821: if (md->skip_arg_count <= md->ignore_skip_arg)
1.1 misho 822: {
1.1.1.2 misho 823: ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
1.1 misho 824: break;
825: }
1.1.1.2 misho 826: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
1.1 misho 827: eptrb, RM57);
1.1.1.4 misho 828: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1 misho 829:
830: /* Pass back the current skip name by overloading md->start_match_ptr and
831: returning the special MATCH_SKIP_ARG return code. This will either be
832: caught by a matching MARK, or get to the top, where it causes a rematch
1.1.1.4 misho 833: with md->ignore_skip_arg set to the value of md->skip_arg_count. */
1.1 misho 834:
835: md->start_match_ptr = ecode + 2;
836: RRETURN(MATCH_SKIP_ARG);
837:
838: /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
839: the branch in which it occurs can be determined. Overload the start of
840: match pointer to do this. */
841:
842: case OP_THEN:
1.1.1.2 misho 843: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1.1 misho 844: eptrb, RM54);
845: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
846: md->start_match_ptr = ecode;
847: RRETURN(MATCH_THEN);
848:
849: case OP_THEN_ARG:
850: md->nomatch_mark = ecode + 2;
851: md->mark = NULL; /* In case previously set by assertion */
1.1.1.2 misho 852: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
1.1 misho 853: md, eptrb, RM58);
854: if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
855: md->mark == NULL) md->mark = ecode + 2;
856: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
857: md->start_match_ptr = ecode;
858: RRETURN(MATCH_THEN);
859:
860: /* Handle an atomic group that does not contain any capturing parentheses.
861: This can be handled like an assertion. Prior to 8.13, all atomic groups
862: were handled this way. In 8.13, the code was changed as below for ONCE, so
863: that backups pass through the group and thereby reset captured values.
864: However, this uses a lot more stack, so in 8.20, atomic groups that do not
865: contain any captures generate OP_ONCE_NC, which can be handled in the old,
866: less stack intensive way.
867:
868: Check the alternative branches in turn - the matching won't pass the KET
869: for this kind of subpattern. If any one branch matches, we carry on as at
870: the end of a normal bracket, leaving the subject pointer, but resetting
871: the start-of-match value in case it was changed by \K. */
872:
873: case OP_ONCE_NC:
874: prev = ecode;
875: saved_eptr = eptr;
1.1.1.2 misho 876: save_mark = md->mark;
1.1 misho 877: do
878: {
879: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
880: if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
881: {
882: mstart = md->start_match_ptr;
883: break;
884: }
885: if (rrc == MATCH_THEN)
886: {
887: next = ecode + GET(ecode,1);
888: if (md->start_match_ptr < next &&
889: (*ecode == OP_ALT || *next == OP_ALT))
890: rrc = MATCH_NOMATCH;
891: }
892:
893: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
894: ecode += GET(ecode,1);
1.1.1.2 misho 895: md->mark = save_mark;
1.1 misho 896: }
897: while (*ecode == OP_ALT);
898:
899: /* If hit the end of the group (which could be repeated), fail */
900:
901: if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
902:
903: /* Continue as from after the group, updating the offsets high water
904: mark, since extracts may have been taken. */
905:
906: do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
907:
908: offset_top = md->end_offset_top;
909: eptr = md->end_match_ptr;
910:
911: /* For a non-repeating ket, just continue at this level. This also
912: happens for a repeating ket if no characters were matched in the group.
913: This is the forcible breaking of infinite loops as implemented in Perl
914: 5.005. */
915:
916: if (*ecode == OP_KET || eptr == saved_eptr)
917: {
918: ecode += 1+LINK_SIZE;
919: break;
920: }
921:
922: /* The repeating kets try the rest of the pattern or restart from the
923: preceding bracket, in the appropriate order. The second "call" of match()
924: uses tail recursion, to avoid using another stack frame. */
925:
926: if (*ecode == OP_KETRMIN)
927: {
928: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
929: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
930: ecode = prev;
931: goto TAIL_RECURSE;
932: }
933: else /* OP_KETRMAX */
934: {
935: RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
936: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
937: ecode += 1 + LINK_SIZE;
938: goto TAIL_RECURSE;
939: }
940: /* Control never gets here */
941:
942: /* Handle a capturing bracket, other than those that are possessive with an
943: unlimited repeat. If there is space in the offset vector, save the current
944: subject position in the working slot at the top of the vector. We mustn't
945: change the current values of the data slot, because they may be set from a
946: previous iteration of this group, and be referred to by a reference inside
947: the group. A failure to match might occur after the group has succeeded,
948: if something later on doesn't match. For this reason, we need to restore
949: the working value and also the values of the final offsets, in case they
950: were set by a previous iteration of the same bracket.
951:
952: If there isn't enough space in the offset vector, treat this as if it were
953: a non-capturing bracket. Don't worry about setting the flag for the error
954: case here; that is handled in the code for KET. */
955:
956: case OP_CBRA:
957: case OP_SCBRA:
958: number = GET2(ecode, 1+LINK_SIZE);
959: offset = number << 1;
960:
961: #ifdef PCRE_DEBUG
962: printf("start bracket %d\n", number);
963: printf("subject=");
964: pchars(eptr, 16, TRUE, md);
965: printf("\n");
966: #endif
967:
968: if (offset < md->offset_max)
969: {
970: save_offset1 = md->offset_vector[offset];
971: save_offset2 = md->offset_vector[offset+1];
972: save_offset3 = md->offset_vector[md->offset_end - number];
973: save_capture_last = md->capture_last;
1.1.1.2 misho 974: save_mark = md->mark;
1.1 misho 975:
976: DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
977: md->offset_vector[md->offset_end - number] =
978: (int)(eptr - md->start_subject);
979:
980: for (;;)
981: {
982: if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1.1.1.2 misho 983: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1.1 misho 984: eptrb, RM1);
985: if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
986:
987: /* If we backed up to a THEN, check whether it is within the current
988: branch by comparing the address of the THEN that is passed back with
989: the end of the branch. If it is within the current branch, and the
990: branch is one of two or more alternatives (it either starts or ends
991: with OP_ALT), we have reached the limit of THEN's action, so convert
992: the return code to NOMATCH, which will cause normal backtracking to
993: happen from now on. Otherwise, THEN is passed back to an outer
994: alternative. This implements Perl's treatment of parenthesized groups,
995: where a group not containing | does not affect the current alternative,
996: that is, (X) is NOT the same as (X|(*F)). */
997:
998: if (rrc == MATCH_THEN)
999: {
1000: next = ecode + GET(ecode,1);
1001: if (md->start_match_ptr < next &&
1002: (*ecode == OP_ALT || *next == OP_ALT))
1003: rrc = MATCH_NOMATCH;
1004: }
1005:
1006: /* Anything other than NOMATCH is passed back. */
1007:
1008: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1009: md->capture_last = save_capture_last;
1010: ecode += GET(ecode, 1);
1.1.1.2 misho 1011: md->mark = save_mark;
1.1 misho 1012: if (*ecode != OP_ALT) break;
1013: }
1014:
1015: DPRINTF(("bracket %d failed\n", number));
1016: md->offset_vector[offset] = save_offset1;
1017: md->offset_vector[offset+1] = save_offset2;
1018: md->offset_vector[md->offset_end - number] = save_offset3;
1019:
1020: /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1021:
1022: RRETURN(rrc);
1023: }
1024:
1025: /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1026: as a non-capturing bracket. */
1027:
1028: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1029: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1030:
1031: DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1032:
1033: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1034: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1035:
1036: /* Non-capturing or atomic group, except for possessive with unlimited
1037: repeat and ONCE group with no captures. Loop for all the alternatives.
1038:
1039: When we get to the final alternative within the brackets, we used to return
1040: the result of a recursive call to match() whatever happened so it was
1041: possible to reduce stack usage by turning this into a tail recursion,
1042: except in the case of a possibly empty group. However, now that there is
1043: the possibility of (*THEN) occurring in the final alternative, this
1044: optimization is no longer always possible.
1045:
1046: We can optimize if we know there are no (*THEN)s in the pattern; at present
1047: this is the best that can be done.
1048:
1049: MATCH_ONCE is returned when the end of an atomic group is successfully
1050: reached, but subsequent matching fails. It passes back up the tree (causing
1051: captured values to be reset) until the original atomic group level is
1052: reached. This is tested by comparing md->once_target with the start of the
1053: group. At this point, the return is converted into MATCH_NOMATCH so that
1054: previous backup points can be taken. */
1055:
1056: case OP_ONCE:
1057: case OP_BRA:
1058: case OP_SBRA:
1059: DPRINTF(("start non-capturing bracket\n"));
1060:
1061: for (;;)
1062: {
1.1.1.3 misho 1063: if (op >= OP_SBRA || op == OP_ONCE)
1064: md->match_function_type = MATCH_CBEGROUP;
1.1 misho 1065:
1066: /* If this is not a possibly empty group, and there are no (*THEN)s in
1067: the pattern, and this is the final alternative, optimize as described
1068: above. */
1069:
1070: else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1071: {
1.1.1.2 misho 1072: ecode += PRIV(OP_lengths)[*ecode];
1.1 misho 1073: goto TAIL_RECURSE;
1074: }
1075:
1076: /* In all other cases, we have to make another call to match(). */
1077:
1.1.1.2 misho 1078: save_mark = md->mark;
1.1.1.4 misho 1079: save_capture_last = md->capture_last;
1.1.1.2 misho 1080: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1.1 misho 1081: RM2);
1082:
1083: /* See comment in the code for capturing groups above about handling
1084: THEN. */
1085:
1086: if (rrc == MATCH_THEN)
1087: {
1088: next = ecode + GET(ecode,1);
1089: if (md->start_match_ptr < next &&
1090: (*ecode == OP_ALT || *next == OP_ALT))
1091: rrc = MATCH_NOMATCH;
1092: }
1093:
1094: if (rrc != MATCH_NOMATCH)
1095: {
1096: if (rrc == MATCH_ONCE)
1097: {
1.1.1.2 misho 1098: const pcre_uchar *scode = ecode;
1.1 misho 1099: if (*scode != OP_ONCE) /* If not at start, find it */
1100: {
1101: while (*scode == OP_ALT) scode += GET(scode, 1);
1102: scode -= GET(scode, 1);
1103: }
1104: if (md->once_target == scode) rrc = MATCH_NOMATCH;
1105: }
1106: RRETURN(rrc);
1107: }
1108: ecode += GET(ecode, 1);
1.1.1.2 misho 1109: md->mark = save_mark;
1.1 misho 1110: if (*ecode != OP_ALT) break;
1.1.1.4 misho 1111: md->capture_last = save_capture_last;
1.1 misho 1112: }
1113:
1114: RRETURN(MATCH_NOMATCH);
1115:
1116: /* Handle possessive capturing brackets with an unlimited repeat. We come
1117: here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1118: handled similarly to the normal case above. However, the matching is
1119: different. The end of these brackets will always be OP_KETRPOS, which
1120: returns MATCH_KETRPOS without going further in the pattern. By this means
1121: we can handle the group by iteration rather than recursion, thereby
1122: reducing the amount of stack needed. */
1123:
1124: case OP_CBRAPOS:
1125: case OP_SCBRAPOS:
1126: allow_zero = FALSE;
1127:
1128: POSSESSIVE_CAPTURE:
1129: number = GET2(ecode, 1+LINK_SIZE);
1130: offset = number << 1;
1131:
1132: #ifdef PCRE_DEBUG
1133: printf("start possessive bracket %d\n", number);
1134: printf("subject=");
1135: pchars(eptr, 16, TRUE, md);
1136: printf("\n");
1137: #endif
1138:
1139: if (offset < md->offset_max)
1140: {
1141: matched_once = FALSE;
1142: code_offset = (int)(ecode - md->start_code);
1143:
1144: save_offset1 = md->offset_vector[offset];
1145: save_offset2 = md->offset_vector[offset+1];
1146: save_offset3 = md->offset_vector[md->offset_end - number];
1147: save_capture_last = md->capture_last;
1148:
1149: DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1150:
1151: /* Each time round the loop, save the current subject position for use
1152: when the group matches. For MATCH_MATCH, the group has matched, so we
1153: restart it with a new subject starting position, remembering that we had
1154: at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1155: usual. If we haven't matched any alternatives in any iteration, check to
1156: see if a previous iteration matched. If so, the group has matched;
1157: continue from afterwards. Otherwise it has failed; restore the previous
1158: capture values before returning NOMATCH. */
1159:
1160: for (;;)
1161: {
1162: md->offset_vector[md->offset_end - number] =
1163: (int)(eptr - md->start_subject);
1164: if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1.1.1.2 misho 1165: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1.1 misho 1166: eptrb, RM63);
1167: if (rrc == MATCH_KETRPOS)
1168: {
1169: offset_top = md->end_offset_top;
1170: eptr = md->end_match_ptr;
1171: ecode = md->start_code + code_offset;
1172: save_capture_last = md->capture_last;
1173: matched_once = TRUE;
1.1.1.5 ! misho 1174: mstart = md->start_match_ptr; /* In case \K changed it */
1.1 misho 1175: continue;
1176: }
1177:
1178: /* See comment in the code for capturing groups above about handling
1179: THEN. */
1180:
1181: if (rrc == MATCH_THEN)
1182: {
1183: next = ecode + GET(ecode,1);
1184: if (md->start_match_ptr < next &&
1185: (*ecode == OP_ALT || *next == OP_ALT))
1186: rrc = MATCH_NOMATCH;
1187: }
1188:
1189: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1190: md->capture_last = save_capture_last;
1191: ecode += GET(ecode, 1);
1192: if (*ecode != OP_ALT) break;
1193: }
1194:
1195: if (!matched_once)
1196: {
1197: md->offset_vector[offset] = save_offset1;
1198: md->offset_vector[offset+1] = save_offset2;
1199: md->offset_vector[md->offset_end - number] = save_offset3;
1200: }
1201:
1202: if (allow_zero || matched_once)
1203: {
1204: ecode += 1 + LINK_SIZE;
1205: break;
1206: }
1207:
1208: RRETURN(MATCH_NOMATCH);
1209: }
1210:
1211: /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1212: as a non-capturing bracket. */
1213:
1214: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1215: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1216:
1217: DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1218:
1219: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1220: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1221:
1222: /* Non-capturing possessive bracket with unlimited repeat. We come here
1223: from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1224: without the capturing complication. It is written out separately for speed
1225: and cleanliness. */
1226:
1227: case OP_BRAPOS:
1228: case OP_SBRAPOS:
1229: allow_zero = FALSE;
1230:
1231: POSSESSIVE_NON_CAPTURE:
1232: matched_once = FALSE;
1233: code_offset = (int)(ecode - md->start_code);
1.1.1.4 misho 1234: save_capture_last = md->capture_last;
1.1 misho 1235:
1236: for (;;)
1237: {
1238: if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1.1.1.2 misho 1239: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1.1 misho 1240: eptrb, RM48);
1241: if (rrc == MATCH_KETRPOS)
1242: {
1243: offset_top = md->end_offset_top;
1244: eptr = md->end_match_ptr;
1245: ecode = md->start_code + code_offset;
1246: matched_once = TRUE;
1.1.1.5 ! misho 1247: mstart = md->start_match_ptr; /* In case \K reset it */
1.1 misho 1248: continue;
1249: }
1250:
1251: /* See comment in the code for capturing groups above about handling
1252: THEN. */
1253:
1254: if (rrc == MATCH_THEN)
1255: {
1256: next = ecode + GET(ecode,1);
1257: if (md->start_match_ptr < next &&
1258: (*ecode == OP_ALT || *next == OP_ALT))
1259: rrc = MATCH_NOMATCH;
1260: }
1261:
1262: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1263: ecode += GET(ecode, 1);
1264: if (*ecode != OP_ALT) break;
1.1.1.4 misho 1265: md->capture_last = save_capture_last;
1.1 misho 1266: }
1267:
1268: if (matched_once || allow_zero)
1269: {
1270: ecode += 1 + LINK_SIZE;
1271: break;
1272: }
1273: RRETURN(MATCH_NOMATCH);
1274:
1275: /* Control never reaches here. */
1276:
1.1.1.5 ! misho 1277: /* Conditional group: compilation checked that there are no more than two
! 1278: branches. If the condition is false, skipping the first branch takes us
! 1279: past the end of the item if there is only one branch, but that's exactly
! 1280: what we want. */
1.1 misho 1281:
1282: case OP_COND:
1283: case OP_SCOND:
1.1.1.5 ! misho 1284:
! 1285: /* The variable codelink will be added to ecode when the condition is
! 1286: false, to get to the second branch. Setting it to the offset to the ALT
! 1287: or KET, then incrementing ecode achieves this effect. We now have ecode
! 1288: pointing to the condition or callout. */
! 1289:
! 1290: codelink = GET(ecode, 1); /* Offset to the second branch */
! 1291: ecode += 1 + LINK_SIZE; /* From this opcode */
1.1 misho 1292:
1293: /* Because of the way auto-callout works during compile, a callout item is
1294: inserted between OP_COND and an assertion condition. */
1295:
1.1.1.5 ! misho 1296: if (*ecode == OP_CALLOUT)
1.1 misho 1297: {
1.1.1.2 misho 1298: if (PUBL(callout) != NULL)
1.1 misho 1299: {
1.1.1.2 misho 1300: PUBL(callout_block) cb;
1.1 misho 1301: cb.version = 2; /* Version 1 of the callout block */
1.1.1.5 ! misho 1302: cb.callout_number = ecode[1];
1.1 misho 1303: cb.offset_vector = md->offset_vector;
1.1.1.4 misho 1304: #if defined COMPILE_PCRE8
1.1 misho 1305: cb.subject = (PCRE_SPTR)md->start_subject;
1.1.1.4 misho 1306: #elif defined COMPILE_PCRE16
1.1.1.2 misho 1307: cb.subject = (PCRE_SPTR16)md->start_subject;
1.1.1.4 misho 1308: #elif defined COMPILE_PCRE32
1309: cb.subject = (PCRE_SPTR32)md->start_subject;
1.1.1.2 misho 1310: #endif
1.1 misho 1311: cb.subject_length = (int)(md->end_subject - md->start_subject);
1312: cb.start_match = (int)(mstart - md->start_subject);
1313: cb.current_position = (int)(eptr - md->start_subject);
1.1.1.5 ! misho 1314: cb.pattern_position = GET(ecode, 2);
! 1315: cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1.1 misho 1316: cb.capture_top = offset_top/2;
1.1.1.4 misho 1317: cb.capture_last = md->capture_last & CAPLMASK;
1318: /* Internal change requires this for API compatibility. */
1319: if (cb.capture_last == 0) cb.capture_last = -1;
1.1 misho 1320: cb.callout_data = md->callout_data;
1321: cb.mark = md->nomatch_mark;
1.1.1.2 misho 1322: if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1.1 misho 1323: if (rrc < 0) RRETURN(rrc);
1324: }
1.1.1.5 ! misho 1325:
! 1326: /* Advance ecode past the callout, so it now points to the condition. We
! 1327: must adjust codelink so that the value of ecode+codelink is unchanged. */
! 1328:
1.1.1.2 misho 1329: ecode += PRIV(OP_lengths)[OP_CALLOUT];
1.1.1.4 misho 1330: codelink -= PRIV(OP_lengths)[OP_CALLOUT];
1.1 misho 1331: }
1332:
1.1.1.5 ! misho 1333: /* Test the various possible conditions */
1.1 misho 1334:
1.1.1.5 ! misho 1335: condition = FALSE;
! 1336: switch(condcode = *ecode)
1.1 misho 1337: {
1.1.1.5 ! misho 1338: case OP_RREF: /* Numbered group recursion test */
! 1339: if (md->recursive != NULL) /* Not recursing => FALSE */
1.1 misho 1340: {
1.1.1.5 ! misho 1341: unsigned int recno = GET2(ecode, 1); /* Recursion group number*/
1.1 misho 1342: condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1.1.1.5 ! misho 1343: }
! 1344: break;
1.1 misho 1345:
1.1.1.5 ! misho 1346: case OP_DNRREF: /* Duplicate named group recursion test */
! 1347: if (md->recursive != NULL)
! 1348: {
! 1349: int count = GET2(ecode, 1 + IMM2_SIZE);
! 1350: pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
! 1351: while (count-- > 0)
! 1352: {
! 1353: unsigned int recno = GET2(slot, 0);
! 1354: condition = recno == md->recursive->group_num;
! 1355: if (condition) break;
! 1356: slot += md->name_entry_size;
1.1 misho 1357: }
1358: }
1.1.1.5 ! misho 1359: break;
1.1 misho 1360:
1.1.1.5 ! misho 1361: case OP_CREF: /* Numbered group used test */
! 1362: offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1.1 misho 1363: condition = offset < offset_top && md->offset_vector[offset] >= 0;
1.1.1.5 ! misho 1364: break;
1.1 misho 1365:
1.1.1.5 ! misho 1366: case OP_DNCREF: /* Duplicate named group used test */
1.1 misho 1367: {
1.1.1.5 ! misho 1368: int count = GET2(ecode, 1 + IMM2_SIZE);
! 1369: pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
! 1370: while (count-- > 0)
! 1371: {
! 1372: offset = GET2(slot, 0) << 1;
! 1373: condition = offset < offset_top && md->offset_vector[offset] >= 0;
! 1374: if (condition) break;
! 1375: slot += md->name_entry_size;
1.1 misho 1376: }
1377: }
1.1.1.5 ! misho 1378: break;
1.1 misho 1379:
1.1.1.5 ! misho 1380: case OP_DEF: /* DEFINE - always false */
! 1381: break;
1.1 misho 1382:
1.1.1.5 ! misho 1383: /* The condition is an assertion. Call match() to evaluate it - setting
! 1384: md->match_function_type to MATCH_CONDASSERT causes it to stop at the end
! 1385: of an assertion. */
1.1 misho 1386:
1.1.1.5 ! misho 1387: default:
1.1 misho 1388: md->match_function_type = MATCH_CONDASSERT;
1.1.1.5 ! misho 1389: RMATCH(eptr, ecode, offset_top, md, NULL, RM3);
1.1 misho 1390: if (rrc == MATCH_MATCH)
1391: {
1392: if (md->end_offset_top > offset_top)
1393: offset_top = md->end_offset_top; /* Captures may have happened */
1394: condition = TRUE;
1.1.1.5 ! misho 1395:
! 1396: /* Advance ecode past the assertion to the start of the first branch,
! 1397: but adjust it so that the general choosing code below works. */
! 1398:
! 1399: ecode += GET(ecode, 1);
1.1 misho 1400: while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1.1.1.5 ! misho 1401: ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode];
1.1 misho 1402: }
1403:
1404: /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1.1.1.5 ! misho 1405: assertion; it is therefore treated as NOMATCH. Any other return is an
! 1406: error. */
1.1 misho 1407:
1408: else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1409: {
1410: RRETURN(rrc); /* Need braces because of following else */
1411: }
1.1.1.5 ! misho 1412: break;
1.1 misho 1413: }
1414:
1.1.1.5 ! misho 1415: /* Choose branch according to the condition */
! 1416:
! 1417: ecode += condition? PRIV(OP_lengths)[condcode] : codelink;
1.1 misho 1418:
1.1.1.5 ! misho 1419: /* We are now at the branch that is to be obeyed. As there is only one, we
! 1420: can use tail recursion to avoid using another stack frame, except when
! 1421: there is unlimited repeat of a possibly empty group. In the latter case, a
! 1422: recursive call to match() is always required, unless the second alternative
! 1423: doesn't exist, in which case we can just plough on. Note that, for
! 1424: compatibility with Perl, the | in a conditional group is NOT treated as
! 1425: creating two alternatives. If a THEN is encountered in the branch, it
! 1426: propagates out to the enclosing alternative (unless nested in a deeper set
! 1427: of alternatives, of course). */
! 1428:
! 1429: if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT)
1.1 misho 1430: {
1431: if (op != OP_SCOND)
1432: {
1433: goto TAIL_RECURSE;
1434: }
1435:
1436: md->match_function_type = MATCH_CBEGROUP;
1.1.1.5 ! misho 1437: RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
1.1 misho 1438: RRETURN(rrc);
1439: }
1440:
1441: /* Condition false & no alternative; continue after the group. */
1442:
1443: else
1444: {
1445: }
1446: break;
1447:
1448:
1449: /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1450: to close any currently open capturing brackets. */
1451:
1452: case OP_CLOSE:
1.1.1.4 misho 1453: number = GET2(ecode, 1); /* Must be less than 65536 */
1.1 misho 1454: offset = number << 1;
1455:
1456: #ifdef PCRE_DEBUG
1457: printf("end bracket %d at *ACCEPT", number);
1458: printf("\n");
1459: #endif
1460:
1.1.1.4 misho 1461: md->capture_last = (md->capture_last & OVFLMASK) | number;
1462: if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1.1 misho 1463: {
1464: md->offset_vector[offset] =
1465: md->offset_vector[md->offset_end - number];
1466: md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1467: if (offset_top <= offset) offset_top = offset + 2;
1468: }
1.1.1.2 misho 1469: ecode += 1 + IMM2_SIZE;
1.1 misho 1470: break;
1471:
1472:
1473: /* End of the pattern, either real or forced. */
1474:
1475: case OP_END:
1476: case OP_ACCEPT:
1477: case OP_ASSERT_ACCEPT:
1478:
1479: /* If we have matched an empty string, fail if not in an assertion and not
1480: in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1481: is set and we have matched at the start of the subject. In both cases,
1482: backtracking will then try other alternatives, if any. */
1483:
1484: if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1485: md->recursive == NULL &&
1486: (md->notempty ||
1487: (md->notempty_atstart &&
1488: mstart == md->start_subject + md->start_offset)))
1489: RRETURN(MATCH_NOMATCH);
1490:
1491: /* Otherwise, we have a match. */
1492:
1493: md->end_match_ptr = eptr; /* Record where we ended */
1494: md->end_offset_top = offset_top; /* and how many extracts were taken */
1495: md->start_match_ptr = mstart; /* and the start (\K can modify) */
1496:
1497: /* For some reason, the macros don't work properly if an expression is
1498: given as the argument to RRETURN when the heap is in use. */
1499:
1500: rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1501: RRETURN(rrc);
1502:
1503: /* Assertion brackets. Check the alternative branches in turn - the
1504: matching won't pass the KET for an assertion. If any one branch matches,
1505: the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1506: start of each branch to move the current point backwards, so the code at
1507: this level is identical to the lookahead case. When the assertion is part
1508: of a condition, we want to return immediately afterwards. The caller of
1509: this incarnation of the match() function will have set MATCH_CONDASSERT in
1510: md->match_function_type, and one of these opcodes will be the first opcode
1511: that is processed. We use a local variable that is preserved over calls to
1512: match() to remember this case. */
1513:
1514: case OP_ASSERT:
1515: case OP_ASSERTBACK:
1.1.1.2 misho 1516: save_mark = md->mark;
1.1 misho 1517: if (md->match_function_type == MATCH_CONDASSERT)
1518: {
1519: condassert = TRUE;
1520: md->match_function_type = 0;
1521: }
1522: else condassert = FALSE;
1523:
1.1.1.4 misho 1524: /* Loop for each branch */
1525:
1.1 misho 1526: do
1527: {
1528: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1.1.1.4 misho 1529:
1530: /* A match means that the assertion is true; break out of the loop
1531: that matches its alternatives. */
1532:
1.1 misho 1533: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1534: {
1535: mstart = md->start_match_ptr; /* In case \K reset it */
1536: break;
1537: }
1.1.1.4 misho 1538:
1539: /* If not matched, restore the previous mark setting. */
1540:
1.1.1.3 misho 1541: md->mark = save_mark;
1.1 misho 1542:
1.1.1.4 misho 1543: /* See comment in the code for capturing groups above about handling
1544: THEN. */
1.1.1.3 misho 1545:
1.1.1.4 misho 1546: if (rrc == MATCH_THEN)
1547: {
1548: next = ecode + GET(ecode,1);
1549: if (md->start_match_ptr < next &&
1550: (*ecode == OP_ALT || *next == OP_ALT))
1551: rrc = MATCH_NOMATCH;
1552: }
1.1.1.3 misho 1553:
1.1.1.4 misho 1554: /* Anything other than NOMATCH causes the entire assertion to fail,
1555: passing back the return code. This includes COMMIT, SKIP, PRUNE and an
1556: uncaptured THEN, which means they take their normal effect. This
1557: consistent approach does not always have exactly the same effect as in
1558: Perl. */
1.1 misho 1559:
1.1.1.4 misho 1560: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1 misho 1561: ecode += GET(ecode, 1);
1562: }
1.1.1.4 misho 1563: while (*ecode == OP_ALT); /* Continue for next alternative */
1564:
1565: /* If we have tried all the alternative branches, the assertion has
1566: failed. If not, we broke out after a match. */
1.1 misho 1567:
1568: if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1569:
1570: /* If checking an assertion for a condition, return MATCH_MATCH. */
1571:
1572: if (condassert) RRETURN(MATCH_MATCH);
1573:
1.1.1.4 misho 1574: /* Continue from after a successful assertion, updating the offsets high
1575: water mark, since extracts may have been taken during the assertion. */
1.1 misho 1576:
1577: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1578: ecode += 1 + LINK_SIZE;
1579: offset_top = md->end_offset_top;
1580: continue;
1581:
1.1.1.4 misho 1582: /* Negative assertion: all branches must fail to match for the assertion to
1583: succeed. */
1.1 misho 1584:
1585: case OP_ASSERT_NOT:
1586: case OP_ASSERTBACK_NOT:
1.1.1.2 misho 1587: save_mark = md->mark;
1.1 misho 1588: if (md->match_function_type == MATCH_CONDASSERT)
1589: {
1590: condassert = TRUE;
1591: md->match_function_type = 0;
1592: }
1593: else condassert = FALSE;
1594:
1.1.1.4 misho 1595: /* Loop for each alternative branch. */
1596:
1.1 misho 1597: do
1598: {
1599: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1.1.1.4 misho 1600: md->mark = save_mark; /* Always restore the mark setting */
1601:
1602: switch(rrc)
1.1 misho 1603: {
1.1.1.4 misho 1604: case MATCH_MATCH: /* A successful match means */
1605: case MATCH_ACCEPT: /* the assertion has failed. */
1606: RRETURN(MATCH_NOMATCH);
1607:
1608: case MATCH_NOMATCH: /* Carry on with next branch */
1.1 misho 1609: break;
1.1.1.4 misho 1610:
1611: /* See comment in the code for capturing groups above about handling
1612: THEN. */
1613:
1614: case MATCH_THEN:
1615: next = ecode + GET(ecode,1);
1616: if (md->start_match_ptr < next &&
1617: (*ecode == OP_ALT || *next == OP_ALT))
1618: {
1619: rrc = MATCH_NOMATCH;
1620: break;
1621: }
1622: /* Otherwise fall through. */
1623:
1624: /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole
1625: assertion to fail to match, without considering any more alternatives.
1626: Failing to match means the assertion is true. This is a consistent
1627: approach, but does not always have the same effect as in Perl. */
1628:
1629: case MATCH_COMMIT:
1630: case MATCH_SKIP:
1631: case MATCH_SKIP_ARG:
1632: case MATCH_PRUNE:
1633: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1634: goto NEG_ASSERT_TRUE; /* Break out of alternation loop */
1635:
1636: /* Anything else is an error */
1637:
1638: default:
1639: RRETURN(rrc);
1.1 misho 1640: }
1641:
1.1.1.4 misho 1642: /* Continue with next branch */
1.1 misho 1643:
1644: ecode += GET(ecode,1);
1645: }
1646: while (*ecode == OP_ALT);
1647:
1.1.1.4 misho 1648: /* All branches in the assertion failed to match. */
1.1 misho 1649:
1.1.1.4 misho 1650: NEG_ASSERT_TRUE:
1651: if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1652: ecode += 1 + LINK_SIZE; /* Continue with current branch */
1.1 misho 1653: continue;
1654:
1655: /* Move the subject pointer back. This occurs only at the start of
1656: each branch of a lookbehind assertion. If we are too close to the start to
1657: move back, this match function fails. When working with UTF-8 we move
1658: back a number of characters, not bytes. */
1659:
1660: case OP_REVERSE:
1.1.1.2 misho 1661: #ifdef SUPPORT_UTF
1662: if (utf)
1.1 misho 1663: {
1664: i = GET(ecode, 1);
1665: while (i-- > 0)
1666: {
1667: eptr--;
1668: if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1669: BACKCHAR(eptr);
1670: }
1671: }
1672: else
1673: #endif
1674:
1675: /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1676:
1677: {
1678: eptr -= GET(ecode, 1);
1679: if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1680: }
1681:
1682: /* Save the earliest consulted character, then skip to next op code */
1683:
1684: if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1685: ecode += 1 + LINK_SIZE;
1686: break;
1687:
1688: /* The callout item calls an external function, if one is provided, passing
1689: details of the match so far. This is mainly for debugging, though the
1690: function is able to force a failure. */
1691:
1692: case OP_CALLOUT:
1.1.1.2 misho 1693: if (PUBL(callout) != NULL)
1.1 misho 1694: {
1.1.1.2 misho 1695: PUBL(callout_block) cb;
1.1 misho 1696: cb.version = 2; /* Version 2 of the callout block */
1697: cb.callout_number = ecode[1];
1698: cb.offset_vector = md->offset_vector;
1.1.1.4 misho 1699: #if defined COMPILE_PCRE8
1.1 misho 1700: cb.subject = (PCRE_SPTR)md->start_subject;
1.1.1.4 misho 1701: #elif defined COMPILE_PCRE16
1.1.1.2 misho 1702: cb.subject = (PCRE_SPTR16)md->start_subject;
1.1.1.4 misho 1703: #elif defined COMPILE_PCRE32
1704: cb.subject = (PCRE_SPTR32)md->start_subject;
1.1.1.2 misho 1705: #endif
1.1 misho 1706: cb.subject_length = (int)(md->end_subject - md->start_subject);
1707: cb.start_match = (int)(mstart - md->start_subject);
1708: cb.current_position = (int)(eptr - md->start_subject);
1709: cb.pattern_position = GET(ecode, 2);
1710: cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1711: cb.capture_top = offset_top/2;
1.1.1.4 misho 1712: cb.capture_last = md->capture_last & CAPLMASK;
1713: /* Internal change requires this for API compatibility. */
1714: if (cb.capture_last == 0) cb.capture_last = -1;
1.1 misho 1715: cb.callout_data = md->callout_data;
1716: cb.mark = md->nomatch_mark;
1.1.1.2 misho 1717: if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1.1 misho 1718: if (rrc < 0) RRETURN(rrc);
1719: }
1720: ecode += 2 + 2*LINK_SIZE;
1721: break;
1722:
1723: /* Recursion either matches the current regex, or some subexpression. The
1724: offset data is the offset to the starting bracket from the start of the
1725: whole pattern. (This is so that it works from duplicated subpatterns.)
1726:
1727: The state of the capturing groups is preserved over recursion, and
1728: re-instated afterwards. We don't know how many are started and not yet
1729: finished (offset_top records the completed total) so we just have to save
1730: all the potential data. There may be up to 65535 such values, which is too
1731: large to put on the stack, but using malloc for small numbers seems
1732: expensive. As a compromise, the stack is used when there are no more than
1733: REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1734:
1735: There are also other values that have to be saved. We use a chained
1736: sequence of blocks that actually live on the stack. Thanks to Robin Houston
1737: for the original version of this logic. It has, however, been hacked around
1738: a lot, so he is not to blame for the current way it works. */
1739:
1740: case OP_RECURSE:
1741: {
1742: recursion_info *ri;
1.1.1.4 misho 1743: unsigned int recno;
1.1 misho 1744:
1745: callpat = md->start_code + GET(ecode, 1);
1746: recno = (callpat == md->start_code)? 0 :
1747: GET2(callpat, 1 + LINK_SIZE);
1748:
1749: /* Check for repeating a recursion without advancing the subject pointer.
1750: This should catch convoluted mutual recursions. (Some simple cases are
1751: caught at compile time.) */
1752:
1753: for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1754: if (recno == ri->group_num && eptr == ri->subject_position)
1755: RRETURN(PCRE_ERROR_RECURSELOOP);
1756:
1757: /* Add to "recursing stack" */
1758:
1759: new_recursive.group_num = recno;
1.1.1.4 misho 1760: new_recursive.saved_capture_last = md->capture_last;
1.1 misho 1761: new_recursive.subject_position = eptr;
1762: new_recursive.prevrec = md->recursive;
1763: md->recursive = &new_recursive;
1764:
1765: /* Where to continue from afterwards */
1766:
1767: ecode += 1 + LINK_SIZE;
1768:
1769: /* Now save the offset data */
1770:
1771: new_recursive.saved_max = md->offset_end;
1772: if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1773: new_recursive.offset_save = stacksave;
1774: else
1775: {
1776: new_recursive.offset_save =
1.1.1.2 misho 1777: (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1.1 misho 1778: if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1779: }
1780: memcpy(new_recursive.offset_save, md->offset_vector,
1781: new_recursive.saved_max * sizeof(int));
1782:
1783: /* OK, now we can do the recursion. After processing each alternative,
1.1.1.4 misho 1784: restore the offset data and the last captured value. If there were nested
1785: recursions, md->recursive might be changed, so reset it before looping.
1786: */
1.1 misho 1787:
1788: DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1789: cbegroup = (*callpat >= OP_SBRA);
1790: do
1791: {
1792: if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1.1.1.2 misho 1793: RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1.1 misho 1794: md, eptrb, RM6);
1795: memcpy(md->offset_vector, new_recursive.offset_save,
1796: new_recursive.saved_max * sizeof(int));
1.1.1.4 misho 1797: md->capture_last = new_recursive.saved_capture_last;
1.1 misho 1798: md->recursive = new_recursive.prevrec;
1799: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1800: {
1801: DPRINTF(("Recursion matched\n"));
1802: if (new_recursive.offset_save != stacksave)
1.1.1.2 misho 1803: (PUBL(free))(new_recursive.offset_save);
1.1 misho 1804:
1805: /* Set where we got to in the subject, and reset the start in case
1806: it was changed by \K. This *is* propagated back out of a recursion,
1807: for Perl compatibility. */
1808:
1809: eptr = md->end_match_ptr;
1810: mstart = md->start_match_ptr;
1811: goto RECURSION_MATCHED; /* Exit loop; end processing */
1812: }
1813:
1.1.1.4 misho 1814: /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
1815: recursion; they cause a NOMATCH for the entire recursion. These codes
1816: are defined in a range that can be tested for. */
1817:
1818: if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
1819: RRETURN(MATCH_NOMATCH);
1820:
1821: /* Any return code other than NOMATCH is an error. */
1.1 misho 1822:
1.1.1.4 misho 1823: if (rrc != MATCH_NOMATCH)
1.1 misho 1824: {
1825: DPRINTF(("Recursion gave error %d\n", rrc));
1826: if (new_recursive.offset_save != stacksave)
1.1.1.2 misho 1827: (PUBL(free))(new_recursive.offset_save);
1.1 misho 1828: RRETURN(rrc);
1829: }
1830:
1831: md->recursive = &new_recursive;
1832: callpat += GET(callpat, 1);
1833: }
1834: while (*callpat == OP_ALT);
1835:
1836: DPRINTF(("Recursion didn't match\n"));
1837: md->recursive = new_recursive.prevrec;
1838: if (new_recursive.offset_save != stacksave)
1.1.1.2 misho 1839: (PUBL(free))(new_recursive.offset_save);
1.1 misho 1840: RRETURN(MATCH_NOMATCH);
1841: }
1842:
1843: RECURSION_MATCHED:
1844: break;
1845:
1846: /* An alternation is the end of a branch; scan along to find the end of the
1847: bracketed group and go to there. */
1848:
1849: case OP_ALT:
1850: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1851: break;
1852:
1853: /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1854: indicating that it may occur zero times. It may repeat infinitely, or not
1855: at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1856: with fixed upper repeat limits are compiled as a number of copies, with the
1857: optional ones preceded by BRAZERO or BRAMINZERO. */
1858:
1859: case OP_BRAZERO:
1860: next = ecode + 1;
1861: RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1862: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1863: do next += GET(next, 1); while (*next == OP_ALT);
1864: ecode = next + 1 + LINK_SIZE;
1865: break;
1866:
1867: case OP_BRAMINZERO:
1868: next = ecode + 1;
1869: do next += GET(next, 1); while (*next == OP_ALT);
1870: RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1871: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1872: ecode++;
1873: break;
1874:
1875: case OP_SKIPZERO:
1876: next = ecode+1;
1877: do next += GET(next,1); while (*next == OP_ALT);
1878: ecode = next + 1 + LINK_SIZE;
1879: break;
1880:
1881: /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1882: here; just jump to the group, with allow_zero set TRUE. */
1883:
1884: case OP_BRAPOSZERO:
1885: op = *(++ecode);
1886: allow_zero = TRUE;
1887: if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1888: goto POSSESSIVE_NON_CAPTURE;
1889:
1890: /* End of a group, repeated or non-repeating. */
1891:
1892: case OP_KET:
1893: case OP_KETRMIN:
1894: case OP_KETRMAX:
1895: case OP_KETRPOS:
1896: prev = ecode - GET(ecode, 1);
1897:
1898: /* If this was a group that remembered the subject start, in order to break
1899: infinite repeats of empty string matches, retrieve the subject start from
1900: the chain. Otherwise, set it NULL. */
1901:
1902: if (*prev >= OP_SBRA || *prev == OP_ONCE)
1903: {
1904: saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1905: eptrb = eptrb->epb_prev; /* Backup to previous group */
1906: }
1907: else saved_eptr = NULL;
1908:
1909: /* If we are at the end of an assertion group or a non-capturing atomic
1910: group, stop matching and return MATCH_MATCH, but record the current high
1911: water mark for use by positive assertions. We also need to record the match
1912: start in case it was changed by \K. */
1913:
1914: if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1915: *prev == OP_ONCE_NC)
1916: {
1917: md->end_match_ptr = eptr; /* For ONCE_NC */
1918: md->end_offset_top = offset_top;
1919: md->start_match_ptr = mstart;
1920: RRETURN(MATCH_MATCH); /* Sets md->mark */
1921: }
1922:
1923: /* For capturing groups we have to check the group number back at the start
1924: and if necessary complete handling an extraction by setting the offsets and
1925: bumping the high water mark. Whole-pattern recursion is coded as a recurse
1926: into group 0, so it won't be picked up here. Instead, we catch it when the
1927: OP_END is reached. Other recursion is handled here. We just have to record
1928: the current subject position and start match pointer and give a MATCH
1929: return. */
1930:
1931: if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1932: *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1933: {
1934: number = GET2(prev, 1+LINK_SIZE);
1935: offset = number << 1;
1936:
1937: #ifdef PCRE_DEBUG
1938: printf("end bracket %d", number);
1939: printf("\n");
1940: #endif
1941:
1942: /* Handle a recursively called group. */
1943:
1944: if (md->recursive != NULL && md->recursive->group_num == number)
1945: {
1946: md->end_match_ptr = eptr;
1947: md->start_match_ptr = mstart;
1948: RRETURN(MATCH_MATCH);
1949: }
1950:
1951: /* Deal with capturing */
1952:
1.1.1.4 misho 1953: md->capture_last = (md->capture_last & OVFLMASK) | number;
1954: if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1.1 misho 1955: {
1956: /* If offset is greater than offset_top, it means that we are
1957: "skipping" a capturing group, and that group's offsets must be marked
1958: unset. In earlier versions of PCRE, all the offsets were unset at the
1959: start of matching, but this doesn't work because atomic groups and
1960: assertions can cause a value to be set that should later be unset.
1961: Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1962: part of the atomic group, but this is not on the final matching path,
1963: so must be unset when 2 is set. (If there is no group 2, there is no
1964: problem, because offset_top will then be 2, indicating no capture.) */
1965:
1966: if (offset > offset_top)
1967: {
1968: register int *iptr = md->offset_vector + offset_top;
1969: register int *iend = md->offset_vector + offset;
1970: while (iptr < iend) *iptr++ = -1;
1971: }
1972:
1973: /* Now make the extraction */
1974:
1975: md->offset_vector[offset] =
1976: md->offset_vector[md->offset_end - number];
1977: md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1978: if (offset_top <= offset) offset_top = offset + 2;
1979: }
1980: }
1981:
1982: /* For an ordinary non-repeating ket, just continue at this level. This
1983: also happens for a repeating ket if no characters were matched in the
1984: group. This is the forcible breaking of infinite loops as implemented in
1985: Perl 5.005. For a non-repeating atomic group that includes captures,
1986: establish a backup point by processing the rest of the pattern at a lower
1987: level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1988: original OP_ONCE level, thereby bypassing intermediate backup points, but
1989: resetting any captures that happened along the way. */
1990:
1991: if (*ecode == OP_KET || eptr == saved_eptr)
1992: {
1993: if (*prev == OP_ONCE)
1994: {
1995: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1996: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1997: md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1998: RRETURN(MATCH_ONCE);
1999: }
2000: ecode += 1 + LINK_SIZE; /* Carry on at this level */
2001: break;
2002: }
2003:
2004: /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2005: and return MATCH_KETRPOS. This makes it possible to do the repeats one
2006: at a time from the outer level, thus saving stack. */
2007:
2008: if (*ecode == OP_KETRPOS)
2009: {
1.1.1.5 ! misho 2010: md->start_match_ptr = mstart; /* In case \K reset it */
1.1 misho 2011: md->end_match_ptr = eptr;
2012: md->end_offset_top = offset_top;
2013: RRETURN(MATCH_KETRPOS);
2014: }
2015:
2016: /* The normal repeating kets try the rest of the pattern or restart from
2017: the preceding bracket, in the appropriate order. In the second case, we can
2018: use tail recursion to avoid using another stack frame, unless we have an
2019: atomic group or an unlimited repeat of a group that can match an empty
2020: string. */
2021:
2022: if (*ecode == OP_KETRMIN)
2023: {
2024: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2025: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2026: if (*prev == OP_ONCE)
2027: {
2028: RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2029: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2030: md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2031: RRETURN(MATCH_ONCE);
2032: }
2033: if (*prev >= OP_SBRA) /* Could match an empty string */
2034: {
2035: RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2036: RRETURN(rrc);
2037: }
2038: ecode = prev;
2039: goto TAIL_RECURSE;
2040: }
2041: else /* OP_KETRMAX */
2042: {
2043: RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2044: if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2045: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2046: if (*prev == OP_ONCE)
2047: {
2048: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2049: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2050: md->once_target = prev;
2051: RRETURN(MATCH_ONCE);
2052: }
2053: ecode += 1 + LINK_SIZE;
2054: goto TAIL_RECURSE;
2055: }
2056: /* Control never gets here */
2057:
2058: /* Not multiline mode: start of subject assertion, unless notbol. */
2059:
2060: case OP_CIRC:
2061: if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2062:
2063: /* Start of subject assertion */
2064:
2065: case OP_SOD:
2066: if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2067: ecode++;
2068: break;
2069:
2070: /* Multiline mode: start of subject unless notbol, or after any newline. */
2071:
2072: case OP_CIRCM:
2073: if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2074: if (eptr != md->start_subject &&
2075: (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2076: RRETURN(MATCH_NOMATCH);
2077: ecode++;
2078: break;
2079:
2080: /* Start of match assertion */
2081:
2082: case OP_SOM:
2083: if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2084: ecode++;
2085: break;
2086:
2087: /* Reset the start of match point */
2088:
2089: case OP_SET_SOM:
2090: mstart = eptr;
2091: ecode++;
2092: break;
2093:
2094: /* Multiline mode: assert before any newline, or before end of subject
2095: unless noteol is set. */
2096:
2097: case OP_DOLLM:
2098: if (eptr < md->end_subject)
1.1.1.3 misho 2099: {
2100: if (!IS_NEWLINE(eptr))
2101: {
2102: if (md->partial != 0 &&
2103: eptr + 1 >= md->end_subject &&
2104: NLBLOCK->nltype == NLTYPE_FIXED &&
2105: NLBLOCK->nllen == 2 &&
1.1.1.4 misho 2106: RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
1.1.1.3 misho 2107: {
2108: md->hitend = TRUE;
2109: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2110: }
2111: RRETURN(MATCH_NOMATCH);
2112: }
2113: }
1.1 misho 2114: else
2115: {
2116: if (md->noteol) RRETURN(MATCH_NOMATCH);
2117: SCHECK_PARTIAL();
2118: }
2119: ecode++;
2120: break;
2121:
2122: /* Not multiline mode: assert before a terminating newline or before end of
2123: subject unless noteol is set. */
2124:
2125: case OP_DOLL:
2126: if (md->noteol) RRETURN(MATCH_NOMATCH);
2127: if (!md->endonly) goto ASSERT_NL_OR_EOS;
2128:
2129: /* ... else fall through for endonly */
2130:
2131: /* End of subject assertion (\z) */
2132:
2133: case OP_EOD:
2134: if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2135: SCHECK_PARTIAL();
2136: ecode++;
2137: break;
2138:
2139: /* End of subject or ending \n assertion (\Z) */
2140:
2141: case OP_EODN:
2142: ASSERT_NL_OR_EOS:
2143: if (eptr < md->end_subject &&
2144: (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1.1.1.3 misho 2145: {
2146: if (md->partial != 0 &&
2147: eptr + 1 >= md->end_subject &&
2148: NLBLOCK->nltype == NLTYPE_FIXED &&
2149: NLBLOCK->nllen == 2 &&
1.1.1.4 misho 2150: RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
1.1.1.3 misho 2151: {
2152: md->hitend = TRUE;
2153: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2154: }
1.1 misho 2155: RRETURN(MATCH_NOMATCH);
1.1.1.3 misho 2156: }
1.1 misho 2157:
2158: /* Either at end of string or \n before end. */
2159:
2160: SCHECK_PARTIAL();
2161: ecode++;
2162: break;
2163:
2164: /* Word boundary assertions */
2165:
2166: case OP_NOT_WORD_BOUNDARY:
2167: case OP_WORD_BOUNDARY:
2168: {
2169:
2170: /* Find out if the previous and current characters are "word" characters.
2171: It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2172: be "non-word" characters. Remember the earliest consulted character for
2173: partial matching. */
2174:
1.1.1.2 misho 2175: #ifdef SUPPORT_UTF
2176: if (utf)
1.1 misho 2177: {
2178: /* Get status of previous character */
2179:
2180: if (eptr == md->start_subject) prev_is_word = FALSE; else
2181: {
1.1.1.2 misho 2182: PCRE_PUCHAR lastptr = eptr - 1;
2183: BACKCHAR(lastptr);
1.1 misho 2184: if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2185: GETCHAR(c, lastptr);
2186: #ifdef SUPPORT_UCP
2187: if (md->use_ucp)
2188: {
2189: if (c == '_') prev_is_word = TRUE; else
2190: {
2191: int cat = UCD_CATEGORY(c);
2192: prev_is_word = (cat == ucp_L || cat == ucp_N);
2193: }
2194: }
2195: else
2196: #endif
2197: prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2198: }
2199:
2200: /* Get status of next character */
2201:
2202: if (eptr >= md->end_subject)
2203: {
2204: SCHECK_PARTIAL();
2205: cur_is_word = FALSE;
2206: }
2207: else
2208: {
2209: GETCHAR(c, eptr);
2210: #ifdef SUPPORT_UCP
2211: if (md->use_ucp)
2212: {
2213: if (c == '_') cur_is_word = TRUE; else
2214: {
2215: int cat = UCD_CATEGORY(c);
2216: cur_is_word = (cat == ucp_L || cat == ucp_N);
2217: }
2218: }
2219: else
2220: #endif
2221: cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2222: }
2223: }
2224: else
2225: #endif
2226:
2227: /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2228: consistency with the behaviour of \w we do use it in this case. */
2229:
2230: {
2231: /* Get status of previous character */
2232:
2233: if (eptr == md->start_subject) prev_is_word = FALSE; else
2234: {
2235: if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2236: #ifdef SUPPORT_UCP
2237: if (md->use_ucp)
2238: {
2239: c = eptr[-1];
2240: if (c == '_') prev_is_word = TRUE; else
2241: {
2242: int cat = UCD_CATEGORY(c);
2243: prev_is_word = (cat == ucp_L || cat == ucp_N);
2244: }
2245: }
2246: else
2247: #endif
1.1.1.2 misho 2248: prev_is_word = MAX_255(eptr[-1])
2249: && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1.1 misho 2250: }
2251:
2252: /* Get status of next character */
2253:
2254: if (eptr >= md->end_subject)
2255: {
2256: SCHECK_PARTIAL();
2257: cur_is_word = FALSE;
2258: }
2259: else
2260: #ifdef SUPPORT_UCP
2261: if (md->use_ucp)
2262: {
2263: c = *eptr;
2264: if (c == '_') cur_is_word = TRUE; else
2265: {
2266: int cat = UCD_CATEGORY(c);
2267: cur_is_word = (cat == ucp_L || cat == ucp_N);
2268: }
2269: }
2270: else
2271: #endif
1.1.1.2 misho 2272: cur_is_word = MAX_255(*eptr)
2273: && ((md->ctypes[*eptr] & ctype_word) != 0);
1.1 misho 2274: }
2275:
2276: /* Now see if the situation is what we want */
2277:
2278: if ((*ecode++ == OP_WORD_BOUNDARY)?
2279: cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2280: RRETURN(MATCH_NOMATCH);
2281: }
2282: break;
2283:
1.1.1.3 misho 2284: /* Match any single character type except newline; have to take care with
2285: CRLF newlines and partial matching. */
1.1 misho 2286:
2287: case OP_ANY:
2288: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1.1.1.3 misho 2289: if (md->partial != 0 &&
2290: eptr + 1 >= md->end_subject &&
2291: NLBLOCK->nltype == NLTYPE_FIXED &&
2292: NLBLOCK->nllen == 2 &&
1.1.1.4 misho 2293: RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
1.1.1.3 misho 2294: {
2295: md->hitend = TRUE;
2296: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2297: }
2298:
1.1 misho 2299: /* Fall through */
2300:
1.1.1.3 misho 2301: /* Match any single character whatsoever. */
2302:
1.1 misho 2303: case OP_ALLANY:
2304: if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2305: { /* not be updated before SCHECK_PARTIAL. */
2306: SCHECK_PARTIAL();
2307: RRETURN(MATCH_NOMATCH);
2308: }
2309: eptr++;
1.1.1.2 misho 2310: #ifdef SUPPORT_UTF
2311: if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2312: #endif
1.1 misho 2313: ecode++;
2314: break;
2315:
2316: /* Match a single byte, even in UTF-8 mode. This opcode really does match
2317: any byte, even newline, independent of the setting of PCRE_DOTALL. */
2318:
2319: case OP_ANYBYTE:
2320: if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2321: { /* not be updated before SCHECK_PARTIAL. */
2322: SCHECK_PARTIAL();
2323: RRETURN(MATCH_NOMATCH);
2324: }
2325: eptr++;
2326: ecode++;
2327: break;
2328:
2329: case OP_NOT_DIGIT:
2330: if (eptr >= md->end_subject)
2331: {
2332: SCHECK_PARTIAL();
2333: RRETURN(MATCH_NOMATCH);
2334: }
2335: GETCHARINCTEST(c, eptr);
2336: if (
1.1.1.2 misho 2337: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
1.1 misho 2338: c < 256 &&
2339: #endif
2340: (md->ctypes[c] & ctype_digit) != 0
2341: )
2342: RRETURN(MATCH_NOMATCH);
2343: ecode++;
2344: break;
2345:
2346: case OP_DIGIT:
2347: if (eptr >= md->end_subject)
2348: {
2349: SCHECK_PARTIAL();
2350: RRETURN(MATCH_NOMATCH);
2351: }
2352: GETCHARINCTEST(c, eptr);
2353: if (
1.1.1.2 misho 2354: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2355: c > 255 ||
1.1 misho 2356: #endif
2357: (md->ctypes[c] & ctype_digit) == 0
2358: )
2359: RRETURN(MATCH_NOMATCH);
2360: ecode++;
2361: break;
2362:
2363: case OP_NOT_WHITESPACE:
2364: if (eptr >= md->end_subject)
2365: {
2366: SCHECK_PARTIAL();
2367: RRETURN(MATCH_NOMATCH);
2368: }
2369: GETCHARINCTEST(c, eptr);
2370: if (
1.1.1.2 misho 2371: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
1.1 misho 2372: c < 256 &&
2373: #endif
2374: (md->ctypes[c] & ctype_space) != 0
2375: )
2376: RRETURN(MATCH_NOMATCH);
2377: ecode++;
2378: break;
2379:
2380: case OP_WHITESPACE:
2381: if (eptr >= md->end_subject)
2382: {
2383: SCHECK_PARTIAL();
2384: RRETURN(MATCH_NOMATCH);
2385: }
2386: GETCHARINCTEST(c, eptr);
2387: if (
1.1.1.2 misho 2388: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2389: c > 255 ||
1.1 misho 2390: #endif
2391: (md->ctypes[c] & ctype_space) == 0
2392: )
2393: RRETURN(MATCH_NOMATCH);
2394: ecode++;
2395: break;
2396:
2397: case OP_NOT_WORDCHAR:
2398: if (eptr >= md->end_subject)
2399: {
2400: SCHECK_PARTIAL();
2401: RRETURN(MATCH_NOMATCH);
2402: }
2403: GETCHARINCTEST(c, eptr);
2404: if (
1.1.1.2 misho 2405: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
1.1 misho 2406: c < 256 &&
2407: #endif
2408: (md->ctypes[c] & ctype_word) != 0
2409: )
2410: RRETURN(MATCH_NOMATCH);
2411: ecode++;
2412: break;
2413:
2414: case OP_WORDCHAR:
2415: if (eptr >= md->end_subject)
2416: {
2417: SCHECK_PARTIAL();
2418: RRETURN(MATCH_NOMATCH);
2419: }
2420: GETCHARINCTEST(c, eptr);
2421: if (
1.1.1.2 misho 2422: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2423: c > 255 ||
1.1 misho 2424: #endif
2425: (md->ctypes[c] & ctype_word) == 0
2426: )
2427: RRETURN(MATCH_NOMATCH);
2428: ecode++;
2429: break;
2430:
2431: case OP_ANYNL:
2432: if (eptr >= md->end_subject)
2433: {
2434: SCHECK_PARTIAL();
2435: RRETURN(MATCH_NOMATCH);
2436: }
2437: GETCHARINCTEST(c, eptr);
2438: switch(c)
2439: {
2440: default: RRETURN(MATCH_NOMATCH);
2441:
1.1.1.4 misho 2442: case CHAR_CR:
1.1.1.3 misho 2443: if (eptr >= md->end_subject)
2444: {
2445: SCHECK_PARTIAL();
2446: }
1.1.1.4 misho 2447: else if (RAWUCHARTEST(eptr) == CHAR_LF) eptr++;
1.1 misho 2448: break;
2449:
1.1.1.4 misho 2450: case CHAR_LF:
1.1 misho 2451: break;
2452:
1.1.1.4 misho 2453: case CHAR_VT:
2454: case CHAR_FF:
2455: case CHAR_NEL:
2456: #ifndef EBCDIC
1.1 misho 2457: case 0x2028:
2458: case 0x2029:
1.1.1.4 misho 2459: #endif /* Not EBCDIC */
1.1 misho 2460: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2461: break;
2462: }
2463: ecode++;
2464: break;
2465:
2466: case OP_NOT_HSPACE:
2467: if (eptr >= md->end_subject)
2468: {
2469: SCHECK_PARTIAL();
2470: RRETURN(MATCH_NOMATCH);
2471: }
2472: GETCHARINCTEST(c, eptr);
2473: switch(c)
2474: {
1.1.1.4 misho 2475: HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
1.1 misho 2476: default: break;
2477: }
2478: ecode++;
2479: break;
2480:
2481: case OP_HSPACE:
2482: if (eptr >= md->end_subject)
2483: {
2484: SCHECK_PARTIAL();
2485: RRETURN(MATCH_NOMATCH);
2486: }
2487: GETCHARINCTEST(c, eptr);
2488: switch(c)
2489: {
1.1.1.4 misho 2490: HSPACE_CASES: break; /* Byte and multibyte cases */
1.1 misho 2491: default: RRETURN(MATCH_NOMATCH);
2492: }
2493: ecode++;
2494: break;
2495:
2496: case OP_NOT_VSPACE:
2497: if (eptr >= md->end_subject)
2498: {
2499: SCHECK_PARTIAL();
2500: RRETURN(MATCH_NOMATCH);
2501: }
2502: GETCHARINCTEST(c, eptr);
2503: switch(c)
2504: {
1.1.1.4 misho 2505: VSPACE_CASES: RRETURN(MATCH_NOMATCH);
1.1 misho 2506: default: break;
2507: }
2508: ecode++;
2509: break;
2510:
2511: case OP_VSPACE:
2512: if (eptr >= md->end_subject)
2513: {
2514: SCHECK_PARTIAL();
2515: RRETURN(MATCH_NOMATCH);
2516: }
2517: GETCHARINCTEST(c, eptr);
2518: switch(c)
2519: {
1.1.1.4 misho 2520: VSPACE_CASES: break;
1.1 misho 2521: default: RRETURN(MATCH_NOMATCH);
2522: }
2523: ecode++;
2524: break;
2525:
2526: #ifdef SUPPORT_UCP
2527: /* Check the next character by Unicode property. We will get here only
2528: if the support is in the binary; otherwise a compile-time error occurs. */
2529:
2530: case OP_PROP:
2531: case OP_NOTPROP:
2532: if (eptr >= md->end_subject)
2533: {
2534: SCHECK_PARTIAL();
2535: RRETURN(MATCH_NOMATCH);
2536: }
2537: GETCHARINCTEST(c, eptr);
2538: {
1.1.1.4 misho 2539: const pcre_uint32 *cp;
1.1 misho 2540: const ucd_record *prop = GET_UCD(c);
2541:
2542: switch(ecode[1])
2543: {
2544: case PT_ANY:
2545: if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2546: break;
2547:
2548: case PT_LAMP:
2549: if ((prop->chartype == ucp_Lu ||
2550: prop->chartype == ucp_Ll ||
2551: prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2552: RRETURN(MATCH_NOMATCH);
2553: break;
2554:
2555: case PT_GC:
1.1.1.2 misho 2556: if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
1.1 misho 2557: RRETURN(MATCH_NOMATCH);
2558: break;
2559:
2560: case PT_PC:
2561: if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2562: RRETURN(MATCH_NOMATCH);
2563: break;
2564:
2565: case PT_SC:
2566: if ((ecode[2] != prop->script) == (op == OP_PROP))
2567: RRETURN(MATCH_NOMATCH);
2568: break;
2569:
2570: /* These are specials */
2571:
2572: case PT_ALNUM:
1.1.1.2 misho 2573: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2574: PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
1.1 misho 2575: RRETURN(MATCH_NOMATCH);
2576: break;
2577:
1.1.1.5 ! misho 2578: /* Perl space used to exclude VT, but from Perl 5.18 it is included,
! 2579: which means that Perl space and POSIX space are now identical. PCRE
! 2580: was changed at release 8.34. */
1.1 misho 2581:
1.1.1.5 ! misho 2582: case PT_SPACE: /* Perl space */
1.1 misho 2583: case PT_PXSPACE: /* POSIX space */
1.1.1.5 ! misho 2584: switch(c)
! 2585: {
! 2586: HSPACE_CASES:
! 2587: VSPACE_CASES:
! 2588: if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
! 2589: break;
! 2590:
! 2591: default:
! 2592: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
! 2593: (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
! 2594: break;
! 2595: }
1.1 misho 2596: break;
2597:
2598: case PT_WORD:
1.1.1.2 misho 2599: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2600: PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1.1 misho 2601: c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2602: RRETURN(MATCH_NOMATCH);
2603: break;
2604:
1.1.1.4 misho 2605: case PT_CLIST:
2606: cp = PRIV(ucd_caseless_sets) + ecode[2];
2607: for (;;)
2608: {
2609: if (c < *cp)
2610: { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2611: if (c == *cp++)
2612: { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2613: }
2614: break;
2615:
2616: case PT_UCNC:
2617: if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2618: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2619: c >= 0xe000) == (op == OP_NOTPROP))
2620: RRETURN(MATCH_NOMATCH);
2621: break;
2622:
1.1 misho 2623: /* This should never occur */
2624:
2625: default:
2626: RRETURN(PCRE_ERROR_INTERNAL);
2627: }
2628:
2629: ecode += 3;
2630: }
2631: break;
2632:
2633: /* Match an extended Unicode sequence. We will get here only if the support
2634: is in the binary; otherwise a compile-time error occurs. */
2635:
2636: case OP_EXTUNI:
2637: if (eptr >= md->end_subject)
2638: {
2639: SCHECK_PARTIAL();
2640: RRETURN(MATCH_NOMATCH);
2641: }
1.1.1.4 misho 2642: else
1.1 misho 2643: {
1.1.1.4 misho 2644: int lgb, rgb;
2645: GETCHARINCTEST(c, eptr);
2646: lgb = UCD_GRAPHBREAK(c);
2647: while (eptr < md->end_subject)
2648: {
2649: int len = 1;
2650: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2651: rgb = UCD_GRAPHBREAK(c);
2652: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2653: lgb = rgb;
2654: eptr += len;
2655: }
1.1 misho 2656: }
1.1.1.3 misho 2657: CHECK_PARTIAL();
1.1 misho 2658: ecode++;
2659: break;
1.1.1.4 misho 2660: #endif /* SUPPORT_UCP */
1.1 misho 2661:
2662:
2663: /* Match a back reference, possibly repeatedly. Look past the end of the
2664: item to see if there is repeat information following. The code is similar
2665: to that for character classes, but repeated for efficiency. Then obey
2666: similar code to character type repeats - written out again for speed.
2667: However, if the referenced string is the empty string, always treat
2668: it as matched, any number of times (otherwise there could be infinite
1.1.1.5 ! misho 2669: loops). If the reference is unset, there are two possibilities:
1.1 misho 2670:
2671: (a) In the default, Perl-compatible state, set the length negative;
2672: this ensures that every attempt at a match fails. We can't just fail
2673: here, because of the possibility of quantifiers with zero minima.
2674:
2675: (b) If the JavaScript compatibility flag is set, set the length to zero
2676: so that the back reference matches an empty string.
2677:
2678: Otherwise, set the length to the length of what was matched by the
1.1.1.5 ! misho 2679: referenced subpattern.
! 2680:
! 2681: The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
! 2682: or to a non-duplicated named group. For a duplicated named group, OP_DNREF
! 2683: and OP_DNREFI are used. In this case we must scan the list of groups to
! 2684: which the name refers, and use the first one that is set. */
! 2685:
! 2686: case OP_DNREF:
! 2687: case OP_DNREFI:
! 2688: caseless = op == OP_DNREFI;
! 2689: {
! 2690: int count = GET2(ecode, 1+IMM2_SIZE);
! 2691: pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
! 2692: ecode += 1 + 2*IMM2_SIZE;
! 2693:
! 2694: while (count-- > 0)
! 2695: {
! 2696: offset = GET2(slot, 0) << 1;
! 2697: if (offset < offset_top && md->offset_vector[offset] >= 0) break;
! 2698: slot += md->name_entry_size;
! 2699: }
! 2700: if (count < 0)
! 2701: length = (md->jscript_compat)? 0 : -1;
! 2702: else
! 2703: length = md->offset_vector[offset+1] - md->offset_vector[offset];
! 2704: }
! 2705: goto REF_REPEAT;
1.1 misho 2706:
1.1.1.5 ! misho 2707: case OP_REF:
! 2708: case OP_REFI:
! 2709: caseless = op == OP_REFI;
! 2710: offset = GET2(ecode, 1) << 1; /* Doubled ref number */
! 2711: ecode += 1 + IMM2_SIZE;
1.1 misho 2712: if (offset >= offset_top || md->offset_vector[offset] < 0)
2713: length = (md->jscript_compat)? 0 : -1;
2714: else
2715: length = md->offset_vector[offset+1] - md->offset_vector[offset];
2716:
2717: /* Set up for repetition, or handle the non-repeated case */
2718:
1.1.1.5 ! misho 2719: REF_REPEAT:
1.1 misho 2720: switch (*ecode)
2721: {
2722: case OP_CRSTAR:
2723: case OP_CRMINSTAR:
2724: case OP_CRPLUS:
2725: case OP_CRMINPLUS:
2726: case OP_CRQUERY:
2727: case OP_CRMINQUERY:
2728: c = *ecode++ - OP_CRSTAR;
2729: minimize = (c & 1) != 0;
2730: min = rep_min[c]; /* Pick up values from tables; */
2731: max = rep_max[c]; /* zero for max => infinity */
2732: if (max == 0) max = INT_MAX;
2733: break;
2734:
2735: case OP_CRRANGE:
2736: case OP_CRMINRANGE:
2737: minimize = (*ecode == OP_CRMINRANGE);
2738: min = GET2(ecode, 1);
1.1.1.2 misho 2739: max = GET2(ecode, 1 + IMM2_SIZE);
1.1 misho 2740: if (max == 0) max = INT_MAX;
1.1.1.2 misho 2741: ecode += 1 + 2 * IMM2_SIZE;
1.1 misho 2742: break;
2743:
2744: default: /* No repeat follows */
2745: if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2746: {
1.1.1.3 misho 2747: if (length == -2) eptr = md->end_subject; /* Partial match */
1.1 misho 2748: CHECK_PARTIAL();
2749: RRETURN(MATCH_NOMATCH);
2750: }
2751: eptr += length;
2752: continue; /* With the main loop */
2753: }
2754:
2755: /* Handle repeated back references. If the length of the reference is
1.1.1.2 misho 2756: zero, just continue with the main loop. If the length is negative, it
2757: means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2758: zero, we can continue at the same level without recursion. For any other
2759: minimum, carrying on will result in NOMATCH. */
1.1 misho 2760:
2761: if (length == 0) continue;
1.1.1.2 misho 2762: if (length < 0 && min == 0) continue;
1.1 misho 2763:
2764: /* First, ensure the minimum number of matches are present. We get back
2765: the length of the reference string explicitly rather than passing the
2766: address of eptr, so that eptr can be a register variable. */
2767:
2768: for (i = 1; i <= min; i++)
2769: {
2770: int slength;
2771: if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2772: {
1.1.1.3 misho 2773: if (slength == -2) eptr = md->end_subject; /* Partial match */
1.1 misho 2774: CHECK_PARTIAL();
2775: RRETURN(MATCH_NOMATCH);
2776: }
2777: eptr += slength;
2778: }
2779:
2780: /* If min = max, continue at the same level without recursion.
2781: They are not both allowed to be zero. */
2782:
2783: if (min == max) continue;
2784:
2785: /* If minimizing, keep trying and advancing the pointer */
2786:
2787: if (minimize)
2788: {
2789: for (fi = min;; fi++)
2790: {
2791: int slength;
2792: RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2793: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2794: if (fi >= max) RRETURN(MATCH_NOMATCH);
2795: if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2796: {
1.1.1.3 misho 2797: if (slength == -2) eptr = md->end_subject; /* Partial match */
1.1 misho 2798: CHECK_PARTIAL();
2799: RRETURN(MATCH_NOMATCH);
2800: }
2801: eptr += slength;
2802: }
2803: /* Control never gets here */
2804: }
2805:
2806: /* If maximizing, find the longest string and work backwards */
2807:
2808: else
2809: {
2810: pp = eptr;
2811: for (i = min; i < max; i++)
2812: {
2813: int slength;
2814: if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2815: {
1.1.1.3 misho 2816: /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2817: the soft partial matching case. */
2818:
2819: if (slength == -2 && md->partial != 0 &&
2820: md->end_subject > md->start_used_ptr)
2821: {
2822: md->hitend = TRUE;
2823: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2824: }
1.1 misho 2825: break;
2826: }
2827: eptr += slength;
2828: }
1.1.1.3 misho 2829:
1.1 misho 2830: while (eptr >= pp)
2831: {
2832: RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2833: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2834: eptr -= length;
2835: }
2836: RRETURN(MATCH_NOMATCH);
2837: }
2838: /* Control never gets here */
2839:
2840: /* Match a bit-mapped character class, possibly repeatedly. This op code is
2841: used when all the characters in the class have values in the range 0-255,
2842: and either the matching is caseful, or the characters are in the range
2843: 0-127 when UTF-8 processing is enabled. The only difference between
2844: OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2845: encountered.
2846:
2847: First, look past the end of the item to see if there is repeat information
2848: following. Then obey similar code to character type repeats - written out
2849: again for speed. */
2850:
2851: case OP_NCLASS:
2852: case OP_CLASS:
2853: {
1.1.1.2 misho 2854: /* The data variable is saved across frames, so the byte map needs to
2855: be stored there. */
2856: #define BYTE_MAP ((pcre_uint8 *)data)
1.1 misho 2857: data = ecode + 1; /* Save for matching */
1.1.1.2 misho 2858: ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
1.1 misho 2859:
2860: switch (*ecode)
2861: {
2862: case OP_CRSTAR:
2863: case OP_CRMINSTAR:
2864: case OP_CRPLUS:
2865: case OP_CRMINPLUS:
2866: case OP_CRQUERY:
2867: case OP_CRMINQUERY:
1.1.1.5 ! misho 2868: case OP_CRPOSSTAR:
! 2869: case OP_CRPOSPLUS:
! 2870: case OP_CRPOSQUERY:
1.1 misho 2871: c = *ecode++ - OP_CRSTAR;
1.1.1.5 ! misho 2872: if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
! 2873: else possessive = TRUE;
1.1 misho 2874: min = rep_min[c]; /* Pick up values from tables; */
2875: max = rep_max[c]; /* zero for max => infinity */
2876: if (max == 0) max = INT_MAX;
2877: break;
2878:
2879: case OP_CRRANGE:
2880: case OP_CRMINRANGE:
1.1.1.5 ! misho 2881: case OP_CRPOSRANGE:
1.1 misho 2882: minimize = (*ecode == OP_CRMINRANGE);
1.1.1.5 ! misho 2883: possessive = (*ecode == OP_CRPOSRANGE);
1.1 misho 2884: min = GET2(ecode, 1);
1.1.1.2 misho 2885: max = GET2(ecode, 1 + IMM2_SIZE);
1.1 misho 2886: if (max == 0) max = INT_MAX;
1.1.1.2 misho 2887: ecode += 1 + 2 * IMM2_SIZE;
1.1 misho 2888: break;
2889:
2890: default: /* No repeat follows */
2891: min = max = 1;
2892: break;
2893: }
2894:
2895: /* First, ensure the minimum number of matches are present. */
2896:
1.1.1.2 misho 2897: #ifdef SUPPORT_UTF
2898: if (utf)
1.1 misho 2899: {
2900: for (i = 1; i <= min; i++)
2901: {
2902: if (eptr >= md->end_subject)
2903: {
2904: SCHECK_PARTIAL();
2905: RRETURN(MATCH_NOMATCH);
2906: }
2907: GETCHARINC(c, eptr);
2908: if (c > 255)
2909: {
2910: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2911: }
2912: else
1.1.1.2 misho 2913: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1.1 misho 2914: }
2915: }
2916: else
2917: #endif
1.1.1.2 misho 2918: /* Not UTF mode */
1.1 misho 2919: {
2920: for (i = 1; i <= min; i++)
2921: {
2922: if (eptr >= md->end_subject)
2923: {
2924: SCHECK_PARTIAL();
2925: RRETURN(MATCH_NOMATCH);
2926: }
2927: c = *eptr++;
1.1.1.2 misho 2928: #ifndef COMPILE_PCRE8
2929: if (c > 255)
2930: {
2931: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2932: }
2933: else
2934: #endif
2935: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1.1 misho 2936: }
2937: }
2938:
2939: /* If max == min we can continue with the main loop without the
2940: need to recurse. */
2941:
2942: if (min == max) continue;
2943:
2944: /* If minimizing, keep testing the rest of the expression and advancing
2945: the pointer while it matches the class. */
2946:
2947: if (minimize)
2948: {
1.1.1.2 misho 2949: #ifdef SUPPORT_UTF
2950: if (utf)
1.1 misho 2951: {
2952: for (fi = min;; fi++)
2953: {
2954: RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2955: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2956: if (fi >= max) RRETURN(MATCH_NOMATCH);
2957: if (eptr >= md->end_subject)
2958: {
2959: SCHECK_PARTIAL();
2960: RRETURN(MATCH_NOMATCH);
2961: }
2962: GETCHARINC(c, eptr);
2963: if (c > 255)
2964: {
2965: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2966: }
2967: else
1.1.1.2 misho 2968: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1.1 misho 2969: }
2970: }
2971: else
2972: #endif
1.1.1.2 misho 2973: /* Not UTF mode */
1.1 misho 2974: {
2975: for (fi = min;; fi++)
2976: {
2977: RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2978: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2979: if (fi >= max) RRETURN(MATCH_NOMATCH);
2980: if (eptr >= md->end_subject)
2981: {
2982: SCHECK_PARTIAL();
2983: RRETURN(MATCH_NOMATCH);
2984: }
2985: c = *eptr++;
1.1.1.2 misho 2986: #ifndef COMPILE_PCRE8
2987: if (c > 255)
2988: {
2989: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2990: }
2991: else
2992: #endif
2993: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1.1 misho 2994: }
2995: }
2996: /* Control never gets here */
2997: }
2998:
2999: /* If maximizing, find the longest possible run, then work backwards. */
3000:
3001: else
3002: {
3003: pp = eptr;
3004:
1.1.1.2 misho 3005: #ifdef SUPPORT_UTF
3006: if (utf)
1.1 misho 3007: {
3008: for (i = min; i < max; i++)
3009: {
3010: int len = 1;
3011: if (eptr >= md->end_subject)
3012: {
3013: SCHECK_PARTIAL();
3014: break;
3015: }
3016: GETCHARLEN(c, eptr, len);
3017: if (c > 255)
3018: {
3019: if (op == OP_CLASS) break;
3020: }
3021: else
1.1.1.2 misho 3022: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
1.1 misho 3023: eptr += len;
3024: }
1.1.1.5 ! misho 3025:
! 3026: if (possessive) continue; /* No backtracking */
! 3027:
1.1 misho 3028: for (;;)
3029: {
3030: RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3031: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3032: if (eptr-- == pp) break; /* Stop if tried at original pos */
3033: BACKCHAR(eptr);
3034: }
3035: }
3036: else
3037: #endif
1.1.1.2 misho 3038: /* Not UTF mode */
1.1 misho 3039: {
3040: for (i = min; i < max; i++)
3041: {
3042: if (eptr >= md->end_subject)
3043: {
3044: SCHECK_PARTIAL();
3045: break;
3046: }
3047: c = *eptr;
1.1.1.2 misho 3048: #ifndef COMPILE_PCRE8
3049: if (c > 255)
3050: {
3051: if (op == OP_CLASS) break;
3052: }
3053: else
3054: #endif
3055: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
1.1 misho 3056: eptr++;
3057: }
1.1.1.5 ! misho 3058:
! 3059: if (possessive) continue; /* No backtracking */
! 3060:
1.1 misho 3061: while (eptr >= pp)
3062: {
3063: RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3064: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3065: eptr--;
3066: }
3067: }
3068:
3069: RRETURN(MATCH_NOMATCH);
3070: }
1.1.1.2 misho 3071: #undef BYTE_MAP
1.1 misho 3072: }
3073: /* Control never gets here */
3074:
3075:
1.1.1.5 ! misho 3076: /* Match an extended character class. In the 8-bit library, this opcode is
! 3077: encountered only when UTF-8 mode is supported. In the 16-bit and
! 3078: 32-bit libraries, codepoints greater than 255 may be encountered even when
! 3079: UTF is not supported. */
1.1 misho 3080:
1.1.1.2 misho 3081: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1.1 misho 3082: case OP_XCLASS:
3083: {
3084: data = ecode + 1 + LINK_SIZE; /* Save for matching */
3085: ecode += GET(ecode, 1); /* Advance past the item */
3086:
3087: switch (*ecode)
3088: {
3089: case OP_CRSTAR:
3090: case OP_CRMINSTAR:
3091: case OP_CRPLUS:
3092: case OP_CRMINPLUS:
3093: case OP_CRQUERY:
3094: case OP_CRMINQUERY:
1.1.1.5 ! misho 3095: case OP_CRPOSSTAR:
! 3096: case OP_CRPOSPLUS:
! 3097: case OP_CRPOSQUERY:
1.1 misho 3098: c = *ecode++ - OP_CRSTAR;
1.1.1.5 ! misho 3099: if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
! 3100: else possessive = TRUE;
1.1 misho 3101: min = rep_min[c]; /* Pick up values from tables; */
3102: max = rep_max[c]; /* zero for max => infinity */
3103: if (max == 0) max = INT_MAX;
3104: break;
3105:
3106: case OP_CRRANGE:
3107: case OP_CRMINRANGE:
1.1.1.5 ! misho 3108: case OP_CRPOSRANGE:
1.1 misho 3109: minimize = (*ecode == OP_CRMINRANGE);
1.1.1.5 ! misho 3110: possessive = (*ecode == OP_CRPOSRANGE);
1.1 misho 3111: min = GET2(ecode, 1);
1.1.1.2 misho 3112: max = GET2(ecode, 1 + IMM2_SIZE);
1.1 misho 3113: if (max == 0) max = INT_MAX;
1.1.1.2 misho 3114: ecode += 1 + 2 * IMM2_SIZE;
1.1 misho 3115: break;
3116:
3117: default: /* No repeat follows */
3118: min = max = 1;
3119: break;
3120: }
3121:
3122: /* First, ensure the minimum number of matches are present. */
3123:
3124: for (i = 1; i <= min; i++)
3125: {
3126: if (eptr >= md->end_subject)
3127: {
3128: SCHECK_PARTIAL();
3129: RRETURN(MATCH_NOMATCH);
3130: }
3131: GETCHARINCTEST(c, eptr);
1.1.1.2 misho 3132: if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
1.1 misho 3133: }
3134:
3135: /* If max == min we can continue with the main loop without the
3136: need to recurse. */
3137:
3138: if (min == max) continue;
3139:
3140: /* If minimizing, keep testing the rest of the expression and advancing
3141: the pointer while it matches the class. */
3142:
3143: if (minimize)
3144: {
3145: for (fi = min;; fi++)
3146: {
3147: RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3148: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3149: if (fi >= max) RRETURN(MATCH_NOMATCH);
3150: if (eptr >= md->end_subject)
3151: {
3152: SCHECK_PARTIAL();
3153: RRETURN(MATCH_NOMATCH);
3154: }
3155: GETCHARINCTEST(c, eptr);
1.1.1.2 misho 3156: if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
1.1 misho 3157: }
3158: /* Control never gets here */
3159: }
3160:
3161: /* If maximizing, find the longest possible run, then work backwards. */
3162:
3163: else
3164: {
3165: pp = eptr;
3166: for (i = min; i < max; i++)
3167: {
3168: int len = 1;
3169: if (eptr >= md->end_subject)
3170: {
3171: SCHECK_PARTIAL();
3172: break;
3173: }
1.1.1.2 misho 3174: #ifdef SUPPORT_UTF
1.1 misho 3175: GETCHARLENTEST(c, eptr, len);
1.1.1.2 misho 3176: #else
3177: c = *eptr;
3178: #endif
3179: if (!PRIV(xclass)(c, data, utf)) break;
1.1 misho 3180: eptr += len;
3181: }
1.1.1.5 ! misho 3182:
! 3183: if (possessive) continue; /* No backtracking */
! 3184:
1.1 misho 3185: for(;;)
3186: {
3187: RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3188: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3189: if (eptr-- == pp) break; /* Stop if tried at original pos */
1.1.1.2 misho 3190: #ifdef SUPPORT_UTF
3191: if (utf) BACKCHAR(eptr);
3192: #endif
1.1 misho 3193: }
3194: RRETURN(MATCH_NOMATCH);
3195: }
3196:
3197: /* Control never gets here */
3198: }
3199: #endif /* End of XCLASS */
3200:
3201: /* Match a single character, casefully */
3202:
3203: case OP_CHAR:
1.1.1.2 misho 3204: #ifdef SUPPORT_UTF
3205: if (utf)
1.1 misho 3206: {
3207: length = 1;
3208: ecode++;
3209: GETCHARLEN(fc, ecode, length);
3210: if (length > md->end_subject - eptr)
3211: {
3212: CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3213: RRETURN(MATCH_NOMATCH);
3214: }
1.1.1.4 misho 3215: while (length-- > 0) if (*ecode++ != RAWUCHARINC(eptr)) RRETURN(MATCH_NOMATCH);
1.1 misho 3216: }
3217: else
3218: #endif
1.1.1.2 misho 3219: /* Not UTF mode */
1.1 misho 3220: {
3221: if (md->end_subject - eptr < 1)
3222: {
3223: SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3224: RRETURN(MATCH_NOMATCH);
3225: }
3226: if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3227: ecode += 2;
3228: }
3229: break;
3230:
3231: /* Match a single character, caselessly. If we are at the end of the
3232: subject, give up immediately. */
3233:
3234: case OP_CHARI:
3235: if (eptr >= md->end_subject)
3236: {
3237: SCHECK_PARTIAL();
3238: RRETURN(MATCH_NOMATCH);
3239: }
3240:
1.1.1.2 misho 3241: #ifdef SUPPORT_UTF
3242: if (utf)
1.1 misho 3243: {
3244: length = 1;
3245: ecode++;
3246: GETCHARLEN(fc, ecode, length);
3247:
3248: /* If the pattern character's value is < 128, we have only one byte, and
3249: we know that its other case must also be one byte long, so we can use the
3250: fast lookup table. We know that there is at least one byte left in the
3251: subject. */
3252:
3253: if (fc < 128)
3254: {
1.1.1.4 misho 3255: pcre_uint32 cc = RAWUCHAR(eptr);
3256: if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 3257: ecode++;
3258: eptr++;
1.1 misho 3259: }
3260:
3261: /* Otherwise we must pick up the subject character. Note that we cannot
3262: use the value of "length" to check for sufficient bytes left, because the
3263: other case of the character may have more or fewer bytes. */
3264:
3265: else
3266: {
1.1.1.4 misho 3267: pcre_uint32 dc;
1.1 misho 3268: GETCHARINC(dc, eptr);
3269: ecode += length;
3270:
3271: /* If we have Unicode property support, we can use it to test the other
3272: case of the character, if there is one. */
3273:
3274: if (fc != dc)
3275: {
3276: #ifdef SUPPORT_UCP
3277: if (dc != UCD_OTHERCASE(fc))
3278: #endif
3279: RRETURN(MATCH_NOMATCH);
3280: }
3281: }
3282: }
3283: else
1.1.1.2 misho 3284: #endif /* SUPPORT_UTF */
1.1 misho 3285:
1.1.1.2 misho 3286: /* Not UTF mode */
1.1 misho 3287: {
1.1.1.2 misho 3288: if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3289: != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3290: eptr++;
1.1 misho 3291: ecode += 2;
3292: }
3293: break;
3294:
3295: /* Match a single character repeatedly. */
3296:
3297: case OP_EXACT:
3298: case OP_EXACTI:
3299: min = max = GET2(ecode, 1);
1.1.1.2 misho 3300: ecode += 1 + IMM2_SIZE;
1.1 misho 3301: goto REPEATCHAR;
3302:
3303: case OP_POSUPTO:
3304: case OP_POSUPTOI:
3305: possessive = TRUE;
3306: /* Fall through */
3307:
3308: case OP_UPTO:
3309: case OP_UPTOI:
3310: case OP_MINUPTO:
3311: case OP_MINUPTOI:
3312: min = 0;
3313: max = GET2(ecode, 1);
3314: minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
1.1.1.2 misho 3315: ecode += 1 + IMM2_SIZE;
1.1 misho 3316: goto REPEATCHAR;
3317:
3318: case OP_POSSTAR:
3319: case OP_POSSTARI:
3320: possessive = TRUE;
3321: min = 0;
3322: max = INT_MAX;
3323: ecode++;
3324: goto REPEATCHAR;
3325:
3326: case OP_POSPLUS:
3327: case OP_POSPLUSI:
3328: possessive = TRUE;
3329: min = 1;
3330: max = INT_MAX;
3331: ecode++;
3332: goto REPEATCHAR;
3333:
3334: case OP_POSQUERY:
3335: case OP_POSQUERYI:
3336: possessive = TRUE;
3337: min = 0;
3338: max = 1;
3339: ecode++;
3340: goto REPEATCHAR;
3341:
3342: case OP_STAR:
3343: case OP_STARI:
3344: case OP_MINSTAR:
3345: case OP_MINSTARI:
3346: case OP_PLUS:
3347: case OP_PLUSI:
3348: case OP_MINPLUS:
3349: case OP_MINPLUSI:
3350: case OP_QUERY:
3351: case OP_QUERYI:
3352: case OP_MINQUERY:
3353: case OP_MINQUERYI:
3354: c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3355: minimize = (c & 1) != 0;
3356: min = rep_min[c]; /* Pick up values from tables; */
3357: max = rep_max[c]; /* zero for max => infinity */
3358: if (max == 0) max = INT_MAX;
3359:
1.1.1.4 misho 3360: /* Common code for all repeated single-character matches. We first check
3361: for the minimum number of characters. If the minimum equals the maximum, we
3362: are done. Otherwise, if minimizing, check the rest of the pattern for a
3363: match; if there isn't one, advance up to the maximum, one character at a
3364: time.
3365:
3366: If maximizing, advance up to the maximum number of matching characters,
3367: until eptr is past the end of the maximum run. If possessive, we are
3368: then done (no backing up). Otherwise, match at this position; anything
3369: other than no match is immediately returned. For nomatch, back up one
3370: character, unless we are matching \R and the last thing matched was
3371: \r\n, in which case, back up two bytes. When we reach the first optional
3372: character position, we can save stack by doing a tail recurse.
3373:
3374: The various UTF/non-UTF and caseful/caseless cases are handled separately,
3375: for speed. */
1.1 misho 3376:
3377: REPEATCHAR:
1.1.1.2 misho 3378: #ifdef SUPPORT_UTF
3379: if (utf)
1.1 misho 3380: {
3381: length = 1;
3382: charptr = ecode;
3383: GETCHARLEN(fc, ecode, length);
3384: ecode += length;
3385:
3386: /* Handle multibyte character matching specially here. There is
3387: support for caseless matching if UCP support is present. */
3388:
3389: if (length > 1)
3390: {
3391: #ifdef SUPPORT_UCP
1.1.1.4 misho 3392: pcre_uint32 othercase;
1.1 misho 3393: if (op >= OP_STARI && /* Caseless */
3394: (othercase = UCD_OTHERCASE(fc)) != fc)
1.1.1.2 misho 3395: oclength = PRIV(ord2utf)(othercase, occhars);
1.1 misho 3396: else oclength = 0;
3397: #endif /* SUPPORT_UCP */
3398:
3399: for (i = 1; i <= min; i++)
3400: {
3401: if (eptr <= md->end_subject - length &&
1.1.1.2 misho 3402: memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
1.1 misho 3403: #ifdef SUPPORT_UCP
3404: else if (oclength > 0 &&
3405: eptr <= md->end_subject - oclength &&
1.1.1.2 misho 3406: memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
1.1 misho 3407: #endif /* SUPPORT_UCP */
3408: else
3409: {
3410: CHECK_PARTIAL();
3411: RRETURN(MATCH_NOMATCH);
3412: }
3413: }
3414:
3415: if (min == max) continue;
3416:
3417: if (minimize)
3418: {
3419: for (fi = min;; fi++)
3420: {
3421: RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3422: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3423: if (fi >= max) RRETURN(MATCH_NOMATCH);
3424: if (eptr <= md->end_subject - length &&
1.1.1.2 misho 3425: memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
1.1 misho 3426: #ifdef SUPPORT_UCP
3427: else if (oclength > 0 &&
3428: eptr <= md->end_subject - oclength &&
1.1.1.2 misho 3429: memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
1.1 misho 3430: #endif /* SUPPORT_UCP */
3431: else
3432: {
3433: CHECK_PARTIAL();
3434: RRETURN(MATCH_NOMATCH);
3435: }
3436: }
3437: /* Control never gets here */
3438: }
3439:
3440: else /* Maximize */
3441: {
3442: pp = eptr;
3443: for (i = min; i < max; i++)
3444: {
3445: if (eptr <= md->end_subject - length &&
1.1.1.2 misho 3446: memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
1.1 misho 3447: #ifdef SUPPORT_UCP
3448: else if (oclength > 0 &&
3449: eptr <= md->end_subject - oclength &&
1.1.1.2 misho 3450: memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
1.1 misho 3451: #endif /* SUPPORT_UCP */
3452: else
3453: {
3454: CHECK_PARTIAL();
3455: break;
3456: }
3457: }
3458:
1.1.1.4 misho 3459: if (possessive) continue; /* No backtracking */
1.1 misho 3460: for(;;)
3461: {
1.1.1.4 misho 3462: if (eptr == pp) goto TAIL_RECURSE;
1.1 misho 3463: RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3464: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3465: #ifdef SUPPORT_UCP
3466: eptr--;
3467: BACKCHAR(eptr);
3468: #else /* without SUPPORT_UCP */
3469: eptr -= length;
3470: #endif /* SUPPORT_UCP */
3471: }
3472: }
3473: /* Control never gets here */
3474: }
3475:
3476: /* If the length of a UTF-8 character is 1, we fall through here, and
3477: obey the code as for non-UTF-8 characters below, though in this case the
3478: value of fc will always be < 128. */
3479: }
3480: else
1.1.1.2 misho 3481: #endif /* SUPPORT_UTF */
3482: /* When not in UTF-8 mode, load a single-byte character. */
3483: fc = *ecode++;
1.1 misho 3484:
1.1.1.2 misho 3485: /* The value of fc at this point is always one character, though we may
3486: or may not be in UTF mode. The code is duplicated for the caseless and
1.1 misho 3487: caseful cases, for speed, since matching characters is likely to be quite
3488: common. First, ensure the minimum number of matches are present. If min =
3489: max, continue at the same level without recursing. Otherwise, if
3490: minimizing, keep trying the rest of the expression and advancing one
3491: matching character if failing, up to the maximum. Alternatively, if
3492: maximizing, find the maximum number of characters and work backwards. */
3493:
3494: DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
1.1.1.3 misho 3495: max, (char *)eptr));
1.1 misho 3496:
3497: if (op >= OP_STARI) /* Caseless */
3498: {
1.1.1.2 misho 3499: #ifdef COMPILE_PCRE8
3500: /* fc must be < 128 if UTF is enabled. */
3501: foc = md->fcc[fc];
3502: #else
3503: #ifdef SUPPORT_UTF
3504: #ifdef SUPPORT_UCP
3505: if (utf && fc > 127)
3506: foc = UCD_OTHERCASE(fc);
3507: #else
3508: if (utf && fc > 127)
3509: foc = fc;
3510: #endif /* SUPPORT_UCP */
3511: else
3512: #endif /* SUPPORT_UTF */
3513: foc = TABLE_GET(fc, md->fcc, fc);
3514: #endif /* COMPILE_PCRE8 */
3515:
1.1 misho 3516: for (i = 1; i <= min; i++)
3517: {
1.1.1.4 misho 3518: pcre_uint32 cc; /* Faster than pcre_uchar */
1.1 misho 3519: if (eptr >= md->end_subject)
3520: {
3521: SCHECK_PARTIAL();
3522: RRETURN(MATCH_NOMATCH);
3523: }
1.1.1.4 misho 3524: cc = RAWUCHARTEST(eptr);
3525: if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 3526: eptr++;
1.1 misho 3527: }
3528: if (min == max) continue;
3529: if (minimize)
3530: {
3531: for (fi = min;; fi++)
3532: {
1.1.1.4 misho 3533: pcre_uint32 cc; /* Faster than pcre_uchar */
1.1 misho 3534: RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3535: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3536: if (fi >= max) RRETURN(MATCH_NOMATCH);
3537: if (eptr >= md->end_subject)
3538: {
3539: SCHECK_PARTIAL();
3540: RRETURN(MATCH_NOMATCH);
3541: }
1.1.1.4 misho 3542: cc = RAWUCHARTEST(eptr);
3543: if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 3544: eptr++;
1.1 misho 3545: }
3546: /* Control never gets here */
3547: }
3548: else /* Maximize */
3549: {
3550: pp = eptr;
3551: for (i = min; i < max; i++)
3552: {
1.1.1.4 misho 3553: pcre_uint32 cc; /* Faster than pcre_uchar */
1.1 misho 3554: if (eptr >= md->end_subject)
3555: {
3556: SCHECK_PARTIAL();
3557: break;
3558: }
1.1.1.4 misho 3559: cc = RAWUCHARTEST(eptr);
3560: if (fc != cc && foc != cc) break;
1.1 misho 3561: eptr++;
3562: }
1.1.1.4 misho 3563: if (possessive) continue; /* No backtracking */
3564: for (;;)
1.1 misho 3565: {
1.1.1.4 misho 3566: if (eptr == pp) goto TAIL_RECURSE;
1.1 misho 3567: RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3568: eptr--;
3569: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3570: }
1.1.1.5 ! misho 3571: /* Control never gets here */
1.1 misho 3572: }
3573: }
3574:
3575: /* Caseful comparisons (includes all multi-byte characters) */
3576:
3577: else
3578: {
3579: for (i = 1; i <= min; i++)
3580: {
3581: if (eptr >= md->end_subject)
3582: {
3583: SCHECK_PARTIAL();
3584: RRETURN(MATCH_NOMATCH);
3585: }
1.1.1.4 misho 3586: if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
1.1 misho 3587: }
3588:
3589: if (min == max) continue;
3590:
3591: if (minimize)
3592: {
3593: for (fi = min;; fi++)
3594: {
3595: RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3596: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3597: if (fi >= max) RRETURN(MATCH_NOMATCH);
3598: if (eptr >= md->end_subject)
3599: {
3600: SCHECK_PARTIAL();
3601: RRETURN(MATCH_NOMATCH);
3602: }
1.1.1.4 misho 3603: if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
1.1 misho 3604: }
3605: /* Control never gets here */
3606: }
3607: else /* Maximize */
3608: {
3609: pp = eptr;
3610: for (i = min; i < max; i++)
3611: {
3612: if (eptr >= md->end_subject)
3613: {
3614: SCHECK_PARTIAL();
3615: break;
3616: }
1.1.1.4 misho 3617: if (fc != RAWUCHARTEST(eptr)) break;
1.1 misho 3618: eptr++;
3619: }
1.1.1.4 misho 3620: if (possessive) continue; /* No backtracking */
3621: for (;;)
1.1 misho 3622: {
1.1.1.4 misho 3623: if (eptr == pp) goto TAIL_RECURSE;
1.1 misho 3624: RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3625: eptr--;
3626: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3627: }
1.1.1.5 ! misho 3628: /* Control never gets here */
1.1 misho 3629: }
3630: }
3631: /* Control never gets here */
3632:
3633: /* Match a negated single character. The character we are
3634: checking can be multibyte. */
3635:
3636: case OP_NOT:
3637: case OP_NOTI:
3638: if (eptr >= md->end_subject)
3639: {
3640: SCHECK_PARTIAL();
3641: RRETURN(MATCH_NOMATCH);
3642: }
1.1.1.3 misho 3643: #ifdef SUPPORT_UTF
3644: if (utf)
1.1 misho 3645: {
1.1.1.4 misho 3646: register pcre_uint32 ch, och;
1.1.1.3 misho 3647:
3648: ecode++;
3649: GETCHARINC(ch, ecode);
3650: GETCHARINC(c, eptr);
3651:
3652: if (op == OP_NOT)
3653: {
3654: if (ch == c) RRETURN(MATCH_NOMATCH);
3655: }
3656: else
3657: {
1.1.1.2 misho 3658: #ifdef SUPPORT_UCP
1.1.1.3 misho 3659: if (ch > 127)
3660: och = UCD_OTHERCASE(ch);
1.1.1.2 misho 3661: #else
1.1.1.3 misho 3662: if (ch > 127)
3663: och = ch;
1.1.1.2 misho 3664: #endif /* SUPPORT_UCP */
1.1.1.3 misho 3665: else
3666: och = TABLE_GET(ch, md->fcc, ch);
3667: if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3668: }
1.1 misho 3669: }
1.1.1.3 misho 3670: else
3671: #endif
1.1 misho 3672: {
1.1.1.4 misho 3673: register pcre_uint32 ch = ecode[1];
1.1.1.3 misho 3674: c = *eptr++;
3675: if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3676: RRETURN(MATCH_NOMATCH);
3677: ecode += 2;
1.1 misho 3678: }
3679: break;
3680:
3681: /* Match a negated single one-byte character repeatedly. This is almost a
3682: repeat of the code for a repeated single character, but I haven't found a
3683: nice way of commoning these up that doesn't require a test of the
3684: positive/negative option for each character match. Maybe that wouldn't add
3685: very much to the time taken, but character matching *is* what this is all
3686: about... */
3687:
3688: case OP_NOTEXACT:
3689: case OP_NOTEXACTI:
3690: min = max = GET2(ecode, 1);
1.1.1.2 misho 3691: ecode += 1 + IMM2_SIZE;
1.1 misho 3692: goto REPEATNOTCHAR;
3693:
3694: case OP_NOTUPTO:
3695: case OP_NOTUPTOI:
3696: case OP_NOTMINUPTO:
3697: case OP_NOTMINUPTOI:
3698: min = 0;
3699: max = GET2(ecode, 1);
3700: minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
1.1.1.2 misho 3701: ecode += 1 + IMM2_SIZE;
1.1 misho 3702: goto REPEATNOTCHAR;
3703:
3704: case OP_NOTPOSSTAR:
3705: case OP_NOTPOSSTARI:
3706: possessive = TRUE;
3707: min = 0;
3708: max = INT_MAX;
3709: ecode++;
3710: goto REPEATNOTCHAR;
3711:
3712: case OP_NOTPOSPLUS:
3713: case OP_NOTPOSPLUSI:
3714: possessive = TRUE;
3715: min = 1;
3716: max = INT_MAX;
3717: ecode++;
3718: goto REPEATNOTCHAR;
3719:
3720: case OP_NOTPOSQUERY:
3721: case OP_NOTPOSQUERYI:
3722: possessive = TRUE;
3723: min = 0;
3724: max = 1;
3725: ecode++;
3726: goto REPEATNOTCHAR;
3727:
3728: case OP_NOTPOSUPTO:
3729: case OP_NOTPOSUPTOI:
3730: possessive = TRUE;
3731: min = 0;
3732: max = GET2(ecode, 1);
1.1.1.2 misho 3733: ecode += 1 + IMM2_SIZE;
1.1 misho 3734: goto REPEATNOTCHAR;
3735:
3736: case OP_NOTSTAR:
3737: case OP_NOTSTARI:
3738: case OP_NOTMINSTAR:
3739: case OP_NOTMINSTARI:
3740: case OP_NOTPLUS:
3741: case OP_NOTPLUSI:
3742: case OP_NOTMINPLUS:
3743: case OP_NOTMINPLUSI:
3744: case OP_NOTQUERY:
3745: case OP_NOTQUERYI:
3746: case OP_NOTMINQUERY:
3747: case OP_NOTMINQUERYI:
3748: c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3749: minimize = (c & 1) != 0;
3750: min = rep_min[c]; /* Pick up values from tables; */
3751: max = rep_max[c]; /* zero for max => infinity */
3752: if (max == 0) max = INT_MAX;
3753:
3754: /* Common code for all repeated negated single-character matches. */
3755:
3756: REPEATNOTCHAR:
1.1.1.3 misho 3757: GETCHARINCTEST(fc, ecode);
1.1 misho 3758:
3759: /* The code is duplicated for the caseless and caseful cases, for speed,
3760: since matching characters is likely to be quite common. First, ensure the
3761: minimum number of matches are present. If min = max, continue at the same
3762: level without recursing. Otherwise, if minimizing, keep trying the rest of
3763: the expression and advancing one matching character if failing, up to the
3764: maximum. Alternatively, if maximizing, find the maximum number of
3765: characters and work backwards. */
3766:
3767: DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
1.1.1.3 misho 3768: max, (char *)eptr));
1.1 misho 3769:
3770: if (op >= OP_NOTSTARI) /* Caseless */
3771: {
1.1.1.2 misho 3772: #ifdef SUPPORT_UTF
3773: #ifdef SUPPORT_UCP
3774: if (utf && fc > 127)
3775: foc = UCD_OTHERCASE(fc);
3776: #else
3777: if (utf && fc > 127)
3778: foc = fc;
3779: #endif /* SUPPORT_UCP */
3780: else
3781: #endif /* SUPPORT_UTF */
3782: foc = TABLE_GET(fc, md->fcc, fc);
1.1 misho 3783:
1.1.1.2 misho 3784: #ifdef SUPPORT_UTF
3785: if (utf)
1.1 misho 3786: {
1.1.1.4 misho 3787: register pcre_uint32 d;
1.1 misho 3788: for (i = 1; i <= min; i++)
3789: {
3790: if (eptr >= md->end_subject)
3791: {
3792: SCHECK_PARTIAL();
3793: RRETURN(MATCH_NOMATCH);
3794: }
3795: GETCHARINC(d, eptr);
1.1.1.3 misho 3796: if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
1.1 misho 3797: }
3798: }
3799: else
1.1.1.4 misho 3800: #endif /* SUPPORT_UTF */
1.1.1.2 misho 3801: /* Not UTF mode */
1.1 misho 3802: {
3803: for (i = 1; i <= min; i++)
3804: {
3805: if (eptr >= md->end_subject)
3806: {
3807: SCHECK_PARTIAL();
3808: RRETURN(MATCH_NOMATCH);
3809: }
1.1.1.2 misho 3810: if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3811: eptr++;
1.1 misho 3812: }
3813: }
3814:
3815: if (min == max) continue;
3816:
3817: if (minimize)
3818: {
1.1.1.2 misho 3819: #ifdef SUPPORT_UTF
3820: if (utf)
1.1 misho 3821: {
1.1.1.4 misho 3822: register pcre_uint32 d;
1.1 misho 3823: for (fi = min;; fi++)
3824: {
3825: RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3826: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3827: if (fi >= max) RRETURN(MATCH_NOMATCH);
3828: if (eptr >= md->end_subject)
3829: {
3830: SCHECK_PARTIAL();
3831: RRETURN(MATCH_NOMATCH);
3832: }
3833: GETCHARINC(d, eptr);
1.1.1.2 misho 3834: if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
1.1 misho 3835: }
3836: }
3837: else
1.1.1.4 misho 3838: #endif /*SUPPORT_UTF */
1.1.1.2 misho 3839: /* Not UTF mode */
1.1 misho 3840: {
3841: for (fi = min;; fi++)
3842: {
3843: RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3844: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3845: if (fi >= max) RRETURN(MATCH_NOMATCH);
3846: if (eptr >= md->end_subject)
3847: {
3848: SCHECK_PARTIAL();
3849: RRETURN(MATCH_NOMATCH);
3850: }
1.1.1.2 misho 3851: if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3852: eptr++;
1.1 misho 3853: }
3854: }
3855: /* Control never gets here */
3856: }
3857:
3858: /* Maximize case */
3859:
3860: else
3861: {
3862: pp = eptr;
3863:
1.1.1.2 misho 3864: #ifdef SUPPORT_UTF
3865: if (utf)
1.1 misho 3866: {
1.1.1.4 misho 3867: register pcre_uint32 d;
1.1 misho 3868: for (i = min; i < max; i++)
3869: {
3870: int len = 1;
3871: if (eptr >= md->end_subject)
3872: {
3873: SCHECK_PARTIAL();
3874: break;
3875: }
3876: GETCHARLEN(d, eptr, len);
1.1.1.2 misho 3877: if (fc == d || (unsigned int)foc == d) break;
1.1 misho 3878: eptr += len;
3879: }
1.1.1.4 misho 3880: if (possessive) continue; /* No backtracking */
1.1.1.2 misho 3881: for(;;)
1.1 misho 3882: {
1.1.1.4 misho 3883: if (eptr == pp) goto TAIL_RECURSE;
1.1 misho 3884: RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3885: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1.1.4 misho 3886: eptr--;
1.1 misho 3887: BACKCHAR(eptr);
3888: }
3889: }
3890: else
1.1.1.4 misho 3891: #endif /* SUPPORT_UTF */
1.1.1.2 misho 3892: /* Not UTF mode */
1.1 misho 3893: {
3894: for (i = min; i < max; i++)
3895: {
3896: if (eptr >= md->end_subject)
3897: {
3898: SCHECK_PARTIAL();
3899: break;
3900: }
1.1.1.2 misho 3901: if (fc == *eptr || foc == *eptr) break;
1.1 misho 3902: eptr++;
3903: }
1.1.1.4 misho 3904: if (possessive) continue; /* No backtracking */
3905: for (;;)
1.1 misho 3906: {
1.1.1.4 misho 3907: if (eptr == pp) goto TAIL_RECURSE;
1.1 misho 3908: RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3909: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3910: eptr--;
3911: }
3912: }
1.1.1.5 ! misho 3913: /* Control never gets here */
1.1 misho 3914: }
3915: }
3916:
3917: /* Caseful comparisons */
3918:
3919: else
3920: {
1.1.1.2 misho 3921: #ifdef SUPPORT_UTF
3922: if (utf)
1.1 misho 3923: {
1.1.1.4 misho 3924: register pcre_uint32 d;
1.1 misho 3925: for (i = 1; i <= min; i++)
3926: {
3927: if (eptr >= md->end_subject)
3928: {
3929: SCHECK_PARTIAL();
3930: RRETURN(MATCH_NOMATCH);
3931: }
3932: GETCHARINC(d, eptr);
3933: if (fc == d) RRETURN(MATCH_NOMATCH);
3934: }
3935: }
3936: else
3937: #endif
1.1.1.2 misho 3938: /* Not UTF mode */
1.1 misho 3939: {
3940: for (i = 1; i <= min; i++)
3941: {
3942: if (eptr >= md->end_subject)
3943: {
3944: SCHECK_PARTIAL();
3945: RRETURN(MATCH_NOMATCH);
3946: }
3947: if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3948: }
3949: }
3950:
3951: if (min == max) continue;
3952:
3953: if (minimize)
3954: {
1.1.1.2 misho 3955: #ifdef SUPPORT_UTF
3956: if (utf)
1.1 misho 3957: {
1.1.1.4 misho 3958: register pcre_uint32 d;
1.1 misho 3959: for (fi = min;; fi++)
3960: {
3961: RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3962: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3963: if (fi >= max) RRETURN(MATCH_NOMATCH);
3964: if (eptr >= md->end_subject)
3965: {
3966: SCHECK_PARTIAL();
3967: RRETURN(MATCH_NOMATCH);
3968: }
3969: GETCHARINC(d, eptr);
3970: if (fc == d) RRETURN(MATCH_NOMATCH);
3971: }
3972: }
3973: else
3974: #endif
1.1.1.2 misho 3975: /* Not UTF mode */
1.1 misho 3976: {
3977: for (fi = min;; fi++)
3978: {
3979: RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3980: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3981: if (fi >= max) RRETURN(MATCH_NOMATCH);
3982: if (eptr >= md->end_subject)
3983: {
3984: SCHECK_PARTIAL();
3985: RRETURN(MATCH_NOMATCH);
3986: }
3987: if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3988: }
3989: }
3990: /* Control never gets here */
3991: }
3992:
3993: /* Maximize case */
3994:
3995: else
3996: {
3997: pp = eptr;
3998:
1.1.1.2 misho 3999: #ifdef SUPPORT_UTF
4000: if (utf)
1.1 misho 4001: {
1.1.1.4 misho 4002: register pcre_uint32 d;
1.1 misho 4003: for (i = min; i < max; i++)
4004: {
4005: int len = 1;
4006: if (eptr >= md->end_subject)
4007: {
4008: SCHECK_PARTIAL();
4009: break;
4010: }
4011: GETCHARLEN(d, eptr, len);
4012: if (fc == d) break;
4013: eptr += len;
4014: }
1.1.1.4 misho 4015: if (possessive) continue; /* No backtracking */
1.1 misho 4016: for(;;)
4017: {
1.1.1.4 misho 4018: if (eptr == pp) goto TAIL_RECURSE;
1.1 misho 4019: RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
4020: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1.1.4 misho 4021: eptr--;
1.1 misho 4022: BACKCHAR(eptr);
4023: }
4024: }
4025: else
4026: #endif
1.1.1.2 misho 4027: /* Not UTF mode */
1.1 misho 4028: {
4029: for (i = min; i < max; i++)
4030: {
4031: if (eptr >= md->end_subject)
4032: {
4033: SCHECK_PARTIAL();
4034: break;
4035: }
4036: if (fc == *eptr) break;
4037: eptr++;
4038: }
1.1.1.4 misho 4039: if (possessive) continue; /* No backtracking */
4040: for (;;)
1.1 misho 4041: {
1.1.1.4 misho 4042: if (eptr == pp) goto TAIL_RECURSE;
1.1 misho 4043: RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
4044: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4045: eptr--;
4046: }
4047: }
1.1.1.5 ! misho 4048: /* Control never gets here */
1.1 misho 4049: }
4050: }
4051: /* Control never gets here */
4052:
4053: /* Match a single character type repeatedly; several different opcodes
4054: share code. This is very similar to the code for single characters, but we
4055: repeat it in the interests of efficiency. */
4056:
4057: case OP_TYPEEXACT:
4058: min = max = GET2(ecode, 1);
4059: minimize = TRUE;
1.1.1.2 misho 4060: ecode += 1 + IMM2_SIZE;
1.1 misho 4061: goto REPEATTYPE;
4062:
4063: case OP_TYPEUPTO:
4064: case OP_TYPEMINUPTO:
4065: min = 0;
4066: max = GET2(ecode, 1);
4067: minimize = *ecode == OP_TYPEMINUPTO;
1.1.1.2 misho 4068: ecode += 1 + IMM2_SIZE;
1.1 misho 4069: goto REPEATTYPE;
4070:
4071: case OP_TYPEPOSSTAR:
4072: possessive = TRUE;
4073: min = 0;
4074: max = INT_MAX;
4075: ecode++;
4076: goto REPEATTYPE;
4077:
4078: case OP_TYPEPOSPLUS:
4079: possessive = TRUE;
4080: min = 1;
4081: max = INT_MAX;
4082: ecode++;
4083: goto REPEATTYPE;
4084:
4085: case OP_TYPEPOSQUERY:
4086: possessive = TRUE;
4087: min = 0;
4088: max = 1;
4089: ecode++;
4090: goto REPEATTYPE;
4091:
4092: case OP_TYPEPOSUPTO:
4093: possessive = TRUE;
4094: min = 0;
4095: max = GET2(ecode, 1);
1.1.1.2 misho 4096: ecode += 1 + IMM2_SIZE;
1.1 misho 4097: goto REPEATTYPE;
4098:
4099: case OP_TYPESTAR:
4100: case OP_TYPEMINSTAR:
4101: case OP_TYPEPLUS:
4102: case OP_TYPEMINPLUS:
4103: case OP_TYPEQUERY:
4104: case OP_TYPEMINQUERY:
4105: c = *ecode++ - OP_TYPESTAR;
4106: minimize = (c & 1) != 0;
4107: min = rep_min[c]; /* Pick up values from tables; */
4108: max = rep_max[c]; /* zero for max => infinity */
4109: if (max == 0) max = INT_MAX;
4110:
4111: /* Common code for all repeated single character type matches. Note that
4112: in UTF mode only the positive digit, space, and word types are limited to
4113: one-unit characters (< 128); '.' and most other types can match longer ones. */
4114:
4115: REPEATTYPE:
4116: ctype = *ecode++; /* Code for the character type */
4117:
4118: #ifdef SUPPORT_UCP
4119: if (ctype == OP_PROP || ctype == OP_NOTPROP)
4120: {
4121: prop_fail_result = ctype == OP_NOTPROP;
4122: prop_type = *ecode++;
4123: prop_value = *ecode++;
4124: }
4125: else prop_type = -1;
4126: #endif
4127:
4128: /* First, ensure the minimum number of matches are present. Use inline
4129: code for maximizing the speed, and do the type test once at the start
4130: (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4131: is tidier. Also separate the UCP code, which can be the same for both UTF-8
4132: and single-bytes. */
4133:
4134: if (min > 0)
4135: {
4136: #ifdef SUPPORT_UCP
4137: if (prop_type >= 0)
4138: {
4139: switch(prop_type)
4140: {
4141: case PT_ANY:
4142: if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4143: for (i = 1; i <= min; i++)
4144: {
4145: if (eptr >= md->end_subject)
4146: {
4147: SCHECK_PARTIAL();
4148: RRETURN(MATCH_NOMATCH);
4149: }
4150: GETCHARINCTEST(c, eptr);
4151: }
4152: break;
4153:
4154: case PT_LAMP:
4155: for (i = 1; i <= min; i++)
4156: {
4157: int chartype;
4158: if (eptr >= md->end_subject)
4159: {
4160: SCHECK_PARTIAL();
4161: RRETURN(MATCH_NOMATCH);
4162: }
4163: GETCHARINCTEST(c, eptr);
4164: chartype = UCD_CHARTYPE(c);
4165: if ((chartype == ucp_Lu ||
4166: chartype == ucp_Ll ||
4167: chartype == ucp_Lt) == prop_fail_result)
4168: RRETURN(MATCH_NOMATCH);
4169: }
4170: break;
4171:
4172: case PT_GC:
4173: for (i = 1; i <= min; i++)
4174: {
4175: if (eptr >= md->end_subject)
4176: {
4177: SCHECK_PARTIAL();
4178: RRETURN(MATCH_NOMATCH);
4179: }
4180: GETCHARINCTEST(c, eptr);
4181: if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4182: RRETURN(MATCH_NOMATCH);
4183: }
4184: break;
4185:
4186: case PT_PC:
4187: for (i = 1; i <= min; i++)
4188: {
4189: if (eptr >= md->end_subject)
4190: {
4191: SCHECK_PARTIAL();
4192: RRETURN(MATCH_NOMATCH);
4193: }
4194: GETCHARINCTEST(c, eptr);
4195: if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4196: RRETURN(MATCH_NOMATCH);
4197: }
4198: break;
4199:
4200: case PT_SC:
4201: for (i = 1; i <= min; i++)
4202: {
4203: if (eptr >= md->end_subject)
4204: {
4205: SCHECK_PARTIAL();
4206: RRETURN(MATCH_NOMATCH);
4207: }
4208: GETCHARINCTEST(c, eptr);
4209: if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4210: RRETURN(MATCH_NOMATCH);
4211: }
4212: break;
4213:
4214: case PT_ALNUM:
4215: for (i = 1; i <= min; i++)
4216: {
4217: int category;
4218: if (eptr >= md->end_subject)
4219: {
4220: SCHECK_PARTIAL();
4221: RRETURN(MATCH_NOMATCH);
4222: }
4223: GETCHARINCTEST(c, eptr);
4224: category = UCD_CATEGORY(c);
4225: if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4226: RRETURN(MATCH_NOMATCH);
4227: }
4228: break;
4229:
1.1.1.5 ! misho 4230: /* Perl space used to exclude VT, but from Perl 5.18 it is included,
! 4231: which means that Perl space and POSIX space are now identical. PCRE
! 4232: was changed at release 8.34. */
! 4233:
1.1 misho 4234: case PT_SPACE: /* Perl space */
1.1.1.5 ! misho 4235: case PT_PXSPACE: /* POSIX space */
1.1 misho 4236: for (i = 1; i <= min; i++)
4237: {
4238: if (eptr >= md->end_subject)
4239: {
4240: SCHECK_PARTIAL();
4241: RRETURN(MATCH_NOMATCH);
4242: }
4243: GETCHARINCTEST(c, eptr);
1.1.1.5 ! misho 4244: switch(c)
1.1 misho 4245: {
1.1.1.5 ! misho 4246: HSPACE_CASES:
! 4247: VSPACE_CASES:
! 4248: if (prop_fail_result) RRETURN(MATCH_NOMATCH);
! 4249: break;
! 4250:
! 4251: default:
! 4252: if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
! 4253: RRETURN(MATCH_NOMATCH);
! 4254: break;
1.1 misho 4255: }
4256: }
4257: break;
4258:
4259: case PT_WORD:
4260: for (i = 1; i <= min; i++)
4261: {
4262: int category;
4263: if (eptr >= md->end_subject)
4264: {
4265: SCHECK_PARTIAL();
4266: RRETURN(MATCH_NOMATCH);
4267: }
4268: GETCHARINCTEST(c, eptr);
4269: category = UCD_CATEGORY(c);
4270: if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4271: == prop_fail_result)
4272: RRETURN(MATCH_NOMATCH);
4273: }
4274: break;
4275:
1.1.1.4 misho 4276: case PT_CLIST:
4277: for (i = 1; i <= min; i++)
4278: {
4279: const pcre_uint32 *cp;
4280: if (eptr >= md->end_subject)
4281: {
4282: SCHECK_PARTIAL();
4283: RRETURN(MATCH_NOMATCH);
4284: }
4285: GETCHARINCTEST(c, eptr);
4286: cp = PRIV(ucd_caseless_sets) + prop_value;
4287: for (;;)
4288: {
4289: if (c < *cp)
4290: { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4291: if (c == *cp++)
4292: { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4293: }
4294: }
4295: break;
4296:
4297: case PT_UCNC:
4298: for (i = 1; i <= min; i++)
4299: {
4300: if (eptr >= md->end_subject)
4301: {
4302: SCHECK_PARTIAL();
4303: RRETURN(MATCH_NOMATCH);
4304: }
4305: GETCHARINCTEST(c, eptr);
4306: if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4307: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4308: c >= 0xe000) == prop_fail_result)
4309: RRETURN(MATCH_NOMATCH);
4310: }
4311: break;
4312:
1.1 misho 4313: /* This should not occur */
4314:
4315: default:
4316: RRETURN(PCRE_ERROR_INTERNAL);
4317: }
4318: }
4319:
4320: /* Match extended Unicode sequences. We will get here only if the
4321: support is in the binary; otherwise a compile-time error occurs. */
4322:
4323: else if (ctype == OP_EXTUNI)
4324: {
4325: for (i = 1; i <= min; i++)
4326: {
4327: if (eptr >= md->end_subject)
4328: {
4329: SCHECK_PARTIAL();
4330: RRETURN(MATCH_NOMATCH);
4331: }
1.1.1.4 misho 4332: else
1.1 misho 4333: {
1.1.1.4 misho 4334: int lgb, rgb;
4335: GETCHARINCTEST(c, eptr);
4336: lgb = UCD_GRAPHBREAK(c);
4337: while (eptr < md->end_subject)
4338: {
4339: int len = 1;
4340: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4341: rgb = UCD_GRAPHBREAK(c);
4342: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4343: lgb = rgb;
4344: eptr += len;
4345: }
1.1 misho 4346: }
1.1.1.3 misho 4347: CHECK_PARTIAL();
1.1 misho 4348: }
4349: }
4350:
4351: else
4352: #endif /* SUPPORT_UCP */
4353:
4354: /* Handle all other cases when the coding is UTF-8 */
4355:
1.1.1.2 misho 4356: #ifdef SUPPORT_UTF
4357: if (utf) switch(ctype)
1.1 misho 4358: {
4359: case OP_ANY:
4360: for (i = 1; i <= min; i++)
4361: {
4362: if (eptr >= md->end_subject)
4363: {
4364: SCHECK_PARTIAL();
4365: RRETURN(MATCH_NOMATCH);
4366: }
4367: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1.1.1.3 misho 4368: if (md->partial != 0 &&
4369: eptr + 1 >= md->end_subject &&
4370: NLBLOCK->nltype == NLTYPE_FIXED &&
4371: NLBLOCK->nllen == 2 &&
1.1.1.4 misho 4372: RAWUCHAR(eptr) == NLBLOCK->nl[0])
1.1.1.3 misho 4373: {
4374: md->hitend = TRUE;
4375: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4376: }
1.1 misho 4377: eptr++;
1.1.1.2 misho 4378: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misho 4379: }
4380: break;
4381:
4382: case OP_ALLANY:
4383: for (i = 1; i <= min; i++)
4384: {
4385: if (eptr >= md->end_subject)
4386: {
4387: SCHECK_PARTIAL();
4388: RRETURN(MATCH_NOMATCH);
4389: }
4390: eptr++;
1.1.1.2 misho 4391: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misho 4392: }
4393: break;
4394:
4395: case OP_ANYBYTE:
4396: if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4397: eptr += min;
4398: break;
4399:
4400: case OP_ANYNL:
4401: for (i = 1; i <= min; i++)
4402: {
4403: if (eptr >= md->end_subject)
4404: {
4405: SCHECK_PARTIAL();
4406: RRETURN(MATCH_NOMATCH);
4407: }
4408: GETCHARINC(c, eptr);
4409: switch(c)
4410: {
4411: default: RRETURN(MATCH_NOMATCH);
4412:
1.1.1.4 misho 4413: case CHAR_CR:
4414: if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
1.1 misho 4415: break;
4416:
1.1.1.4 misho 4417: case CHAR_LF:
1.1 misho 4418: break;
4419:
1.1.1.4 misho 4420: case CHAR_VT:
4421: case CHAR_FF:
4422: case CHAR_NEL:
4423: #ifndef EBCDIC
1.1 misho 4424: case 0x2028:
4425: case 0x2029:
1.1.1.4 misho 4426: #endif /* Not EBCDIC */
1.1 misho 4427: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4428: break;
4429: }
4430: }
4431: break;
4432:
4433: case OP_NOT_HSPACE:
4434: for (i = 1; i <= min; i++)
4435: {
4436: if (eptr >= md->end_subject)
4437: {
4438: SCHECK_PARTIAL();
4439: RRETURN(MATCH_NOMATCH);
4440: }
4441: GETCHARINC(c, eptr);
4442: switch(c)
4443: {
1.1.1.4 misho 4444: HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
1.1 misho 4445: default: break;
4446: }
4447: }
4448: break;
4449:
4450: case OP_HSPACE:
4451: for (i = 1; i <= min; i++)
4452: {
4453: if (eptr >= md->end_subject)
4454: {
4455: SCHECK_PARTIAL();
4456: RRETURN(MATCH_NOMATCH);
4457: }
4458: GETCHARINC(c, eptr);
4459: switch(c)
4460: {
1.1.1.4 misho 4461: HSPACE_CASES: break; /* Byte and multibyte cases */
1.1 misho 4462: default: RRETURN(MATCH_NOMATCH);
4463: }
4464: }
4465: break;
4466:
4467: case OP_NOT_VSPACE:
4468: for (i = 1; i <= min; i++)
4469: {
4470: if (eptr >= md->end_subject)
4471: {
4472: SCHECK_PARTIAL();
4473: RRETURN(MATCH_NOMATCH);
4474: }
4475: GETCHARINC(c, eptr);
4476: switch(c)
4477: {
1.1.1.4 misho 4478: VSPACE_CASES: RRETURN(MATCH_NOMATCH);
1.1 misho 4479: default: break;
4480: }
4481: }
4482: break;
4483:
4484: case OP_VSPACE:
4485: for (i = 1; i <= min; i++)
4486: {
4487: if (eptr >= md->end_subject)
4488: {
4489: SCHECK_PARTIAL();
4490: RRETURN(MATCH_NOMATCH);
4491: }
4492: GETCHARINC(c, eptr);
4493: switch(c)
4494: {
1.1.1.4 misho 4495: VSPACE_CASES: break;
1.1 misho 4496: default: RRETURN(MATCH_NOMATCH);
4497: }
4498: }
4499: break;
4500:
4501: case OP_NOT_DIGIT:
4502: for (i = 1; i <= min; i++)
4503: {
4504: if (eptr >= md->end_subject)
4505: {
4506: SCHECK_PARTIAL();
4507: RRETURN(MATCH_NOMATCH);
4508: }
4509: GETCHARINC(c, eptr);
4510: if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4511: RRETURN(MATCH_NOMATCH);
4512: }
4513: break;
4514:
4515: case OP_DIGIT:
4516: for (i = 1; i <= min; i++)
4517: {
1.1.1.4 misho 4518: pcre_uint32 cc;
1.1 misho 4519: if (eptr >= md->end_subject)
4520: {
4521: SCHECK_PARTIAL();
4522: RRETURN(MATCH_NOMATCH);
4523: }
1.1.1.4 misho 4524: cc = RAWUCHAR(eptr);
4525: if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
1.1 misho 4526: RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 4527: eptr++;
1.1 misho 4528: /* No need to skip more bytes - we know it's a 1-byte character */
4529: }
4530: break;
4531:
4532: case OP_NOT_WHITESPACE:
4533: for (i = 1; i <= min; i++)
4534: {
1.1.1.4 misho 4535: pcre_uint32 cc;
1.1 misho 4536: if (eptr >= md->end_subject)
4537: {
4538: SCHECK_PARTIAL();
4539: RRETURN(MATCH_NOMATCH);
4540: }
1.1.1.4 misho 4541: cc = RAWUCHAR(eptr);
4542: if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
1.1 misho 4543: RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 4544: eptr++;
4545: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misho 4546: }
4547: break;
4548:
4549: case OP_WHITESPACE:
4550: for (i = 1; i <= min; i++)
4551: {
1.1.1.4 misho 4552: pcre_uint32 cc;
1.1 misho 4553: if (eptr >= md->end_subject)
4554: {
4555: SCHECK_PARTIAL();
4556: RRETURN(MATCH_NOMATCH);
4557: }
1.1.1.4 misho 4558: cc = RAWUCHAR(eptr);
4559: if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
1.1 misho 4560: RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 4561: eptr++;
1.1 misho 4562: /* No need to skip more bytes - we know it's a 1-byte character */
4563: }
4564: break;
4565:
4566: case OP_NOT_WORDCHAR:
4567: for (i = 1; i <= min; i++)
4568: {
1.1.1.4 misho 4569: pcre_uint32 cc;
1.1 misho 4570: if (eptr >= md->end_subject)
4571: {
4572: SCHECK_PARTIAL();
4573: RRETURN(MATCH_NOMATCH);
4574: }
1.1.1.4 misho 4575: cc = RAWUCHAR(eptr);
4576: if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
1.1 misho 4577: RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 4578: eptr++;
4579: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misho 4580: }
4581: break;
4582:
4583: case OP_WORDCHAR:
4584: for (i = 1; i <= min; i++)
4585: {
1.1.1.4 misho 4586: pcre_uint32 cc;
1.1 misho 4587: if (eptr >= md->end_subject)
4588: {
4589: SCHECK_PARTIAL();
4590: RRETURN(MATCH_NOMATCH);
4591: }
1.1.1.4 misho 4592: cc = RAWUCHAR(eptr);
4593: if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
1.1 misho 4594: RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 4595: eptr++;
1.1 misho 4596: /* No need to skip more bytes - we know it's a 1-byte character */
4597: }
4598: break;
4599:
4600: default:
4601: RRETURN(PCRE_ERROR_INTERNAL);
4602: } /* End switch(ctype) */
4603:
4604: else
1.1.1.2 misho 4605: #endif /* SUPPORT_UTF */
1.1 misho 4606:
4607: /* Code for the non-UTF-8 case for minimum matching of operators other
4608: than OP_PROP and OP_NOTPROP. */
4609:
4610: switch(ctype)
4611: {
4612: case OP_ANY:
4613: for (i = 1; i <= min; i++)
4614: {
4615: if (eptr >= md->end_subject)
4616: {
4617: SCHECK_PARTIAL();
4618: RRETURN(MATCH_NOMATCH);
4619: }
4620: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1.1.1.3 misho 4621: if (md->partial != 0 &&
4622: eptr + 1 >= md->end_subject &&
4623: NLBLOCK->nltype == NLTYPE_FIXED &&
4624: NLBLOCK->nllen == 2 &&
4625: *eptr == NLBLOCK->nl[0])
4626: {
4627: md->hitend = TRUE;
4628: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4629: }
1.1 misho 4630: eptr++;
4631: }
4632: break;
4633:
4634: case OP_ALLANY:
4635: if (eptr > md->end_subject - min)
4636: {
4637: SCHECK_PARTIAL();
4638: RRETURN(MATCH_NOMATCH);
4639: }
4640: eptr += min;
4641: break;
4642:
4643: case OP_ANYBYTE:
4644: if (eptr > md->end_subject - min)
4645: {
4646: SCHECK_PARTIAL();
4647: RRETURN(MATCH_NOMATCH);
4648: }
4649: eptr += min;
4650: break;
4651:
4652: case OP_ANYNL:
4653: for (i = 1; i <= min; i++)
4654: {
4655: if (eptr >= md->end_subject)
4656: {
4657: SCHECK_PARTIAL();
4658: RRETURN(MATCH_NOMATCH);
4659: }
4660: switch(*eptr++)
4661: {
4662: default: RRETURN(MATCH_NOMATCH);
4663:
1.1.1.4 misho 4664: case CHAR_CR:
4665: if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
1.1 misho 4666: break;
4667:
1.1.1.4 misho 4668: case CHAR_LF:
1.1 misho 4669: break;
4670:
1.1.1.4 misho 4671: case CHAR_VT:
4672: case CHAR_FF:
4673: case CHAR_NEL:
4674: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1.1.1.2 misho 4675: case 0x2028:
4676: case 0x2029:
4677: #endif
1.1 misho 4678: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4679: break;
4680: }
4681: }
4682: break;
4683:
4684: case OP_NOT_HSPACE:
4685: for (i = 1; i <= min; i++)
4686: {
4687: if (eptr >= md->end_subject)
4688: {
4689: SCHECK_PARTIAL();
4690: RRETURN(MATCH_NOMATCH);
4691: }
4692: switch(*eptr++)
4693: {
4694: default: break;
1.1.1.4 misho 4695: HSPACE_BYTE_CASES:
4696: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4697: HSPACE_MULTIBYTE_CASES:
1.1.1.2 misho 4698: #endif
1.1 misho 4699: RRETURN(MATCH_NOMATCH);
4700: }
4701: }
4702: break;
4703:
4704: case OP_HSPACE:
4705: for (i = 1; i <= min; i++)
4706: {
4707: if (eptr >= md->end_subject)
4708: {
4709: SCHECK_PARTIAL();
4710: RRETURN(MATCH_NOMATCH);
4711: }
4712: switch(*eptr++)
4713: {
4714: default: RRETURN(MATCH_NOMATCH);
1.1.1.4 misho 4715: HSPACE_BYTE_CASES:
4716: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4717: HSPACE_MULTIBYTE_CASES:
1.1.1.2 misho 4718: #endif
1.1 misho 4719: break;
4720: }
4721: }
4722: break;
4723:
4724: case OP_NOT_VSPACE:
4725: for (i = 1; i <= min; i++)
4726: {
4727: if (eptr >= md->end_subject)
4728: {
4729: SCHECK_PARTIAL();
4730: RRETURN(MATCH_NOMATCH);
4731: }
4732: switch(*eptr++)
4733: {
1.1.1.4 misho 4734: VSPACE_BYTE_CASES:
4735: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4736: VSPACE_MULTIBYTE_CASES:
1.1.1.2 misho 4737: #endif
1.1 misho 4738: RRETURN(MATCH_NOMATCH);
1.1.1.4 misho 4739: default: break;
1.1 misho 4740: }
4741: }
4742: break;
4743:
4744: case OP_VSPACE:
4745: for (i = 1; i <= min; i++)
4746: {
4747: if (eptr >= md->end_subject)
4748: {
4749: SCHECK_PARTIAL();
4750: RRETURN(MATCH_NOMATCH);
4751: }
4752: switch(*eptr++)
4753: {
4754: default: RRETURN(MATCH_NOMATCH);
1.1.1.4 misho 4755: VSPACE_BYTE_CASES:
4756: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4757: VSPACE_MULTIBYTE_CASES:
1.1.1.2 misho 4758: #endif
1.1 misho 4759: break;
4760: }
4761: }
4762: break;
4763:
4764: case OP_NOT_DIGIT:
4765: for (i = 1; i <= min; i++)
4766: {
4767: if (eptr >= md->end_subject)
4768: {
4769: SCHECK_PARTIAL();
4770: RRETURN(MATCH_NOMATCH);
4771: }
1.1.1.2 misho 4772: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4773: RRETURN(MATCH_NOMATCH);
4774: eptr++;
1.1 misho 4775: }
4776: break;
4777:
4778: case OP_DIGIT:
4779: for (i = 1; i <= min; i++)
4780: {
4781: if (eptr >= md->end_subject)
4782: {
4783: SCHECK_PARTIAL();
4784: RRETURN(MATCH_NOMATCH);
4785: }
1.1.1.2 misho 4786: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4787: RRETURN(MATCH_NOMATCH);
4788: eptr++;
1.1 misho 4789: }
4790: break;
4791:
4792: case OP_NOT_WHITESPACE:
4793: for (i = 1; i <= min; i++)
4794: {
4795: if (eptr >= md->end_subject)
4796: {
4797: SCHECK_PARTIAL();
4798: RRETURN(MATCH_NOMATCH);
4799: }
1.1.1.2 misho 4800: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4801: RRETURN(MATCH_NOMATCH);
4802: eptr++;
1.1 misho 4803: }
4804: break;
4805:
4806: case OP_WHITESPACE:
4807: for (i = 1; i <= min; i++)
4808: {
4809: if (eptr >= md->end_subject)
4810: {
4811: SCHECK_PARTIAL();
4812: RRETURN(MATCH_NOMATCH);
4813: }
1.1.1.2 misho 4814: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4815: RRETURN(MATCH_NOMATCH);
4816: eptr++;
1.1 misho 4817: }
4818: break;
4819:
4820: case OP_NOT_WORDCHAR:
4821: for (i = 1; i <= min; i++)
4822: {
4823: if (eptr >= md->end_subject)
4824: {
4825: SCHECK_PARTIAL();
4826: RRETURN(MATCH_NOMATCH);
4827: }
1.1.1.2 misho 4828: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
1.1 misho 4829: RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 4830: eptr++;
1.1 misho 4831: }
4832: break;
4833:
4834: case OP_WORDCHAR:
4835: for (i = 1; i <= min; i++)
4836: {
4837: if (eptr >= md->end_subject)
4838: {
4839: SCHECK_PARTIAL();
4840: RRETURN(MATCH_NOMATCH);
4841: }
1.1.1.2 misho 4842: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
1.1 misho 4843: RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 4844: eptr++;
1.1 misho 4845: }
4846: break;
4847:
4848: default:
4849: RRETURN(PCRE_ERROR_INTERNAL);
4850: }
4851: }
4852:
4853: /* If min = max, continue at the same level without recursing */
4854:
4855: if (min == max) continue;
4856:
4857: /* If minimizing, we have to test the rest of the pattern before each
4858: subsequent match. Again, separate the UTF-8 case for speed, and also
4859: separate the UCP cases. */
4860:
4861: if (minimize)
4862: {
4863: #ifdef SUPPORT_UCP
4864: if (prop_type >= 0)
4865: {
4866: switch(prop_type)
4867: {
4868: case PT_ANY:
4869: for (fi = min;; fi++)
4870: {
4871: RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4872: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4873: if (fi >= max) RRETURN(MATCH_NOMATCH);
4874: if (eptr >= md->end_subject)
4875: {
4876: SCHECK_PARTIAL();
4877: RRETURN(MATCH_NOMATCH);
4878: }
4879: GETCHARINCTEST(c, eptr);
4880: if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4881: }
4882: /* Control never gets here */
4883:
4884: case PT_LAMP:
4885: for (fi = min;; fi++)
4886: {
4887: int chartype;
4888: RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4889: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4890: if (fi >= max) RRETURN(MATCH_NOMATCH);
4891: if (eptr >= md->end_subject)
4892: {
4893: SCHECK_PARTIAL();
4894: RRETURN(MATCH_NOMATCH);
4895: }
4896: GETCHARINCTEST(c, eptr);
4897: chartype = UCD_CHARTYPE(c);
4898: if ((chartype == ucp_Lu ||
4899: chartype == ucp_Ll ||
4900: chartype == ucp_Lt) == prop_fail_result)
4901: RRETURN(MATCH_NOMATCH);
4902: }
4903: /* Control never gets here */
4904:
4905: case PT_GC:
4906: for (fi = min;; fi++)
4907: {
4908: RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4909: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4910: if (fi >= max) RRETURN(MATCH_NOMATCH);
4911: if (eptr >= md->end_subject)
4912: {
4913: SCHECK_PARTIAL();
4914: RRETURN(MATCH_NOMATCH);
4915: }
4916: GETCHARINCTEST(c, eptr);
4917: if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4918: RRETURN(MATCH_NOMATCH);
4919: }
4920: /* Control never gets here */
4921:
4922: case PT_PC:
4923: for (fi = min;; fi++)
4924: {
4925: RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4926: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4927: if (fi >= max) RRETURN(MATCH_NOMATCH);
4928: if (eptr >= md->end_subject)
4929: {
4930: SCHECK_PARTIAL();
4931: RRETURN(MATCH_NOMATCH);
4932: }
4933: GETCHARINCTEST(c, eptr);
4934: if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4935: RRETURN(MATCH_NOMATCH);
4936: }
4937: /* Control never gets here */
4938:
4939: case PT_SC:
4940: for (fi = min;; fi++)
4941: {
4942: RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4943: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4944: if (fi >= max) RRETURN(MATCH_NOMATCH);
4945: if (eptr >= md->end_subject)
4946: {
4947: SCHECK_PARTIAL();
4948: RRETURN(MATCH_NOMATCH);
4949: }
4950: GETCHARINCTEST(c, eptr);
4951: if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4952: RRETURN(MATCH_NOMATCH);
4953: }
4954: /* Control never gets here */
4955:
4956: case PT_ALNUM:
4957: for (fi = min;; fi++)
4958: {
4959: int category;
4960: RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4961: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4962: if (fi >= max) RRETURN(MATCH_NOMATCH);
4963: if (eptr >= md->end_subject)
4964: {
4965: SCHECK_PARTIAL();
4966: RRETURN(MATCH_NOMATCH);
4967: }
4968: GETCHARINCTEST(c, eptr);
4969: category = UCD_CATEGORY(c);
4970: if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4971: RRETURN(MATCH_NOMATCH);
4972: }
4973: /* Control never gets here */
4974:
1.1.1.5 ! misho 4975: /* Perl space used to exclude VT, but from Perl 5.18 it is included,
! 4976: which means that Perl space and POSIX space are now identical. PCRE
! 4977: was changed at release 8.34. */
1.1 misho 4978:
1.1.1.5 ! misho 4979: case PT_SPACE: /* Perl space */
1.1 misho 4980: case PT_PXSPACE: /* POSIX space */
4981: for (fi = min;; fi++)
4982: {
4983: RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4984: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4985: if (fi >= max) RRETURN(MATCH_NOMATCH);
4986: if (eptr >= md->end_subject)
4987: {
4988: SCHECK_PARTIAL();
4989: RRETURN(MATCH_NOMATCH);
4990: }
4991: GETCHARINCTEST(c, eptr);
1.1.1.5 ! misho 4992: switch(c)
! 4993: {
! 4994: HSPACE_CASES:
! 4995: VSPACE_CASES:
! 4996: if (prop_fail_result) RRETURN(MATCH_NOMATCH);
! 4997: break;
! 4998:
! 4999: default:
! 5000: if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
! 5001: RRETURN(MATCH_NOMATCH);
! 5002: break;
! 5003: }
1.1 misho 5004: }
5005: /* Control never gets here */
5006:
5007: case PT_WORD:
5008: for (fi = min;; fi++)
5009: {
5010: int category;
5011: RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
5012: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5013: if (fi >= max) RRETURN(MATCH_NOMATCH);
5014: if (eptr >= md->end_subject)
5015: {
5016: SCHECK_PARTIAL();
5017: RRETURN(MATCH_NOMATCH);
5018: }
5019: GETCHARINCTEST(c, eptr);
5020: category = UCD_CATEGORY(c);
5021: if ((category == ucp_L ||
5022: category == ucp_N ||
5023: c == CHAR_UNDERSCORE)
5024: == prop_fail_result)
5025: RRETURN(MATCH_NOMATCH);
5026: }
5027: /* Control never gets here */
5028:
1.1.1.4 misho 5029: case PT_CLIST:
5030: for (fi = min;; fi++)
5031: {
5032: const pcre_uint32 *cp;
5033: RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
5034: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5035: if (fi >= max) RRETURN(MATCH_NOMATCH);
5036: if (eptr >= md->end_subject)
5037: {
5038: SCHECK_PARTIAL();
5039: RRETURN(MATCH_NOMATCH);
5040: }
5041: GETCHARINCTEST(c, eptr);
5042: cp = PRIV(ucd_caseless_sets) + prop_value;
5043: for (;;)
5044: {
5045: if (c < *cp)
5046: { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5047: if (c == *cp++)
5048: { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5049: }
5050: }
5051: /* Control never gets here */
1.1 misho 5052:
1.1.1.4 misho 5053: case PT_UCNC:
5054: for (fi = min;; fi++)
5055: {
1.1.1.5 ! misho 5056: RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
1.1.1.4 misho 5057: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5058: if (fi >= max) RRETURN(MATCH_NOMATCH);
5059: if (eptr >= md->end_subject)
5060: {
5061: SCHECK_PARTIAL();
5062: RRETURN(MATCH_NOMATCH);
5063: }
5064: GETCHARINCTEST(c, eptr);
5065: if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5066: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5067: c >= 0xe000) == prop_fail_result)
5068: RRETURN(MATCH_NOMATCH);
5069: }
5070: /* Control never gets here */
5071:
5072: /* This should never occur */
1.1 misho 5073: default:
5074: RRETURN(PCRE_ERROR_INTERNAL);
5075: }
5076: }
5077:
5078: /* Match extended Unicode grapheme clusters. We will get here only if the
5079: support is in the binary; otherwise a compile-time error occurs. */
5080:
5081: else if (ctype == OP_EXTUNI)
5082: {
5083: for (fi = min;; fi++)
5084: {
5085: RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5086: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5087: if (fi >= max) RRETURN(MATCH_NOMATCH);
5088: if (eptr >= md->end_subject)
5089: {
5090: SCHECK_PARTIAL();
5091: RRETURN(MATCH_NOMATCH);
5092: }
1.1.1.4 misho 5093: else
1.1 misho 5094: {
1.1.1.4 misho 5095: int lgb, rgb;
5096: GETCHARINCTEST(c, eptr);
5097: lgb = UCD_GRAPHBREAK(c);
5098: while (eptr < md->end_subject)
5099: {
5100: int len = 1;
5101: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5102: rgb = UCD_GRAPHBREAK(c);
5103: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5104: lgb = rgb;
5105: eptr += len;
5106: }
1.1 misho 5107: }
1.1.1.3 misho 5108: CHECK_PARTIAL();
1.1 misho 5109: }
5110: }
5111: else
5112: #endif /* SUPPORT_UCP */
5113:
1.1.1.2 misho 5114: #ifdef SUPPORT_UTF
5115: if (utf)
1.1 misho 5116: {
5117: for (fi = min;; fi++)
5118: {
5119: RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5120: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5121: if (fi >= max) RRETURN(MATCH_NOMATCH);
5122: if (eptr >= md->end_subject)
5123: {
5124: SCHECK_PARTIAL();
5125: RRETURN(MATCH_NOMATCH);
5126: }
5127: if (ctype == OP_ANY && IS_NEWLINE(eptr))
5128: RRETURN(MATCH_NOMATCH);
5129: GETCHARINC(c, eptr);
5130: switch(ctype)
5131: {
1.1.1.3 misho 5132: case OP_ANY: /* This is the non-NL case */
5133: if (md->partial != 0 && /* Take care with CRLF partial */
5134: eptr >= md->end_subject &&
5135: NLBLOCK->nltype == NLTYPE_FIXED &&
5136: NLBLOCK->nllen == 2 &&
5137: c == NLBLOCK->nl[0])
5138: {
5139: md->hitend = TRUE;
5140: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5141: }
5142: break;
5143:
1.1 misho 5144: case OP_ALLANY:
5145: case OP_ANYBYTE:
5146: break;
5147:
5148: case OP_ANYNL:
5149: switch(c)
5150: {
5151: default: RRETURN(MATCH_NOMATCH);
1.1.1.4 misho 5152: case CHAR_CR:
5153: if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
1.1 misho 5154: break;
1.1.1.4 misho 5155:
5156: case CHAR_LF:
1.1 misho 5157: break;
5158:
1.1.1.4 misho 5159: case CHAR_VT:
5160: case CHAR_FF:
5161: case CHAR_NEL:
5162: #ifndef EBCDIC
1.1 misho 5163: case 0x2028:
5164: case 0x2029:
1.1.1.4 misho 5165: #endif /* Not EBCDIC */
1.1 misho 5166: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5167: break;
5168: }
5169: break;
5170:
5171: case OP_NOT_HSPACE:
5172: switch(c)
5173: {
1.1.1.4 misho 5174: HSPACE_CASES: RRETURN(MATCH_NOMATCH);
1.1 misho 5175: default: break;
5176: }
5177: break;
5178:
5179: case OP_HSPACE:
5180: switch(c)
5181: {
1.1.1.4 misho 5182: HSPACE_CASES: break;
1.1 misho 5183: default: RRETURN(MATCH_NOMATCH);
5184: }
5185: break;
5186:
5187: case OP_NOT_VSPACE:
5188: switch(c)
5189: {
1.1.1.4 misho 5190: VSPACE_CASES: RRETURN(MATCH_NOMATCH);
1.1 misho 5191: default: break;
5192: }
5193: break;
5194:
5195: case OP_VSPACE:
5196: switch(c)
5197: {
1.1.1.4 misho 5198: VSPACE_CASES: break;
1.1 misho 5199: default: RRETURN(MATCH_NOMATCH);
5200: }
5201: break;
5202:
5203: case OP_NOT_DIGIT:
5204: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5205: RRETURN(MATCH_NOMATCH);
5206: break;
5207:
5208: case OP_DIGIT:
5209: if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5210: RRETURN(MATCH_NOMATCH);
5211: break;
5212:
5213: case OP_NOT_WHITESPACE:
5214: if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5215: RRETURN(MATCH_NOMATCH);
5216: break;
5217:
5218: case OP_WHITESPACE:
1.1.1.2 misho 5219: if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
1.1 misho 5220: RRETURN(MATCH_NOMATCH);
5221: break;
5222:
5223: case OP_NOT_WORDCHAR:
5224: if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5225: RRETURN(MATCH_NOMATCH);
5226: break;
5227:
5228: case OP_WORDCHAR:
5229: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5230: RRETURN(MATCH_NOMATCH);
5231: break;
5232:
5233: default:
5234: RRETURN(PCRE_ERROR_INTERNAL);
5235: }
5236: }
5237: }
5238: else
5239: #endif
1.1.1.2 misho 5240: /* Not UTF mode */
1.1 misho 5241: {
5242: for (fi = min;; fi++)
5243: {
5244: RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5245: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5246: if (fi >= max) RRETURN(MATCH_NOMATCH);
5247: if (eptr >= md->end_subject)
5248: {
5249: SCHECK_PARTIAL();
5250: RRETURN(MATCH_NOMATCH);
5251: }
5252: if (ctype == OP_ANY && IS_NEWLINE(eptr))
5253: RRETURN(MATCH_NOMATCH);
5254: c = *eptr++;
5255: switch(ctype)
5256: {
1.1.1.3 misho 5257: case OP_ANY: /* This is the non-NL case */
5258: if (md->partial != 0 && /* Take care with CRLF partial */
5259: eptr >= md->end_subject &&
5260: NLBLOCK->nltype == NLTYPE_FIXED &&
5261: NLBLOCK->nllen == 2 &&
5262: c == NLBLOCK->nl[0])
5263: {
5264: md->hitend = TRUE;
5265: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5266: }
5267: break;
5268:
1.1 misho 5269: case OP_ALLANY:
5270: case OP_ANYBYTE:
5271: break;
5272:
5273: case OP_ANYNL:
5274: switch(c)
5275: {
5276: default: RRETURN(MATCH_NOMATCH);
1.1.1.4 misho 5277: case CHAR_CR:
5278: if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
1.1 misho 5279: break;
5280:
1.1.1.4 misho 5281: case CHAR_LF:
1.1 misho 5282: break;
5283:
1.1.1.4 misho 5284: case CHAR_VT:
5285: case CHAR_FF:
5286: case CHAR_NEL:
5287: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1.1.1.2 misho 5288: case 0x2028:
5289: case 0x2029:
5290: #endif
1.1 misho 5291: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5292: break;
5293: }
5294: break;
5295:
5296: case OP_NOT_HSPACE:
5297: switch(c)
5298: {
5299: default: break;
1.1.1.4 misho 5300: HSPACE_BYTE_CASES:
5301: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5302: HSPACE_MULTIBYTE_CASES:
1.1.1.2 misho 5303: #endif
1.1 misho 5304: RRETURN(MATCH_NOMATCH);
5305: }
5306: break;
5307:
5308: case OP_HSPACE:
5309: switch(c)
5310: {
5311: default: RRETURN(MATCH_NOMATCH);
1.1.1.4 misho 5312: HSPACE_BYTE_CASES:
5313: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5314: HSPACE_MULTIBYTE_CASES:
1.1.1.2 misho 5315: #endif
1.1 misho 5316: break;
5317: }
5318: break;
5319:
5320: case OP_NOT_VSPACE:
5321: switch(c)
5322: {
5323: default: break;
1.1.1.4 misho 5324: VSPACE_BYTE_CASES:
5325: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5326: VSPACE_MULTIBYTE_CASES:
1.1.1.2 misho 5327: #endif
1.1 misho 5328: RRETURN(MATCH_NOMATCH);
5329: }
5330: break;
5331:
5332: case OP_VSPACE:
5333: switch(c)
5334: {
5335: default: RRETURN(MATCH_NOMATCH);
1.1.1.4 misho 5336: VSPACE_BYTE_CASES:
5337: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5338: VSPACE_MULTIBYTE_CASES:
1.1.1.2 misho 5339: #endif
1.1 misho 5340: break;
5341: }
5342: break;
5343:
5344: case OP_NOT_DIGIT:
1.1.1.2 misho 5345: if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
1.1 misho 5346: break;
5347:
5348: case OP_DIGIT:
1.1.1.2 misho 5349: if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
1.1 misho 5350: break;
5351:
5352: case OP_NOT_WHITESPACE:
1.1.1.2 misho 5353: if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
1.1 misho 5354: break;
5355:
5356: case OP_WHITESPACE:
1.1.1.2 misho 5357: if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
1.1 misho 5358: break;
5359:
5360: case OP_NOT_WORDCHAR:
1.1.1.2 misho 5361: if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
1.1 misho 5362: break;
5363:
5364: case OP_WORDCHAR:
1.1.1.2 misho 5365: if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
1.1 misho 5366: break;
5367:
5368: default:
5369: RRETURN(PCRE_ERROR_INTERNAL);
5370: }
5371: }
5372: }
5373: /* Control never gets here */
5374: }
5375:
5376: /* If maximizing, it is worth using inline code for speed, doing the type
5377: test once at the start (i.e. keep it out of the loop). Again, keep the
5378: UTF-8 and UCP stuff separate. */
5379:
5380: else
5381: {
5382: pp = eptr; /* Remember where we started */
5383:
5384: #ifdef SUPPORT_UCP
5385: if (prop_type >= 0)
5386: {
5387: switch(prop_type)
5388: {
5389: case PT_ANY:
5390: for (i = min; i < max; i++)
5391: {
5392: int len = 1;
5393: if (eptr >= md->end_subject)
5394: {
5395: SCHECK_PARTIAL();
5396: break;
5397: }
5398: GETCHARLENTEST(c, eptr, len);
5399: if (prop_fail_result) break;
5400: eptr+= len;
5401: }
5402: break;
5403:
5404: case PT_LAMP:
5405: for (i = min; i < max; i++)
5406: {
5407: int chartype;
5408: int len = 1;
5409: if (eptr >= md->end_subject)
5410: {
5411: SCHECK_PARTIAL();
5412: break;
5413: }
5414: GETCHARLENTEST(c, eptr, len);
5415: chartype = UCD_CHARTYPE(c);
5416: if ((chartype == ucp_Lu ||
5417: chartype == ucp_Ll ||
5418: chartype == ucp_Lt) == prop_fail_result)
5419: break;
5420: eptr+= len;
5421: }
5422: break;
5423:
5424: case PT_GC:
5425: for (i = min; i < max; i++)
5426: {
5427: int len = 1;
5428: if (eptr >= md->end_subject)
5429: {
5430: SCHECK_PARTIAL();
5431: break;
5432: }
5433: GETCHARLENTEST(c, eptr, len);
5434: if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5435: eptr+= len;
5436: }
5437: break;
5438:
5439: case PT_PC:
5440: for (i = min; i < max; i++)
5441: {
5442: int len = 1;
5443: if (eptr >= md->end_subject)
5444: {
5445: SCHECK_PARTIAL();
5446: break;
5447: }
5448: GETCHARLENTEST(c, eptr, len);
5449: if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5450: eptr+= len;
5451: }
5452: break;
5453:
5454: case PT_SC:
5455: for (i = min; i < max; i++)
5456: {
5457: int len = 1;
5458: if (eptr >= md->end_subject)
5459: {
5460: SCHECK_PARTIAL();
5461: break;
5462: }
5463: GETCHARLENTEST(c, eptr, len);
5464: if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5465: eptr+= len;
5466: }
5467: break;
5468:
5469: case PT_ALNUM:
5470: for (i = min; i < max; i++)
5471: {
5472: int category;
5473: int len = 1;
5474: if (eptr >= md->end_subject)
5475: {
5476: SCHECK_PARTIAL();
5477: break;
5478: }
5479: GETCHARLENTEST(c, eptr, len);
5480: category = UCD_CATEGORY(c);
5481: if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5482: break;
5483: eptr+= len;
5484: }
5485: break;
5486:
1.1.1.5 ! misho 5487: /* Perl space used to exclude VT, but from Perl 5.18 it is included,
! 5488: which means that Perl space and POSIX space are now identical. PCRE
! 5489: was changed at release 8.34. */
! 5490:
1.1 misho 5491: case PT_SPACE: /* Perl space */
1.1.1.5 ! misho 5492: case PT_PXSPACE: /* POSIX space */
1.1 misho 5493: for (i = min; i < max; i++)
5494: {
5495: int len = 1;
5496: if (eptr >= md->end_subject)
5497: {
5498: SCHECK_PARTIAL();
5499: break;
5500: }
5501: GETCHARLENTEST(c, eptr, len);
1.1.1.5 ! misho 5502: switch(c)
! 5503: {
! 5504: HSPACE_CASES:
! 5505: VSPACE_CASES:
! 5506: if (prop_fail_result) goto ENDLOOP99; /* Break the loop */
1.1 misho 5507: break;
5508:
1.1.1.5 ! misho 5509: default:
! 5510: if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
! 5511: goto ENDLOOP99; /* Break the loop */
1.1 misho 5512: break;
5513: }
5514: eptr+= len;
5515: }
1.1.1.5 ! misho 5516: ENDLOOP99:
1.1 misho 5517: break;
5518:
5519: case PT_WORD:
5520: for (i = min; i < max; i++)
5521: {
5522: int category;
5523: int len = 1;
5524: if (eptr >= md->end_subject)
5525: {
5526: SCHECK_PARTIAL();
5527: break;
5528: }
5529: GETCHARLENTEST(c, eptr, len);
5530: category = UCD_CATEGORY(c);
5531: if ((category == ucp_L || category == ucp_N ||
5532: c == CHAR_UNDERSCORE) == prop_fail_result)
5533: break;
5534: eptr+= len;
5535: }
5536: break;
5537:
1.1.1.4 misho 5538: case PT_CLIST:
5539: for (i = min; i < max; i++)
5540: {
5541: const pcre_uint32 *cp;
5542: int len = 1;
5543: if (eptr >= md->end_subject)
5544: {
5545: SCHECK_PARTIAL();
5546: break;
5547: }
5548: GETCHARLENTEST(c, eptr, len);
5549: cp = PRIV(ucd_caseless_sets) + prop_value;
5550: for (;;)
5551: {
5552: if (c < *cp)
5553: { if (prop_fail_result) break; else goto GOT_MAX; }
5554: if (c == *cp++)
5555: { if (prop_fail_result) goto GOT_MAX; else break; }
5556: }
5557: eptr += len;
5558: }
5559: GOT_MAX:
5560: break;
5561:
5562: case PT_UCNC:
5563: for (i = min; i < max; i++)
5564: {
5565: int len = 1;
5566: if (eptr >= md->end_subject)
5567: {
5568: SCHECK_PARTIAL();
5569: break;
5570: }
5571: GETCHARLENTEST(c, eptr, len);
5572: if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5573: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5574: c >= 0xe000) == prop_fail_result)
5575: break;
5576: eptr += len;
5577: }
5578: break;
5579:
1.1 misho 5580: default:
5581: RRETURN(PCRE_ERROR_INTERNAL);
5582: }
5583:
5584: /* eptr is now past the end of the maximum run */
5585:
1.1.1.4 misho 5586: if (possessive) continue; /* No backtracking */
1.1 misho 5587: for(;;)
5588: {
1.1.1.4 misho 5589: if (eptr == pp) goto TAIL_RECURSE;
1.1 misho 5590: RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5591: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1.1.4 misho 5592: eptr--;
1.1.1.2 misho 5593: if (utf) BACKCHAR(eptr);
1.1 misho 5594: }
5595: }
5596:
1.1.1.5 ! misho 5597: /* Match extended Unicode grapheme clusters. We will get here only if the
1.1 misho 5598: support is in the binary; otherwise a compile-time error occurs. */
5599:
5600: else if (ctype == OP_EXTUNI)
5601: {
5602: for (i = min; i < max; i++)
5603: {
5604: if (eptr >= md->end_subject)
5605: {
5606: SCHECK_PARTIAL();
5607: break;
5608: }
1.1.1.4 misho 5609: else
1.1 misho 5610: {
1.1.1.4 misho 5611: int lgb, rgb;
5612: GETCHARINCTEST(c, eptr);
5613: lgb = UCD_GRAPHBREAK(c);
5614: while (eptr < md->end_subject)
5615: {
5616: int len = 1;
5617: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5618: rgb = UCD_GRAPHBREAK(c);
5619: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5620: lgb = rgb;
5621: eptr += len;
5622: }
1.1 misho 5623: }
1.1.1.3 misho 5624: CHECK_PARTIAL();
1.1 misho 5625: }
5626:
5627: /* eptr is now past the end of the maximum run */
5628:
1.1.1.4 misho 5629: if (possessive) continue; /* No backtracking */
1.1.1.5 ! misho 5630:
1.1 misho 5631: for(;;)
5632: {
1.1.1.5 ! misho 5633: int lgb, rgb;
! 5634: PCRE_PUCHAR fptr;
! 5635:
! 5636: if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
1.1 misho 5637: RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5638: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1.1.5 ! misho 5639:
! 5640: /* Backtracking over an extended grapheme cluster involves inspecting
! 5641: the previous two characters (if present) to see if a break is
! 5642: permitted between them. */
! 5643:
1.1.1.4 misho 5644: eptr--;
1.1.1.5 ! misho 5645: if (!utf) c = *eptr; else
! 5646: {
! 5647: BACKCHAR(eptr);
! 5648: GETCHAR(c, eptr);
! 5649: }
! 5650: rgb = UCD_GRAPHBREAK(c);
! 5651:
! 5652: for (;;)
1.1 misho 5653: {
1.1.1.5 ! misho 5654: if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
! 5655: fptr = eptr - 1;
! 5656: if (!utf) c = *fptr; else
1.1 misho 5657: {
1.1.1.5 ! misho 5658: BACKCHAR(fptr);
! 5659: GETCHAR(c, fptr);
1.1 misho 5660: }
1.1.1.5 ! misho 5661: lgb = UCD_GRAPHBREAK(c);
! 5662: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
! 5663: eptr = fptr;
! 5664: rgb = lgb;
1.1 misho 5665: }
5666: }
5667: }
5668:
5669: else
5670: #endif /* SUPPORT_UCP */
5671:
1.1.1.2 misho 5672: #ifdef SUPPORT_UTF
5673: if (utf)
1.1 misho 5674: {
5675: switch(ctype)
5676: {
5677: case OP_ANY:
5678: if (max < INT_MAX)
5679: {
5680: for (i = min; i < max; i++)
5681: {
5682: if (eptr >= md->end_subject)
5683: {
5684: SCHECK_PARTIAL();
5685: break;
5686: }
5687: if (IS_NEWLINE(eptr)) break;
1.1.1.3 misho 5688: if (md->partial != 0 && /* Take care with CRLF partial */
5689: eptr + 1 >= md->end_subject &&
5690: NLBLOCK->nltype == NLTYPE_FIXED &&
5691: NLBLOCK->nllen == 2 &&
1.1.1.4 misho 5692: RAWUCHAR(eptr) == NLBLOCK->nl[0])
1.1.1.3 misho 5693: {
5694: md->hitend = TRUE;
5695: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5696: }
1.1 misho 5697: eptr++;
1.1.1.2 misho 5698: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misho 5699: }
5700: }
5701:
5702: /* Handle unlimited UTF-8 repeat */
5703:
5704: else
5705: {
5706: for (i = min; i < max; i++)
5707: {
5708: if (eptr >= md->end_subject)
5709: {
5710: SCHECK_PARTIAL();
5711: break;
5712: }
5713: if (IS_NEWLINE(eptr)) break;
1.1.1.3 misho 5714: if (md->partial != 0 && /* Take care with CRLF partial */
5715: eptr + 1 >= md->end_subject &&
5716: NLBLOCK->nltype == NLTYPE_FIXED &&
5717: NLBLOCK->nllen == 2 &&
1.1.1.4 misho 5718: RAWUCHAR(eptr) == NLBLOCK->nl[0])
1.1.1.3 misho 5719: {
5720: md->hitend = TRUE;
5721: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5722: }
1.1 misho 5723: eptr++;
1.1.1.2 misho 5724: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misho 5725: }
5726: }
5727: break;
5728:
5729: case OP_ALLANY:
5730: if (max < INT_MAX)
5731: {
5732: for (i = min; i < max; i++)
5733: {
5734: if (eptr >= md->end_subject)
5735: {
5736: SCHECK_PARTIAL();
5737: break;
5738: }
5739: eptr++;
1.1.1.2 misho 5740: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misho 5741: }
5742: }
5743: else
5744: {
5745: eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5746: SCHECK_PARTIAL();
5747: }
5748: break;
5749:
5750: /* The byte case is the same as non-UTF8 */
5751:
5752: case OP_ANYBYTE:
5753: c = max - min;
5754: if (c > (unsigned int)(md->end_subject - eptr))
5755: {
5756: eptr = md->end_subject;
5757: SCHECK_PARTIAL();
5758: }
5759: else eptr += c;
5760: break;
5761:
5762: case OP_ANYNL:
5763: for (i = min; i < max; i++)
5764: {
5765: int len = 1;
5766: if (eptr >= md->end_subject)
5767: {
5768: SCHECK_PARTIAL();
5769: break;
5770: }
5771: GETCHARLEN(c, eptr, len);
1.1.1.4 misho 5772: if (c == CHAR_CR)
1.1 misho 5773: {
5774: if (++eptr >= md->end_subject) break;
1.1.1.4 misho 5775: if (RAWUCHAR(eptr) == CHAR_LF) eptr++;
1.1 misho 5776: }
5777: else
5778: {
1.1.1.4 misho 5779: if (c != CHAR_LF &&
1.1 misho 5780: (md->bsr_anycrlf ||
1.1.1.4 misho 5781: (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5782: #ifndef EBCDIC
5783: && c != 0x2028 && c != 0x2029
5784: #endif /* Not EBCDIC */
5785: )))
1.1 misho 5786: break;
5787: eptr += len;
5788: }
5789: }
5790: break;
5791:
5792: case OP_NOT_HSPACE:
5793: case OP_HSPACE:
5794: for (i = min; i < max; i++)
5795: {
5796: BOOL gotspace;
5797: int len = 1;
5798: if (eptr >= md->end_subject)
5799: {
5800: SCHECK_PARTIAL();
5801: break;
5802: }
5803: GETCHARLEN(c, eptr, len);
5804: switch(c)
5805: {
1.1.1.4 misho 5806: HSPACE_CASES: gotspace = TRUE; break;
1.1 misho 5807: default: gotspace = FALSE; break;
5808: }
5809: if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5810: eptr += len;
5811: }
5812: break;
5813:
5814: case OP_NOT_VSPACE:
5815: case OP_VSPACE:
5816: for (i = min; i < max; i++)
5817: {
5818: BOOL gotspace;
5819: int len = 1;
5820: if (eptr >= md->end_subject)
5821: {
5822: SCHECK_PARTIAL();
5823: break;
5824: }
5825: GETCHARLEN(c, eptr, len);
5826: switch(c)
5827: {
1.1.1.4 misho 5828: VSPACE_CASES: gotspace = TRUE; break;
1.1 misho 5829: default: gotspace = FALSE; break;
5830: }
5831: if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5832: eptr += len;
5833: }
5834: break;
5835:
5836: case OP_NOT_DIGIT:
5837: for (i = min; i < max; i++)
5838: {
5839: int len = 1;
5840: if (eptr >= md->end_subject)
5841: {
5842: SCHECK_PARTIAL();
5843: break;
5844: }
5845: GETCHARLEN(c, eptr, len);
5846: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5847: eptr+= len;
5848: }
5849: break;
5850:
5851: case OP_DIGIT:
5852: for (i = min; i < max; i++)
5853: {
5854: int len = 1;
5855: if (eptr >= md->end_subject)
5856: {
5857: SCHECK_PARTIAL();
5858: break;
5859: }
5860: GETCHARLEN(c, eptr, len);
5861: if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5862: eptr+= len;
5863: }
5864: break;
5865:
5866: case OP_NOT_WHITESPACE:
5867: for (i = min; i < max; i++)
5868: {
5869: int len = 1;
5870: if (eptr >= md->end_subject)
5871: {
5872: SCHECK_PARTIAL();
5873: break;
5874: }
5875: GETCHARLEN(c, eptr, len);
5876: if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5877: eptr+= len;
5878: }
5879: break;
5880:
5881: case OP_WHITESPACE:
5882: for (i = min; i < max; i++)
5883: {
5884: int len = 1;
5885: if (eptr >= md->end_subject)
5886: {
5887: SCHECK_PARTIAL();
5888: break;
5889: }
5890: GETCHARLEN(c, eptr, len);
5891: if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5892: eptr+= len;
5893: }
5894: break;
5895:
5896: case OP_NOT_WORDCHAR:
5897: for (i = min; i < max; i++)
5898: {
5899: int len = 1;
5900: if (eptr >= md->end_subject)
5901: {
5902: SCHECK_PARTIAL();
5903: break;
5904: }
5905: GETCHARLEN(c, eptr, len);
5906: if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5907: eptr+= len;
5908: }
5909: break;
5910:
5911: case OP_WORDCHAR:
5912: for (i = min; i < max; i++)
5913: {
5914: int len = 1;
5915: if (eptr >= md->end_subject)
5916: {
5917: SCHECK_PARTIAL();
5918: break;
5919: }
5920: GETCHARLEN(c, eptr, len);
5921: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5922: eptr+= len;
5923: }
5924: break;
5925:
5926: default:
5927: RRETURN(PCRE_ERROR_INTERNAL);
5928: }
5929:
1.1.1.4 misho 5930: if (possessive) continue; /* No backtracking */
1.1 misho 5931: for(;;)
5932: {
1.1.1.4 misho 5933: if (eptr == pp) goto TAIL_RECURSE;
1.1 misho 5934: RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5935: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1.1.4 misho 5936: eptr--;
1.1 misho 5937: BACKCHAR(eptr);
1.1.1.4 misho 5938: if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL &&
5939: RAWUCHAR(eptr - 1) == CHAR_CR) eptr--;
1.1 misho 5940: }
5941: }
5942: else
1.1.1.2 misho 5943: #endif /* SUPPORT_UTF */
5944: /* Not UTF mode */
1.1 misho 5945: {
5946: switch(ctype)
5947: {
5948: case OP_ANY:
5949: for (i = min; i < max; i++)
5950: {
5951: if (eptr >= md->end_subject)
5952: {
5953: SCHECK_PARTIAL();
5954: break;
5955: }
5956: if (IS_NEWLINE(eptr)) break;
1.1.1.3 misho 5957: if (md->partial != 0 && /* Take care with CRLF partial */
5958: eptr + 1 >= md->end_subject &&
5959: NLBLOCK->nltype == NLTYPE_FIXED &&
5960: NLBLOCK->nllen == 2 &&
5961: *eptr == NLBLOCK->nl[0])
5962: {
5963: md->hitend = TRUE;
5964: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5965: }
1.1 misho 5966: eptr++;
5967: }
5968: break;
5969:
5970: case OP_ALLANY:
5971: case OP_ANYBYTE:
5972: c = max - min;
5973: if (c > (unsigned int)(md->end_subject - eptr))
5974: {
5975: eptr = md->end_subject;
5976: SCHECK_PARTIAL();
5977: }
5978: else eptr += c;
5979: break;
5980:
5981: case OP_ANYNL:
5982: for (i = min; i < max; i++)
5983: {
5984: if (eptr >= md->end_subject)
5985: {
5986: SCHECK_PARTIAL();
5987: break;
5988: }
5989: c = *eptr;
1.1.1.4 misho 5990: if (c == CHAR_CR)
1.1 misho 5991: {
5992: if (++eptr >= md->end_subject) break;
1.1.1.4 misho 5993: if (*eptr == CHAR_LF) eptr++;
1.1 misho 5994: }
5995: else
5996: {
1.1.1.4 misho 5997: if (c != CHAR_LF && (md->bsr_anycrlf ||
5998: (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5999: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6000: && c != 0x2028 && c != 0x2029
1.1.1.2 misho 6001: #endif
1.1.1.4 misho 6002: ))) break;
1.1 misho 6003: eptr++;
6004: }
6005: }
6006: break;
6007:
6008: case OP_NOT_HSPACE:
6009: for (i = min; i < max; i++)
6010: {
6011: if (eptr >= md->end_subject)
6012: {
6013: SCHECK_PARTIAL();
6014: break;
6015: }
1.1.1.4 misho 6016: switch(*eptr)
6017: {
6018: default: eptr++; break;
6019: HSPACE_BYTE_CASES:
6020: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6021: HSPACE_MULTIBYTE_CASES:
1.1.1.2 misho 6022: #endif
1.1.1.4 misho 6023: goto ENDLOOP00;
6024: }
1.1 misho 6025: }
1.1.1.4 misho 6026: ENDLOOP00:
1.1 misho 6027: break;
6028:
6029: case OP_HSPACE:
6030: for (i = min; i < max; i++)
6031: {
6032: if (eptr >= md->end_subject)
6033: {
6034: SCHECK_PARTIAL();
6035: break;
6036: }
1.1.1.4 misho 6037: switch(*eptr)
6038: {
6039: default: goto ENDLOOP01;
6040: HSPACE_BYTE_CASES:
6041: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6042: HSPACE_MULTIBYTE_CASES:
1.1.1.2 misho 6043: #endif
1.1.1.4 misho 6044: eptr++; break;
6045: }
1.1 misho 6046: }
1.1.1.4 misho 6047: ENDLOOP01:
1.1 misho 6048: break;
6049:
6050: case OP_NOT_VSPACE:
6051: for (i = min; i < max; i++)
6052: {
6053: if (eptr >= md->end_subject)
6054: {
6055: SCHECK_PARTIAL();
6056: break;
6057: }
1.1.1.4 misho 6058: switch(*eptr)
6059: {
6060: default: eptr++; break;
6061: VSPACE_BYTE_CASES:
6062: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6063: VSPACE_MULTIBYTE_CASES:
1.1.1.2 misho 6064: #endif
1.1.1.4 misho 6065: goto ENDLOOP02;
6066: }
1.1 misho 6067: }
1.1.1.4 misho 6068: ENDLOOP02:
1.1 misho 6069: break;
6070:
6071: case OP_VSPACE:
6072: for (i = min; i < max; i++)
6073: {
6074: if (eptr >= md->end_subject)
6075: {
6076: SCHECK_PARTIAL();
6077: break;
6078: }
1.1.1.4 misho 6079: switch(*eptr)
6080: {
6081: default: goto ENDLOOP03;
6082: VSPACE_BYTE_CASES:
6083: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6084: VSPACE_MULTIBYTE_CASES:
1.1.1.2 misho 6085: #endif
1.1.1.4 misho 6086: eptr++; break;
6087: }
1.1 misho 6088: }
1.1.1.4 misho 6089: ENDLOOP03:
1.1 misho 6090: break;
6091:
6092: case OP_NOT_DIGIT:
6093: for (i = min; i < max; i++)
6094: {
6095: if (eptr >= md->end_subject)
6096: {
6097: SCHECK_PARTIAL();
6098: break;
6099: }
1.1.1.2 misho 6100: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
1.1 misho 6101: eptr++;
6102: }
6103: break;
6104:
6105: case OP_DIGIT:
6106: for (i = min; i < max; i++)
6107: {
6108: if (eptr >= md->end_subject)
6109: {
6110: SCHECK_PARTIAL();
6111: break;
6112: }
1.1.1.2 misho 6113: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
1.1 misho 6114: eptr++;
6115: }
6116: break;
6117:
6118: case OP_NOT_WHITESPACE:
6119: for (i = min; i < max; i++)
6120: {
6121: if (eptr >= md->end_subject)
6122: {
6123: SCHECK_PARTIAL();
6124: break;
6125: }
1.1.1.2 misho 6126: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
1.1 misho 6127: eptr++;
6128: }
6129: break;
6130:
6131: case OP_WHITESPACE:
6132: for (i = min; i < max; i++)
6133: {
6134: if (eptr >= md->end_subject)
6135: {
6136: SCHECK_PARTIAL();
6137: break;
6138: }
1.1.1.2 misho 6139: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
1.1 misho 6140: eptr++;
6141: }
6142: break;
6143:
6144: case OP_NOT_WORDCHAR:
6145: for (i = min; i < max; i++)
6146: {
6147: if (eptr >= md->end_subject)
6148: {
6149: SCHECK_PARTIAL();
6150: break;
6151: }
1.1.1.2 misho 6152: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
1.1 misho 6153: eptr++;
6154: }
6155: break;
6156:
6157: case OP_WORDCHAR:
6158: for (i = min; i < max; i++)
6159: {
6160: if (eptr >= md->end_subject)
6161: {
6162: SCHECK_PARTIAL();
6163: break;
6164: }
1.1.1.2 misho 6165: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
1.1 misho 6166: eptr++;
6167: }
6168: break;
6169:
6170: default:
6171: RRETURN(PCRE_ERROR_INTERNAL);
6172: }
6173:
1.1.1.4 misho 6174: if (possessive) continue; /* No backtracking */
6175: for (;;)
1.1 misho 6176: {
1.1.1.4 misho 6177: if (eptr == pp) goto TAIL_RECURSE;
1.1 misho 6178: RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6179: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6180: eptr--;
1.1.1.4 misho 6181: if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6182: eptr[-1] == CHAR_CR) eptr--;
1.1 misho 6183: }
6184: }
6185:
1.1.1.5 ! misho 6186: /* Control never gets here */
1.1 misho 6187: }
6188:
6189: /* There's been some horrible disaster. Arrival here can only mean there is
6190: something seriously wrong in the code above or the OP_xxx definitions. */
6191:
6192: default:
6193: DPRINTF(("Unknown opcode %d\n", *ecode));
6194: RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6195: }
6196:
6197: /* Do not stick any code in here without much thought; it is assumed
6198: that "continue" in the code above comes out to here to repeat the main
6199: loop. */
6200:
6201: } /* End of main loop */
6202: /* Control never reaches here */
6203:
6204:
6205: /* When compiling to use the heap rather than the stack for recursive calls to
6206: match(), the RRETURN() macro jumps here. The number that is saved in
6207: frame->Xwhere indicates which label we actually want to return to. */
6208:
6209: #ifdef NO_RECURSE
6210: #define LBL(val) case val: goto L_RM##val;
6211: HEAP_RETURN:
6212: switch (frame->Xwhere)
6213: {
6214: LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6215: LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6216: LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6217: LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6218: LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6219: LBL(65) LBL(66)
1.1.1.2 misho 6220: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1.1.1.5 ! misho 6221: LBL(20) LBL(21)
1.1.1.2 misho 6222: #endif
6223: #ifdef SUPPORT_UTF
1.1.1.5 ! misho 6224: LBL(16) LBL(18)
1.1.1.2 misho 6225: LBL(22) LBL(23) LBL(28) LBL(30)
1.1 misho 6226: LBL(32) LBL(34) LBL(42) LBL(46)
6227: #ifdef SUPPORT_UCP
6228: LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
1.1.1.5 ! misho 6229: LBL(59) LBL(60) LBL(61) LBL(62) LBL(67)
1.1 misho 6230: #endif /* SUPPORT_UCP */
1.1.1.2 misho 6231: #endif /* SUPPORT_UTF */
1.1 misho 6232: default:
6233: DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6234: return PCRE_ERROR_INTERNAL;
6235: }
6236: #undef LBL
6237: #endif /* NO_RECURSE */
6238: }
6239:
6240:
6241: /***************************************************************************
6242: ****************************************************************************
6243: RECURSION IN THE match() FUNCTION
6244:
6245: Undefine all the macros that were defined above to handle this. */
6246:
6247: #ifdef NO_RECURSE
6248: #undef eptr
6249: #undef ecode
6250: #undef mstart
6251: #undef offset_top
6252: #undef eptrb
6253: #undef flags
6254:
6255: #undef callpat
6256: #undef charptr
6257: #undef data
6258: #undef next
6259: #undef pp
6260: #undef prev
6261: #undef saved_eptr
6262:
6263: #undef new_recursive
6264:
6265: #undef cur_is_word
6266: #undef condition
6267: #undef prev_is_word
6268:
6269: #undef ctype
6270: #undef length
6271: #undef max
6272: #undef min
6273: #undef number
6274: #undef offset
6275: #undef op
6276: #undef save_capture_last
6277: #undef save_offset1
6278: #undef save_offset2
6279: #undef save_offset3
6280: #undef stacksave
6281:
6282: #undef newptrb
6283:
6284: #endif /* NO_RECURSE */
6285:
6286: /* fc and fi are defined as macros whether or not NO_RECURSE is set, so they
     are undefined unconditionally, outside the #ifdef above. */
6287:
6288: #undef fc
6289: #undef fi
6290:
6291: /***************************************************************************
6292: ***************************************************************************/
6293:
6294:
1.1.1.3 misho 6295: #ifdef NO_RECURSE
6296: /*************************************************
6297: * Release allocated heap frames *
6298: *************************************************/
6299:
6300: /* This function releases all the allocated frames. The base frame is on the
6301: machine stack, and so must not be freed; only frames reached via Xnextframe are.
6302:
6303: Argument: the address of the base frame (it is not modified)
6304: Returns: nothing
6305: */
6306:
6307: static void
6308: release_match_heapframes (heapframe *frame_base)
6309: {
6310: heapframe *nextframe = frame_base->Xnextframe; /* first frame after the base, if any */
6311: while (nextframe != NULL) /* a NULL link ends the chain */
6312: {
6313: heapframe *oldframe = nextframe;
6314: nextframe = nextframe->Xnextframe; /* read the link before the frame is freed */
6315: (PUBL(stack_free))(oldframe);
6316: }
6317: }
6318: #endif /* NO_RECURSE */
6319:
1.1 misho 6320:
6321: /*************************************************
6322: * Execute a Regular Expression *
6323: *************************************************/
6324:
6325: /* This function applies a compiled re to a subject string and picks out
6326: portions of the string if it matches. Two elements in the vector are set for
6327: each substring: the offsets to the start and end of the substring.
6328:
6329: Arguments:
6330: argument_re points to the compiled expression
6331: extra_data points to extra data or is NULL
6332: subject points to the subject string
6333: length length of subject string (may contain binary zeros)
6334: start_offset where to start in the subject string
6335: options option bits
6336: offsets points to a vector of ints to be filled in with offsets
6337: offsetcount the number of elements in the vector
6338:
6339: Returns: > 0 => success; value is the number of elements filled in
6340: = 0 => success, but offsets is not big enough
6341: -1 => failed to match
6342: < -1 => some kind of unexpected problem
6343: */
6344:
1.1.1.4 misho 6345: #if defined COMPILE_PCRE8
1.1 misho 6346: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6347: pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6348: PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6349: int offsetcount)
1.1.1.4 misho 6350: #elif defined COMPILE_PCRE16
1.1.1.2 misho 6351: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6352: pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6353: PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6354: int offsetcount)
1.1.1.4 misho 6355: #elif defined COMPILE_PCRE32
6356: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6357: pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6358: PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6359: int offsetcount)
1.1.1.2 misho 6360: #endif
1.1 misho 6361: {
6362: int rc, ocount, arg_offset_max;
6363: int newline;
6364: BOOL using_temporary_offsets = FALSE;
6365: BOOL anchored;
6366: BOOL startline;
6367: BOOL firstline;
1.1.1.2 misho 6368: BOOL utf;
6369: BOOL has_first_char = FALSE;
6370: BOOL has_req_char = FALSE;
6371: pcre_uchar first_char = 0;
6372: pcre_uchar first_char2 = 0;
6373: pcre_uchar req_char = 0;
6374: pcre_uchar req_char2 = 0;
1.1 misho 6375: match_data match_block;
6376: match_data *md = &match_block;
1.1.1.2 misho 6377: const pcre_uint8 *tables;
6378: const pcre_uint8 *start_bits = NULL;
6379: PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6380: PCRE_PUCHAR end_subject;
6381: PCRE_PUCHAR start_partial = NULL;
1.1.1.5 ! misho 6382: PCRE_PUCHAR match_partial = NULL;
1.1.1.2 misho 6383: PCRE_PUCHAR req_char_ptr = start_match - 1;
1.1 misho 6384:
6385: const pcre_study_data *study;
1.1.1.2 misho 6386: const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
1.1 misho 6387:
1.1.1.3 misho 6388: #ifdef NO_RECURSE
6389: heapframe frame_zero;
6390: frame_zero.Xprevframe = NULL; /* Marks the top level */
6391: frame_zero.Xnextframe = NULL; /* None are allocated yet */
6392: md->match_frames_base = &frame_zero;
6393: #endif
6394:
1.1.1.2 misho 6395: /* Check for the special magic call that measures the size of the stack used
1.1.1.3 misho 6396: per recursive call of match(). Without the funny casting for sizeof, a Windows
6397: compiler gave this error: "unary minus operator applied to unsigned type,
6398: result still unsigned". Hopefully the cast fixes that. */
1.1.1.2 misho 6399:
6400: if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6401: start_offset == -999)
6402: #ifdef NO_RECURSE
1.1.1.3 misho 6403: return -((int)sizeof(heapframe));
1.1.1.2 misho 6404: #else
6405: return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6406: #endif
1.1 misho 6407:
6408: /* Plausibility checks */
6409:
6410: if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
1.1.1.2 misho 6411: if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6412: return PCRE_ERROR_NULL;
1.1 misho 6413: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
1.1.1.4 misho 6414: if (length < 0) return PCRE_ERROR_BADLENGTH;
1.1 misho 6415: if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6416:
1.1.1.2 misho 6417: /* Check that the first field in the block is the magic number. If it is not,
6418: return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6419: REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6420: means that the pattern is likely compiled with different endianness. */
6421:
6422: if (re->magic_number != MAGIC_NUMBER)
6423: return re->magic_number == REVERSED_MAGIC_NUMBER?
6424: PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6425: if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6426:
1.1 misho 6427: /* These two settings are used in the code for checking a UTF-8 string that
6428: follows immediately afterwards. Other values in the md block are used only
6429: during "normal" pcre_exec() processing, not when the JIT support is in use,
6430: so they are set up later. */
6431:
1.1.1.2 misho 6432: /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6433: utf = md->utf = (re->options & PCRE_UTF8) != 0;
1.1 misho 6434: md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6435: ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6436:
6437: /* Check a UTF-8 string if required. Pass back the character offset and error
6438: code for an invalid string if a results vector is available. */
6439:
1.1.1.2 misho 6440: #ifdef SUPPORT_UTF
6441: if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
1.1 misho 6442: {
6443: int erroroffset;
1.1.1.2 misho 6444: int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
1.1 misho 6445: if (errorcode != 0)
6446: {
6447: if (offsetcount >= 2)
6448: {
6449: offsets[0] = erroroffset;
6450: offsets[1] = errorcode;
6451: }
1.1.1.4 misho 6452: #if defined COMPILE_PCRE8
1.1 misho 6453: return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6454: PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
1.1.1.4 misho 6455: #elif defined COMPILE_PCRE16
6456: return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6457: PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6458: #elif defined COMPILE_PCRE32
6459: return PCRE_ERROR_BADUTF32;
1.1.1.2 misho 6460: #endif
1.1 misho 6461: }
1.1.1.4 misho 6462: #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
1.1.1.2 misho 6463: /* Check that a start_offset points to the start of a UTF character. */
1.1 misho 6464: if (start_offset > 0 && start_offset < length &&
1.1.1.2 misho 6465: NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
1.1 misho 6466: return PCRE_ERROR_BADUTF8_OFFSET;
1.1.1.4 misho 6467: #endif
1.1 misho 6468: }
6469: #endif
6470:
6471: /* If the pattern was successfully studied with JIT support, run the JIT
6472: executable instead of the rest of this function. Most options must be set at
6473: compile time for the JIT code to be usable. Fall back to the normal code path if
1.1.1.3 misho 6474: an unsupported flag is set. */
1.1 misho 6475:
6476: #ifdef SUPPORT_JIT
6477: if (extra_data != NULL
1.1.1.3 misho 6478: && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6479: PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
1.1 misho 6480: && extra_data->executable_jit != NULL
1.1.1.4 misho 6481: && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
1.1.1.3 misho 6482: {
1.1.1.4 misho 6483: rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
1.1.1.3 misho 6484: start_offset, options, offsets, offsetcount);
6485:
6486: /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial matching
6487: mode is not compiled. In this case we simply fall back to the interpreter. */
6488:
1.1.1.4 misho 6489: if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
1.1.1.3 misho 6490: }
1.1 misho 6491: #endif
6492:
6493: /* Carry on with non-JIT matching. This information is for finding all the
6494: numbers associated with a given name, for condition testing. */
6495:
1.1.1.2 misho 6496: md->name_table = (pcre_uchar *)re + re->name_table_offset;
1.1 misho 6497: md->name_count = re->name_count;
6498: md->name_entry_size = re->name_entry_size;
6499:
6500: /* Fish out the optional data from the extra_data structure, first setting
6501: the default values. */
6502:
6503: study = NULL;
6504: md->match_limit = MATCH_LIMIT;
6505: md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6506: md->callout_data = NULL;
6507:
6508: /* The table pointer is always in native byte order. */
6509:
1.1.1.2 misho 6510: tables = re->tables;
1.1 misho 6511:
1.1.1.4 misho 6512: /* The two limit values override the defaults, whatever their value. */
6513:
1.1 misho 6514: if (extra_data != NULL)
6515: {
6516: register unsigned int flags = extra_data->flags;
6517: if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6518: study = (const pcre_study_data *)extra_data->study_data;
6519: if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6520: md->match_limit = extra_data->match_limit;
6521: if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6522: md->match_limit_recursion = extra_data->match_limit_recursion;
6523: if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6524: md->callout_data = extra_data->callout_data;
6525: if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6526: }
6527:
1.1.1.4 misho 6528: /* Limits in the regex override only if they are smaller. */
6529:
6530: if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit)
6531: md->match_limit = re->limit_match;
6532:
6533: if ((re->flags & PCRE_RLSET) != 0 &&
6534: re->limit_recursion < md->match_limit_recursion)
6535: md->match_limit_recursion = re->limit_recursion;
6536:
1.1 misho 6537: /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6538: is a feature that makes it possible to save compiled regex and re-use them
6539: in other programs later. */
6540:
1.1.1.2 misho 6541: if (tables == NULL) tables = PRIV(default_tables);
1.1 misho 6542:
6543: /* Set up other data */
6544:
6545: anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6546: startline = (re->flags & PCRE_STARTLINE) != 0;
6547: firstline = (re->options & PCRE_FIRSTLINE) != 0;
6548:
6549: /* The code starts after the real_pcre block and the capture name table. */
6550:
1.1.1.2 misho 6551: md->start_code = (const pcre_uchar *)re + re->name_table_offset +
1.1 misho 6552: re->name_count * re->name_entry_size;
6553:
1.1.1.2 misho 6554: md->start_subject = (PCRE_PUCHAR)subject;
1.1 misho 6555: md->start_offset = start_offset;
6556: md->end_subject = md->start_subject + length;
6557: end_subject = md->end_subject;
6558:
6559: md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6560: md->use_ucp = (re->options & PCRE_UCP) != 0;
6561: md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
1.1.1.4 misho 6562: md->ignore_skip_arg = 0;
1.1 misho 6563:
6564: /* Some options are unpacked into BOOL variables in the hope that testing
6565: them will be faster than individual option bits. */
6566:
6567: md->notbol = (options & PCRE_NOTBOL) != 0;
6568: md->noteol = (options & PCRE_NOTEOL) != 0;
6569: md->notempty = (options & PCRE_NOTEMPTY) != 0;
6570: md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6571:
6572: md->hitend = FALSE;
6573: md->mark = md->nomatch_mark = NULL; /* In case never set */
6574:
6575: md->recursive = NULL; /* No recursion at top level */
6576: md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6577:
6578: md->lcc = tables + lcc_offset;
1.1.1.2 misho 6579: md->fcc = tables + fcc_offset;
1.1 misho 6580: md->ctypes = tables + ctypes_offset;
6581:
6582: /* Handle different \R options. */
6583:
6584: switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6585: {
6586: case 0:
6587: if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6588: md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6589: else
6590: #ifdef BSR_ANYCRLF
6591: md->bsr_anycrlf = TRUE;
6592: #else
6593: md->bsr_anycrlf = FALSE;
6594: #endif
6595: break;
6596:
6597: case PCRE_BSR_ANYCRLF:
6598: md->bsr_anycrlf = TRUE;
6599: break;
6600:
6601: case PCRE_BSR_UNICODE:
6602: md->bsr_anycrlf = FALSE;
6603: break;
6604:
6605: default: return PCRE_ERROR_BADNEWLINE;
6606: }
6607:
6608: /* Handle different types of newline. The three bits give eight cases. If
6609: nothing is set at run time, whatever was used at compile time applies. */
6610:
6611: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6612: (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6613: {
6614: case 0: newline = NEWLINE; break; /* Compile-time default */
6615: case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6616: case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6617: case PCRE_NEWLINE_CR+
6618: PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6619: case PCRE_NEWLINE_ANY: newline = -1; break;
6620: case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6621: default: return PCRE_ERROR_BADNEWLINE;
6622: }
6623:
6624: if (newline == -2)
6625: {
6626: md->nltype = NLTYPE_ANYCRLF;
6627: }
6628: else if (newline < 0)
6629: {
6630: md->nltype = NLTYPE_ANY;
6631: }
6632: else
6633: {
6634: md->nltype = NLTYPE_FIXED;
6635: if (newline > 255)
6636: {
6637: md->nllen = 2;
6638: md->nl[0] = (newline >> 8) & 255;
6639: md->nl[1] = newline & 255;
6640: }
6641: else
6642: {
6643: md->nllen = 1;
6644: md->nl[0] = newline;
6645: }
6646: }
6647:
6648: /* Partial matching was originally supported only for a restricted set of
6649: regexes; from release 8.00 there are no restrictions, but the bits are still
6650: defined (though never set). So there's no harm in leaving this code. */
6651:
6652: if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6653: return PCRE_ERROR_BADPARTIAL;
6654:
6655: /* If the expression has got more back references than the offsets supplied can
6656: hold, we get a temporary chunk of working store to use during the matching.
6657: Otherwise, we can use the vector supplied, rounding down its size to a multiple
6658: of 3. */
6659:
6660: ocount = offsetcount - (offsetcount % 3);
6661: arg_offset_max = (2*ocount)/3;
6662:
6663: if (re->top_backref > 0 && re->top_backref >= ocount/3)
6664: {
6665: ocount = re->top_backref * 3 + 3;
1.1.1.2 misho 6666: md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
1.1 misho 6667: if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6668: using_temporary_offsets = TRUE;
6669: DPRINTF(("Got memory to hold back references\n"));
6670: }
6671: else md->offset_vector = offsets;
6672: md->offset_end = ocount;
6673: md->offset_max = (2*ocount)/3;
1.1.1.4 misho 6674: md->capture_last = 0;
1.1 misho 6675:
6676: /* Reset the working variable associated with each extraction. These should
6677: never be used unless previously set, but they get saved and restored, and so we
6678: initialize them to avoid reading uninitialized locations. Also, unset the
6679: offsets for the matched string. This is really just for tidiness with callouts,
6680: in case they inspect these fields. */
6681:
6682: if (md->offset_vector != NULL)
6683: {
6684: register int *iptr = md->offset_vector + ocount;
6685: register int *iend = iptr - re->top_bracket;
6686: if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6687: while (--iptr >= iend) *iptr = -1;
6688: md->offset_vector[0] = md->offset_vector[1] = -1;
6689: }
6690:
1.1.1.2 misho 6691: /* Set up the first character to match, if available. The first_char value is
1.1 misho 6692: never set for an anchored regular expression, but the anchoring may be forced
6693: at run time, so we have to test for anchoring. The first char may be unset for
6694: an unanchored pattern, of course. If there's no first char and the pattern was
6695: studied, there may be a bitmap of possible first characters. */
6696:
6697: if (!anchored)
6698: {
6699: if ((re->flags & PCRE_FIRSTSET) != 0)
6700: {
1.1.1.2 misho 6701: has_first_char = TRUE;
6702: first_char = first_char2 = (pcre_uchar)(re->first_char);
6703: if ((re->flags & PCRE_FCH_CASELESS) != 0)
6704: {
6705: first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6706: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6707: if (utf && first_char > 127)
6708: first_char2 = UCD_OTHERCASE(first_char);
6709: #endif
6710: }
1.1 misho 6711: }
6712: else
6713: if (!startline && study != NULL &&
6714: (study->flags & PCRE_STUDY_MAPPED) != 0)
6715: start_bits = study->start_bits;
6716: }
6717:
6718: /* For anchored or unanchored matches, there may be a "last known required
6719: character" set. */
6720:
6721: if ((re->flags & PCRE_REQCHSET) != 0)
6722: {
1.1.1.2 misho 6723: has_req_char = TRUE;
6724: req_char = req_char2 = (pcre_uchar)(re->req_char);
6725: if ((re->flags & PCRE_RCH_CASELESS) != 0)
6726: {
6727: req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6728: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6729: if (utf && req_char > 127)
6730: req_char2 = UCD_OTHERCASE(req_char);
6731: #endif
6732: }
1.1 misho 6733: }
6734:
6735:
6736: /* ==========================================================================*/
6737:
6738: /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6739: the loop runs just once. */
6740:
6741: for(;;)
6742: {
1.1.1.2 misho 6743: PCRE_PUCHAR save_end_subject = end_subject;
6744: PCRE_PUCHAR new_start_match;
1.1 misho 6745:
6746: /* If firstline is TRUE, the start of the match is constrained to the first
6747: line of a multiline string. That is, the match must be before or at the first
6748: newline. Implement this by temporarily adjusting end_subject so that we stop
6749: scanning at a newline. If the match fails at the newline, later code breaks
6750: this loop. */
6751:
6752: if (firstline)
6753: {
1.1.1.2 misho 6754: PCRE_PUCHAR t = start_match;
6755: #ifdef SUPPORT_UTF
6756: if (utf)
1.1 misho 6757: {
6758: while (t < md->end_subject && !IS_NEWLINE(t))
6759: {
6760: t++;
1.1.1.2 misho 6761: ACROSSCHAR(t < end_subject, *t, t++);
1.1 misho 6762: }
6763: }
6764: else
6765: #endif
6766: while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6767: end_subject = t;
6768: }
6769:
6770: /* There are some optimizations that avoid running the match if a known
6771: starting point is not found, or if a known later character is not present.
6772: However, there is an option that disables these, for testing and for ensuring
6773: that all callouts do actually occur. The option can be set in the regex by
6774: (*NO_START_OPT) or passed in match-time options. */
6775:
6776: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6777: {
1.1.1.2 misho 6778: /* Advance to a unique first char if there is one. */
1.1 misho 6779:
1.1.1.2 misho 6780: if (has_first_char)
1.1 misho 6781: {
1.1.1.4 misho 6782: pcre_uchar smc;
6783:
1.1.1.2 misho 6784: if (first_char != first_char2)
6785: while (start_match < end_subject &&
1.1.1.4 misho 6786: (smc = RAWUCHARTEST(start_match)) != first_char && smc != first_char2)
1.1 misho 6787: start_match++;
6788: else
1.1.1.4 misho 6789: while (start_match < end_subject && RAWUCHARTEST(start_match) != first_char)
1.1 misho 6790: start_match++;
6791: }
6792:
6793: /* Or to just after a linebreak for a multiline match */
6794:
6795: else if (startline)
6796: {
6797: if (start_match > md->start_subject + start_offset)
6798: {
1.1.1.2 misho 6799: #ifdef SUPPORT_UTF
6800: if (utf)
1.1 misho 6801: {
6802: while (start_match < end_subject && !WAS_NEWLINE(start_match))
6803: {
6804: start_match++;
1.1.1.2 misho 6805: ACROSSCHAR(start_match < end_subject, *start_match,
6806: start_match++);
1.1 misho 6807: }
6808: }
6809: else
6810: #endif
6811: while (start_match < end_subject && !WAS_NEWLINE(start_match))
6812: start_match++;
6813:
6814: /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6815: and we are now at a LF, advance the match position by one more character.
6816: */
6817:
6818: if (start_match[-1] == CHAR_CR &&
6819: (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6820: start_match < end_subject &&
1.1.1.4 misho 6821: RAWUCHARTEST(start_match) == CHAR_NL)
1.1 misho 6822: start_match++;
6823: }
6824: }
6825:
6826: /* Or to a non-unique first byte after study */
6827:
6828: else if (start_bits != NULL)
6829: {
6830: while (start_match < end_subject)
6831: {
1.1.1.4 misho 6832: register pcre_uint32 c = RAWUCHARTEST(start_match);
1.1.1.2 misho 6833: #ifndef COMPILE_PCRE8
6834: if (c > 255) c = 255;
6835: #endif
1.1 misho 6836: if ((start_bits[c/8] & (1 << (c&7))) == 0)
6837: {
6838: start_match++;
1.1.1.2 misho 6839: #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6840: /* In non 8-bit mode, the iteration will stop for
6841: characters > 255 at the beginning or not stop at all. */
6842: if (utf)
6843: ACROSSCHAR(start_match < end_subject, *start_match,
6844: start_match++);
1.1 misho 6845: #endif
6846: }
6847: else break;
6848: }
6849: }
6850: } /* Starting optimizations */
6851:
6852: /* Restore fudged end_subject */
6853:
6854: end_subject = save_end_subject;
6855:
6856: /* The following two optimizations are disabled for partial matching or if
6857: disabling is explicitly requested. */
6858:
6859: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6860: {
6861: /* If the pattern was studied, a minimum subject length may be set. This is
6862: a lower bound; no actual string of that length may actually match the
6863: pattern. Although the value is, strictly, in characters, we treat it as
6864: bytes to avoid spending too much time in this optimization. */
6865:
6866: if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6867: (pcre_uint32)(end_subject - start_match) < study->minlength)
6868: {
6869: rc = MATCH_NOMATCH;
6870: break;
6871: }
6872:
1.1.1.2 misho 6873: /* If req_char is set, we know that that character must appear in the
6874: subject for the match to succeed. If the first character is set, req_char
1.1 misho 6875: must be later in the subject; otherwise the test starts at the match point.
6876: This optimization can save a huge amount of backtracking in patterns with
6877: nested unlimited repeats that aren't going to match. Writing separate code
6878: for cased/caseless versions makes it go faster, as does using an
6879: autoincrement and backing off on a match.
6880:
6881: HOWEVER: when the subject string is very, very long, searching to its end
6882: can take a long time, and give bad performance on quite ordinary patterns.
6883: This showed up when somebody was matching something like /^\d+C/ on a
6884: 32-megabyte string... so we don't do this when the string is sufficiently
6885: long. */
6886:
1.1.1.2 misho 6887: if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
1.1 misho 6888: {
1.1.1.2 misho 6889: register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
1.1 misho 6890:
6891: /* We don't need to repeat the search if we haven't yet reached the
6892: place we found it at last time. */
6893:
1.1.1.2 misho 6894: if (p > req_char_ptr)
1.1 misho 6895: {
1.1.1.2 misho 6896: if (req_char != req_char2)
1.1 misho 6897: {
6898: while (p < end_subject)
6899: {
1.1.1.4 misho 6900: register pcre_uint32 pp = RAWUCHARINCTEST(p);
1.1.1.2 misho 6901: if (pp == req_char || pp == req_char2) { p--; break; }
1.1 misho 6902: }
6903: }
6904: else
6905: {
6906: while (p < end_subject)
6907: {
1.1.1.4 misho 6908: if (RAWUCHARINCTEST(p) == req_char) { p--; break; }
1.1 misho 6909: }
6910: }
6911:
6912: /* If we can't find the required character, break the matching loop,
6913: forcing a match failure. */
6914:
6915: if (p >= end_subject)
6916: {
6917: rc = MATCH_NOMATCH;
6918: break;
6919: }
6920:
6921: /* If we have found the required character, save the point where we
6922: found it, so that we don't search again next time round the loop if
6923: the start hasn't passed this character yet. */
6924:
1.1.1.2 misho 6925: req_char_ptr = p;
1.1 misho 6926: }
6927: }
6928: }
6929:
6930: #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6931: printf(">>>> Match against: ");
6932: pchars(start_match, end_subject - start_match, TRUE, md);
6933: printf("\n");
6934: #endif
6935:
6936: /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6937: first starting point for which a partial match was found. */
6938:
6939: md->start_match_ptr = start_match;
6940: md->start_used_ptr = start_match;
6941: md->match_call_count = 0;
6942: md->match_function_type = 0;
6943: md->end_offset_top = 0;
1.1.1.4 misho 6944: md->skip_arg_count = 0;
1.1 misho 6945: rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
1.1.1.4 misho 6946: if (md->hitend && start_partial == NULL)
6947: {
6948: start_partial = md->start_used_ptr;
6949: match_partial = start_match;
6950: }
1.1 misho 6951:
6952: switch(rc)
6953: {
6954: /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6955: the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6956: entirely. The only way we can do that is to re-do the match at the same
6957: point, with a flag to force SKIP with an argument to be ignored. Just
6958: treating this case as NOMATCH does not work because it does not check other
6959: alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6960:
6961: case MATCH_SKIP_ARG:
6962: new_start_match = start_match;
1.1.1.4 misho 6963: md->ignore_skip_arg = md->skip_arg_count;
1.1 misho 6964: break;
6965:
1.1.1.4 misho 6966: /* SKIP passes back the next starting point explicitly, but if it is no
6967: greater than the match we have just done, treat it as NOMATCH. */
1.1 misho 6968:
6969: case MATCH_SKIP:
1.1.1.4 misho 6970: if (md->start_match_ptr > start_match)
1.1 misho 6971: {
6972: new_start_match = md->start_match_ptr;
6973: break;
6974: }
6975: /* Fall through */
6976:
6977: /* NOMATCH and PRUNE advance by one character. THEN at this level acts
1.1.1.4 misho 6978: exactly like PRUNE. Unset ignore SKIP-with-argument. */
1.1 misho 6979:
6980: case MATCH_NOMATCH:
6981: case MATCH_PRUNE:
6982: case MATCH_THEN:
1.1.1.4 misho 6983: md->ignore_skip_arg = 0;
1.1 misho 6984: new_start_match = start_match + 1;
1.1.1.2 misho 6985: #ifdef SUPPORT_UTF
6986: if (utf)
6987: ACROSSCHAR(new_start_match < end_subject, *new_start_match,
6988: new_start_match++);
1.1 misho 6989: #endif
6990: break;
6991:
6992: /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6993:
6994: case MATCH_COMMIT:
6995: rc = MATCH_NOMATCH;
6996: goto ENDLOOP;
6997:
6998: /* Any other return is either a match, or some kind of error. */
6999:
7000: default:
7001: goto ENDLOOP;
7002: }
7003:
7004: /* Control reaches here for the various types of "no match at this point"
7005: result. Reset the code to MATCH_NOMATCH for subsequent checking. */
7006:
7007: rc = MATCH_NOMATCH;
7008:
7009: /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
7010: newline in the subject (though it may continue over the newline). Therefore,
7011: if we have just failed to match, starting at a newline, do not continue. */
7012:
7013: if (firstline && IS_NEWLINE(start_match)) break;
7014:
7015: /* Advance to new matching position */
7016:
7017: start_match = new_start_match;
7018:
7019: /* Break the loop if the pattern is anchored or if we have passed the end of
7020: the subject. */
7021:
7022: if (anchored || start_match > end_subject) break;
7023:
7024: /* If we have just passed a CR and we are now at a LF, and the pattern does
7025: not contain any explicit matches for \r or \n, and the newline option is CRLF
1.1.1.2 misho 7026: or ANY or ANYCRLF, advance the match position by one more character. In
7027: normal matching start_match will always be greater than the first position at
7028: this stage, but a failed *SKIP can cause a return at the same point, which is
7029: why the first test exists. */
1.1 misho 7030:
1.1.1.2 misho 7031: if (start_match > (PCRE_PUCHAR)subject + start_offset &&
7032: start_match[-1] == CHAR_CR &&
1.1 misho 7033: start_match < end_subject &&
7034: *start_match == CHAR_NL &&
7035: (re->flags & PCRE_HASCRORLF) == 0 &&
7036: (md->nltype == NLTYPE_ANY ||
7037: md->nltype == NLTYPE_ANYCRLF ||
7038: md->nllen == 2))
7039: start_match++;
7040:
7041: md->mark = NULL; /* Reset for start of next match attempt */
7042: } /* End of for(;;) "bumpalong" loop */
7043:
7044: /* ==========================================================================*/
7045:
7046: /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
7047: conditions is true:
7048:
7049: (1) The pattern is anchored or the match was failed by (*COMMIT);
7050:
7051: (2) We are past the end of the subject;
7052:
7053: (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
7054: this option requests that a match occur at or before the first newline in
7055: the subject.
7056:
7057: When we have a match and the offset vector is big enough to deal with any
7058: backreferences, captured substring offsets will already be set up. In the case
7059: where we had to get some local store to hold offsets for backreference
7060: processing, copy those that we can. In this case there need not be overflow if
7061: certain parts of the pattern were not used, even though there are more
7062: capturing parentheses than vector slots. */
7063:
7064: ENDLOOP:
7065:
7066: if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
7067: {
7068: if (using_temporary_offsets)
7069: {
7070: if (arg_offset_max >= 4)
7071: {
7072: memcpy(offsets + 2, md->offset_vector + 2,
7073: (arg_offset_max - 2) * sizeof(int));
7074: DPRINTF(("Copied offsets from temporary memory\n"));
7075: }
1.1.1.4 misho 7076: if (md->end_offset_top > arg_offset_max) md->capture_last |= OVFLBIT;
1.1 misho 7077: DPRINTF(("Freeing temporary memory\n"));
1.1.1.2 misho 7078: (PUBL(free))(md->offset_vector);
1.1 misho 7079: }
7080:
7081: /* Set the return code to the number of captured strings, or 0 if there were
7082: too many to fit into the vector. */
7083:
1.1.1.4 misho 7084: rc = ((md->capture_last & OVFLBIT) != 0 &&
7085: md->end_offset_top >= arg_offset_max)?
1.1 misho 7086: 0 : md->end_offset_top/2;
7087:
7088: /* If there is space in the offset vector, set any unused pairs at the end of
7089: the pattern to -1 for backwards compatibility. It is documented that this
7090: happens. In earlier versions, the whole set of potential capturing offsets
7091: was set to -1 each time round the loop, but this is handled differently now.
7092: "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
7093: those at the end that need unsetting here. We can't just unset them all at
7094: the start of the whole thing because they may get set in one branch that is
7095: not the final matching branch. */
7096:
7097: if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
7098: {
7099: register int *iptr, *iend;
7100: int resetcount = 2 + re->top_bracket * 2;
1.1.1.3 misho 7101: if (resetcount > offsetcount) resetcount = offsetcount;
1.1 misho 7102: iptr = offsets + md->end_offset_top;
7103: iend = offsets + resetcount;
7104: while (iptr < iend) *iptr++ = -1;
7105: }
7106:
7107: /* If there is space, set up the whole thing as substring 0. The value of
7108: md->start_match_ptr might be modified if \K was encountered on the successful
7109: matching path. */
7110:
7111: if (offsetcount < 2) rc = 0; else
7112: {
7113: offsets[0] = (int)(md->start_match_ptr - md->start_subject);
7114: offsets[1] = (int)(md->end_match_ptr - md->start_subject);
7115: }
7116:
7117: /* Return MARK data if requested */
7118:
7119: if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
1.1.1.2 misho 7120: *(extra_data->mark) = (pcre_uchar *)md->mark;
1.1 misho 7121: DPRINTF((">>>> returning %d\n", rc));
1.1.1.3 misho 7122: #ifdef NO_RECURSE
7123: release_match_heapframes(&frame_zero);
7124: #endif
1.1 misho 7125: return rc;
7126: }
7127:
7128: /* Control gets here if there has been an error, or if the overall match
7129: attempt has failed at all permitted starting positions. */
7130:
7131: if (using_temporary_offsets)
7132: {
7133: DPRINTF(("Freeing temporary memory\n"));
1.1.1.2 misho 7134: (PUBL(free))(md->offset_vector);
1.1 misho 7135: }
7136:
7137: /* For anything other than nomatch or partial match, just return the code. */
7138:
7139: if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
7140: {
7141: DPRINTF((">>>> error: returning %d\n", rc));
1.1.1.3 misho 7142: #ifdef NO_RECURSE
7143: release_match_heapframes(&frame_zero);
7144: #endif
1.1 misho 7145: return rc;
7146: }
7147:
7148: /* Handle partial matches - disable any mark data */
7149:
1.1.1.5 ! misho 7150: if (match_partial != NULL)
1.1 misho 7151: {
7152: DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
7153: md->mark = NULL;
7154: if (offsetcount > 1)
7155: {
1.1.1.2 misho 7156: offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject);
7157: offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
1.1.1.4 misho 7158: if (offsetcount > 2)
7159: offsets[2] = (int)(match_partial - (PCRE_PUCHAR)subject);
1.1 misho 7160: }
7161: rc = PCRE_ERROR_PARTIAL;
7162: }
7163:
7164: /* This is the classic nomatch case */
7165:
7166: else
7167: {
7168: DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
7169: rc = PCRE_ERROR_NOMATCH;
7170: }
7171:
7172: /* Return the MARK data if it has been requested. */
7173:
7174: if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
1.1.1.2 misho 7175: *(extra_data->mark) = (pcre_uchar *)md->nomatch_mark;
1.1.1.3 misho 7176: #ifdef NO_RECURSE
7177: release_match_heapframes(&frame_zero);
7178: #endif
1.1 misho 7179: return rc;
7180: }
7181:
7182: /* End of pcre_exec.c */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>