Annotation of embedaddon/pcre/pcre_exec.c, revision 1.1.1.4
1.1 misho 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /* PCRE is a library of functions to support regular expressions whose syntax
6: and semantics are as close as possible to those of the Perl 5 language.
7:
8: Written by Philip Hazel
1.1.1.4 ! misho 9: Copyright (c) 1997-2013 University of Cambridge
1.1 misho 10:
11: -----------------------------------------------------------------------------
12: Redistribution and use in source and binary forms, with or without
13: modification, are permitted provided that the following conditions are met:
14:
15: * Redistributions of source code must retain the above copyright notice,
16: this list of conditions and the following disclaimer.
17:
18: * Redistributions in binary form must reproduce the above copyright
19: notice, this list of conditions and the following disclaimer in the
20: documentation and/or other materials provided with the distribution.
21:
22: * Neither the name of the University of Cambridge nor the names of its
23: contributors may be used to endorse or promote products derived from
24: this software without specific prior written permission.
25:
26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36: POSSIBILITY OF SUCH DAMAGE.
37: -----------------------------------------------------------------------------
38: */
39:
40: /* This module contains pcre_exec(), the externally visible function that does
41: pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42: possible. There are also some static supporting functions. */
43:
44: #ifdef HAVE_CONFIG_H
45: #include "config.h"
46: #endif
47:
48: #define NLBLOCK md /* Block containing newline information */
49: #define PSSTART start_subject /* Field containing processed string start */
50: #define PSEND end_subject /* Field containing processed string end */
51:
52: #include "pcre_internal.h"
53:
54: /* Undefine some potentially clashing cpp symbols */
55:
56: #undef min
57: #undef max
58:
1.1.1.4 ! misho 59: /* The md->capture_last field uses the lower 16 bits for the last captured
! 60: substring (which can never be greater than 65535) and a bit in the top half
! 61: to mean "capture vector overflowed". This odd way of doing things was
! 62: implemented when it was realized that preserving and restoring the overflow bit
! 63: whenever the last capture number was saved/restored made for a neater
! 64: interface, and doing it this way saved on (a) another variable, which would
! 65: have increased the stack frame size (a big NO-NO in PCRE) and (b) another
! 66: separate set of save/restore instructions. The following defines are used in
! 67: implementing this. */
! 68:
/* Bit layout of md->capture_last: the low 16 bits carry the number of the last
captured substring, and one bit in the upper half records that the capture
vector overflowed. The values are deliberately left as plain hex literals so
that their C types (and hence the arithmetic at existing use sites) are
unchanged. */

#define CAPLMASK    0x0000ffff    /* The bits used for last_capture */
#define OVFLMASK    0xffff0000    /* The bits used for the overflow flag */
#define OVFLBIT     0x00010000    /* The bit that is set for overflow */
! 72:
1.1 misho 73: /* Values for setting in md->match_function_type to indicate two special types
74: of call to match(). We do it this way to save on using another stack variable,
75: as stack usage is to be discouraged. */
76:
#define MATCH_CONDASSERT     1  /* Called to check a condition assertion */
#define MATCH_CBEGROUP       2  /* Could-be-empty unlimited repeat group */

/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Special internal returns from the match() function. They are made
sufficiently negative to stay clear of the external error codes. */

#define MATCH_ACCEPT       (-999)
#define MATCH_KETRPOS      (-998)
#define MATCH_ONCE         (-997)

/* The next 5 must be kept together and in ascending sequence so that a test
for "any one of them" can be written as a single range check against
MATCH_BACKTRACK_MIN and MATCH_BACKTRACK_MAX. */

#define MATCH_COMMIT       (-996)
#define MATCH_PRUNE        (-995)
#define MATCH_SKIP         (-994)
#define MATCH_SKIP_ARG     (-993)
#define MATCH_THEN         (-992)
#define MATCH_BACKTRACK_MAX MATCH_THEN
#define MATCH_BACKTRACK_MIN MATCH_COMMIT
1.1 misho 101:
102: /* Maximum number of ints of offset to save on the stack for recursive calls.
103: If the offset vector is bigger, malloc is used. This should be a multiple of 3,
104: because the offset vector is always a multiple of 3 long. */
105:
/* Number of ints of offset data that are saved on the stack for a recursive
call. Kept a multiple of 3 to match the offset vector's length. */

#define REC_STACK_SAVE_MAX 30

/* Min and max values for the common repeats; for the maxima, 0 => infinity.
The two tables are parallel: entry k of each describes the same repeat. */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
112:
113: #ifdef PCRE_DEBUG
114: /*************************************************
115: * Debugging function to print chars *
116: *************************************************/
117:
118: /* Print a sequence of chars in printable format, stopping at the end of the
119: subject if requested.
120:
121: Arguments:
122: p points to characters
123: length number to print
124: is_subject TRUE if printing from within md->start_subject
125: md pointer to matching data block; always dereferenced, so must not be NULL
126:
127: Returns: nothing
128: */
129:
130: static void
1.1.1.2 misho 131: pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
1.1 misho 132: {
1.1.1.4 ! misho 133: pcre_uint32 c;
! 134: BOOL utf = md->utf;
1.1 misho 135: if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
136: while (length-- > 0)
1.1.1.4 ! misho 137: if (isprint(c = RAWUCHARINCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
1.1 misho 138: }
139: #endif
140:
141:
142:
143: /*************************************************
144: * Match a back-reference *
145: *************************************************/
146:
147: /* Normally, if a back reference hasn't been set, the length that is passed is
148: negative, so the match always fails. However, in JavaScript compatibility mode,
149: the length passed is zero. Note that in caseless UTF-8 mode, the number of
150: subject bytes matched may be different to the number of reference bytes.
151:
152: Arguments:
153: offset index into the offset vector
154: eptr pointer into the subject
155: length length of reference to be matched (number of bytes)
156: md points to match data block
157: caseless TRUE if caseless
158:
1.1.1.3 misho 159: Returns: >= 0 the number of subject bytes matched
160: -1 no match
161: -2 partial match; always given if at end subject
1.1 misho 162: */
163:
164: static int
1.1.1.2 misho 165: match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
1.1 misho 166: BOOL caseless)
167: {
1.1.1.2 misho 168: PCRE_PUCHAR eptr_start = eptr;
169: register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
1.1.1.4 ! misho 170: #ifdef SUPPORT_UTF
! 171: BOOL utf = md->utf;
! 172: #endif
1.1 misho 173:
174: #ifdef PCRE_DEBUG
175: if (eptr >= md->end_subject)
176: printf("matching subject <null>");
177: else
178: {
179: printf("matching subject ");
180: pchars(eptr, length, TRUE, md);
181: }
182: printf(" against backref ");
183: pchars(p, length, FALSE, md);
184: printf("\n");
185: #endif
186:
1.1.1.3 misho 187: /* Always fail if reference not set (and not JavaScript compatible - in that
188: case the length is passed as zero). */
1.1 misho 189:
190: if (length < 0) return -1;
191:
192: /* Separate the caseless case for speed. In UTF-8 mode we can only do this
193: properly if Unicode properties are supported. Otherwise, we can check only
194: ASCII characters. */
195:
196: if (caseless)
197: {
1.1.1.2 misho 198: #ifdef SUPPORT_UTF
1.1 misho 199: #ifdef SUPPORT_UCP
1.1.1.4 ! misho 200: if (utf)
1.1 misho 201: {
202: /* Match characters up to the end of the reference. NOTE: the number of
1.1.1.4 ! misho 203: data units matched may differ, because in UTF-8 there are some characters
! 204: whose upper and lower case versions code have different numbers of bytes.
! 205: For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
! 206: (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
! 207: sequence of two of the latter. It is important, therefore, to check the
! 208: length along the reference, not along the subject (earlier code did this
! 209: wrong). */
1.1 misho 210:
1.1.1.2 misho 211: PCRE_PUCHAR endptr = p + length;
1.1 misho 212: while (p < endptr)
213: {
1.1.1.4 ! misho 214: pcre_uint32 c, d;
! 215: const ucd_record *ur;
1.1.1.3 misho 216: if (eptr >= md->end_subject) return -2; /* Partial match */
1.1 misho 217: GETCHARINC(c, eptr);
218: GETCHARINC(d, p);
1.1.1.4 ! misho 219: ur = GET_UCD(d);
! 220: if (c != d && c != d + ur->other_case)
! 221: {
! 222: const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
! 223: for (;;)
! 224: {
! 225: if (c < *pp) return -1;
! 226: if (c == *pp++) break;
! 227: }
! 228: }
1.1 misho 229: }
230: }
231: else
232: #endif
233: #endif
234:
235: /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
236: is no UCP support. */
237: {
238: while (length-- > 0)
1.1.1.2 misho 239: {
1.1.1.4 ! misho 240: pcre_uint32 cc, cp;
1.1.1.3 misho 241: if (eptr >= md->end_subject) return -2; /* Partial match */
1.1.1.4 ! misho 242: cc = RAWUCHARTEST(eptr);
! 243: cp = RAWUCHARTEST(p);
! 244: if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
1.1.1.2 misho 245: p++;
246: eptr++;
247: }
1.1 misho 248: }
249: }
250:
251: /* In the caseful case, we can just compare the bytes, whether or not we
252: are in UTF-8 mode. */
253:
254: else
255: {
1.1.1.3 misho 256: while (length-- > 0)
257: {
258: if (eptr >= md->end_subject) return -2; /* Partial match */
1.1.1.4 ! misho 259: if (RAWUCHARINCTEST(p) != RAWUCHARINCTEST(eptr)) return -1;
1.1.1.3 misho 260: }
1.1 misho 261: }
262:
263: return (int)(eptr - eptr_start);
264: }
265:
266:
267:
268: /***************************************************************************
269: ****************************************************************************
270: RECURSION IN THE match() FUNCTION
271:
272: The match() function is highly recursive, though not every recursive call
273: increases the recursive depth. Nevertheless, some regular expressions can cause
274: it to recurse to a great depth. I was writing for Unix, so I just let it call
275: itself recursively. This uses the stack for saving everything that has to be
276: saved for a recursive call. On Unix, the stack can be large, and this works
277: fine.
278:
279: It turns out that on some non-Unix-like systems there are problems with
280: programs that use a lot of stack. (This despite the fact that every last chip
281: has oodles of memory these days, and techniques for extending the stack have
282: been known for decades.) So....
283:
284: There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
285: calls by keeping local variables that need to be preserved in blocks of memory
286: obtained from malloc() instead of on the stack. Macros are used to
287: achieve this so that the actual code doesn't look very different to what it
288: always used to.
289:
290: The original heap-recursive code used longjmp(). However, it seems that this
291: can be very slow on some operating systems. Following a suggestion from Stan
292: Switzer, the use of longjmp() has been abolished, at the cost of having to
293: provide a unique number for each call to RMATCH. There is no way of generating
294: a sequence of numbers at compile time in C. I have given them names, to make
295: them stand out more clearly.
296:
297: Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
298: FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
299: tests. Furthermore, not using longjmp() means that local dynamic variables
300: don't have indeterminate values; this has meant that the frame size can be
301: reduced because the result can be "passed back" by straight setting of the
302: variable instead of being passed in the frame.
303: ****************************************************************************
304: ***************************************************************************/
305:
306: /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
307: below must be updated in sync. */
308:
/* One identifier per RMATCH call site, numbered consecutively from 1. The
heap-recursion RMATCH stores the identifier in the frame and pastes it into a
label name (L_RMn), so each identifier may be used at only one call site. */

enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
       RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
       RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
       RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
       RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
       RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
       RM61,  RM62, RM63, RM64, RM65, RM66, RM67, RM68 };
1.1 misho 316:
317: /* These versions of the macros use the stack, as normal. There are debugging
318: versions and production versions. Note that the "rw" argument of RMATCH isn't
319: actually used in this definition. */
320:
321: #ifndef NO_RECURSE
/* Stack-recursion flavour. RMATCH is a genuine recursive call to match() whose
result is left in rrc; it picks up mstart and rdepth from the enclosing scope
and ignores its "rw" argument. RRETURN is a plain return. The PCRE_DEBUG
variants additionally trace the call and the return with printf(). */

#define REGISTER register

#ifdef PCRE_DEBUG
#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d\n", ra, __LINE__); \
  return ra; \
  }
#else
#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
#define RRETURN(ra) return ra
#endif
341:
342: #else
343:
344:
345: /* These versions of the macros manage a private stack on the heap. Note that
346: the "rd" argument of RMATCH isn't actually used in this definition. It's the md
347: argument of match(), which never changes. */
348:
/* Heap-recursion flavour.

RMATCH reuses the frame already linked at frame->Xnextframe when there is one,
otherwise it obtains a new frame through PUBL(stack_malloc) and links it in; an
allocation failure is reported with RRETURN(PCRE_ERROR_NOMEMORY). The resume
point "rw" is recorded in the current frame, the new frame is loaded with the
call arguments, and control jumps to HEAP_RECURSE. The label L_<rw> pasted at
the end is where execution continues once the "call" has returned. The "rd"
argument is not used.

RRETURN steps back to the previous frame. If there is one, the result is put
in rrc and control jumps to HEAP_RETURN; otherwise this is the outermost level
and the macro performs a real return. Neither macro releases a frame. */

#define REGISTER

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *newframe = frame->Xnextframe;\
  if (newframe == NULL)\
    {\
    newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
    if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
    newframe->Xnextframe = NULL;\
    frame->Xnextframe = newframe;\
    }\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xeptrb = re;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
387:
388:
389: /* Structure for remembering the local variables in a private frame */
390:
391: typedef struct heapframe {
392: struct heapframe *Xprevframe;
1.1.1.3 misho 393: struct heapframe *Xnextframe;
1.1 misho 394:
395: /* Function arguments that may change */
396:
1.1.1.2 misho 397: PCRE_PUCHAR Xeptr;
398: const pcre_uchar *Xecode;
399: PCRE_PUCHAR Xmstart;
1.1 misho 400: int Xoffset_top;
401: eptrblock *Xeptrb;
402: unsigned int Xrdepth;
403:
404: /* Function local variables */
405:
1.1.1.2 misho 406: PCRE_PUCHAR Xcallpat;
407: #ifdef SUPPORT_UTF
408: PCRE_PUCHAR Xcharptr;
409: #endif
410: PCRE_PUCHAR Xdata;
411: PCRE_PUCHAR Xnext;
412: PCRE_PUCHAR Xpp;
413: PCRE_PUCHAR Xprev;
414: PCRE_PUCHAR Xsaved_eptr;
1.1 misho 415:
416: recursion_info Xnew_recursive;
417:
418: BOOL Xcur_is_word;
419: BOOL Xcondition;
420: BOOL Xprev_is_word;
421:
422: #ifdef SUPPORT_UCP
423: int Xprop_type;
1.1.1.4 ! misho 424: unsigned int Xprop_value;
1.1 misho 425: int Xprop_fail_result;
426: int Xoclength;
1.1.1.2 misho 427: pcre_uchar Xocchars[6];
1.1 misho 428: #endif
429:
430: int Xcodelink;
431: int Xctype;
432: unsigned int Xfc;
433: int Xfi;
434: int Xlength;
435: int Xmax;
436: int Xmin;
1.1.1.4 ! misho 437: unsigned int Xnumber;
1.1 misho 438: int Xoffset;
1.1.1.4 ! misho 439: unsigned int Xop;
! 440: pcre_int32 Xsave_capture_last;
1.1 misho 441: int Xsave_offset1, Xsave_offset2, Xsave_offset3;
442: int Xstacksave[REC_STACK_SAVE_MAX];
443:
444: eptrblock Xnewptrb;
445:
446: /* Where to jump back to */
447:
448: int Xwhere;
449:
450: } heapframe;
451:
452: #endif
453:
454:
455: /***************************************************************************
456: ***************************************************************************/
457:
458:
459:
460: /*************************************************
461: * Match from current position *
462: *************************************************/
463:
464: /* This function is called recursively in many circumstances. Whenever it
465: returns a negative (error) response, the outer incarnation must also return the
466: same response. */
467:
468: /* These macros pack up tests that are used for partial matching, and which
469: appear several times in the code. We set the "hit end" flag if the pointer is
470: at the end of the subject and also past the start of the subject (i.e.
471: something has been matched). For hard partial matching, we then return
472: immediately. The second one is used when we already know we are past the end of
473: the subject. */
474:
/* Partial-match bookkeeping. Both macros set md->hitend when partial matching
is enabled (md->partial != 0) and eptr has moved beyond md->start_used_ptr;
when md->partial is greater than 1 they also leave match() at once with
PCRE_ERROR_PARTIAL. CHECK_PARTIAL additionally requires eptr to have reached
md->end_subject; SCHECK_PARTIAL omits that test and is for places where the
caller already knows the end of the subject has been reached.

Each expands to a bare "if" statement, not a do { } while (0) wrapper, so an
invocation must not be followed directly by an "else". */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr >= md->end_subject && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }
489:
490:
491: /* Performance note: It might be tempting to extract commonly used fields from
1.1.1.2 misho 492: the md structure (e.g. utf, end_subject) into individual variables to improve
1.1 misho 493: performance. Tests using gcc on a SPARC disproved this; in the first case, it
494: made performance worse.
495:
496: Arguments:
497: eptr pointer to current character in subject
498: ecode pointer to current position in compiled code
499: mstart pointer to the current match start position (can be modified
500: by encountering \K)
501: offset_top current top pointer
502: md pointer to "static" info for the match
503: eptrb pointer to chain of blocks containing eptr at start of
504: brackets - for testing for empty matches
505: rdepth the recursion depth
506:
507: Returns: MATCH_MATCH if matched ) these values are >= 0
508: MATCH_NOMATCH if failed to match )
509: a negative MATCH_xxx value for PRUNE, SKIP, etc
510: a negative PCRE_ERROR_xxx value if aborted by an error condition
511: (e.g. stopped by repeated call or recursion limit)
512: */
513:
514: static int
1.1.1.2 misho 515: match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
516: PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
517: unsigned int rdepth)
1.1 misho 518: {
519: /* These variables do not need to be preserved over recursion in this function,
520: so they can be ordinary variables in all cases. Mark some of them with
521: "register" because they are used a lot in loops. */
522:
523: register int rrc; /* Returns from recursive calls */
524: register int i; /* Used for loops not involving calls to RMATCH() */
1.1.1.4 ! misho 525: register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
1.1.1.2 misho 526: register BOOL utf; /* Local copy of UTF flag for speed */
1.1 misho 527:
528: BOOL minimize, possessive; /* Quantifier options */
529: BOOL caseless;
530: int condcode;
531:
532: /* When recursion is not being used, all "local" variables that have to be
1.1.1.2 misho 533: preserved over calls to RMATCH() are part of a "frame". We set up the top-level
534: frame on the stack here; subsequent instantiations are obtained from the heap
535: whenever RMATCH() does a "recursion". See the macro definitions above. Putting
536: the top-level on the stack rather than malloc-ing them all gives a performance
537: boost in many cases where there is not much "recursion". */
1.1 misho 538:
539: #ifdef NO_RECURSE
1.1.1.3 misho 540: heapframe *frame = (heapframe *)md->match_frames_base;
1.1 misho 541:
542: /* Copy in the original argument variables */
543:
544: frame->Xeptr = eptr;
545: frame->Xecode = ecode;
546: frame->Xmstart = mstart;
547: frame->Xoffset_top = offset_top;
548: frame->Xeptrb = eptrb;
549: frame->Xrdepth = rdepth;
550:
551: /* This is where control jumps back to to effect "recursion" */
552:
553: HEAP_RECURSE:
554:
555: /* Macros make the argument variables come from the current frame */
556:
557: #define eptr frame->Xeptr
558: #define ecode frame->Xecode
559: #define mstart frame->Xmstart
560: #define offset_top frame->Xoffset_top
561: #define eptrb frame->Xeptrb
562: #define rdepth frame->Xrdepth
563:
564: /* Ditto for the local variables */
565:
1.1.1.2 misho 566: #ifdef SUPPORT_UTF
1.1 misho 567: #define charptr frame->Xcharptr
568: #endif
569: #define callpat frame->Xcallpat
570: #define codelink frame->Xcodelink
571: #define data frame->Xdata
572: #define next frame->Xnext
573: #define pp frame->Xpp
574: #define prev frame->Xprev
575: #define saved_eptr frame->Xsaved_eptr
576:
577: #define new_recursive frame->Xnew_recursive
578:
579: #define cur_is_word frame->Xcur_is_word
580: #define condition frame->Xcondition
581: #define prev_is_word frame->Xprev_is_word
582:
583: #ifdef SUPPORT_UCP
584: #define prop_type frame->Xprop_type
585: #define prop_value frame->Xprop_value
586: #define prop_fail_result frame->Xprop_fail_result
587: #define oclength frame->Xoclength
588: #define occhars frame->Xocchars
589: #endif
590:
591: #define ctype frame->Xctype
592: #define fc frame->Xfc
593: #define fi frame->Xfi
594: #define length frame->Xlength
595: #define max frame->Xmax
596: #define min frame->Xmin
597: #define number frame->Xnumber
598: #define offset frame->Xoffset
599: #define op frame->Xop
600: #define save_capture_last frame->Xsave_capture_last
601: #define save_offset1 frame->Xsave_offset1
602: #define save_offset2 frame->Xsave_offset2
603: #define save_offset3 frame->Xsave_offset3
604: #define stacksave frame->Xstacksave
605:
606: #define newptrb frame->Xnewptrb
607:
608: /* When recursion is being used, local variables are allocated on the stack and
609: get preserved during recursion in the normal way. In this environment, fi and
610: i, and fc and c, can be the same variables. */
611:
612: #else /* NO_RECURSE not defined */
613: #define fi i
614: #define fc c
615:
616: /* Many of the following variables are used only in small blocks of the code.
617: My normal style of coding would have declared them within each of those blocks.
618: However, in order to accommodate the version of this code that uses an external
619: "stack" implemented on the heap, it is easier to declare them all here, so the
620: declarations can be cut out in a block. The only declarations within blocks
621: below are for variables that do not have to be preserved over a recursive call
622: to RMATCH(). */
623:
1.1.1.2 misho 624: #ifdef SUPPORT_UTF
625: const pcre_uchar *charptr;
1.1 misho 626: #endif
1.1.1.2 misho 627: const pcre_uchar *callpat;
628: const pcre_uchar *data;
629: const pcre_uchar *next;
630: PCRE_PUCHAR pp;
631: const pcre_uchar *prev;
632: PCRE_PUCHAR saved_eptr;
1.1 misho 633:
634: recursion_info new_recursive;
635:
636: BOOL cur_is_word;
637: BOOL condition;
638: BOOL prev_is_word;
639:
640: #ifdef SUPPORT_UCP
641: int prop_type;
1.1.1.4 ! misho 642: unsigned int prop_value;
1.1 misho 643: int prop_fail_result;
644: int oclength;
1.1.1.2 misho 645: pcre_uchar occhars[6];
1.1 misho 646: #endif
647:
648: int codelink;
649: int ctype;
650: int length;
651: int max;
652: int min;
1.1.1.4 ! misho 653: unsigned int number;
1.1 misho 654: int offset;
1.1.1.4 ! misho 655: unsigned int op;
! 656: pcre_int32 save_capture_last;
1.1 misho 657: int save_offset1, save_offset2, save_offset3;
658: int stacksave[REC_STACK_SAVE_MAX];
659:
660: eptrblock newptrb;
1.1.1.2 misho 661:
662: /* There is a special fudge for calling match() in a way that causes it to
663: measure the size of its basic stack frame when the stack is being used for
664: recursion. The second argument (ecode) being NULL triggers this behaviour. It
665: cannot normally ever be NULL. The return is the negated value of the frame
666: size. */
667:
668: if (ecode == NULL)
669: {
670: if (rdepth == 0)
671: return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
672: else
673: {
674: int len = (char *)&rdepth - (char *)eptr;
675: return (len > 0)? -len : len;
676: }
677: }
1.1 misho 678: #endif /* NO_RECURSE */
679:
680: /* To save space on the stack and in the heap frame, I have doubled up on some
681: of the local variables that are used only in localised parts of the code, but
682: still need to be preserved over recursive calls of match(). These macros define
683: the alternative names that are used. */
684:
685: #define allow_zero cur_is_word
686: #define cbegroup condition
687: #define code_offset codelink
688: #define condassert condition
689: #define matched_once prev_is_word
1.1.1.2 misho 690: #define foc number
691: #define save_mark data
1.1 misho 692:
693: /* These statements are here to stop the compiler complaining about unitialized
694: variables. */
695:
696: #ifdef SUPPORT_UCP
697: prop_value = 0;
698: prop_fail_result = 0;
699: #endif
700:
701:
702: /* This label is used for tail recursion, which is used in a few cases even
703: when NO_RECURSE is not defined, in order to reduce the amount of stack that is
704: used. Thanks to Ian Taylor for noticing this possibility and sending the
705: original patch. */
706:
707: TAIL_RECURSE:
708:
709: /* OK, now we can get on with the real code of the function. Recursive calls
710: are specified by the macro RMATCH and RRETURN is used to return. When
711: NO_RECURSE is *not* defined, these just turn into a recursive call to match()
712: and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
713: defined). However, RMATCH isn't like a function call because it's quite a
714: complicated macro. It has to be used in one particular way. This shouldn't,
715: however, impact performance when true recursion is being used. */
716:
1.1.1.2 misho 717: #ifdef SUPPORT_UTF
718: utf = md->utf; /* Local copy of the flag */
1.1 misho 719: #else
1.1.1.2 misho 720: utf = FALSE;
1.1 misho 721: #endif
722:
723: /* First check that we haven't called match() too many times, or that we
724: haven't exceeded the recursive call limit. */
725:
726: if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
727: if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
728:
729: /* At the start of a group with an unlimited repeat that may match an empty
730: string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
731: done this way to save having to use another function argument, which would take
732: up space on the stack. See also MATCH_CONDASSERT below.
733:
734: When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
735: such remembered pointers, to be checked when we hit the closing ket, in order
736: to break infinite loops that match no characters. When match() is called in
737: other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
738: NOT be used with tail recursion, because the memory block that is used is on
739: the stack, so a new one may be required for each match(). */
740:
741: if (md->match_function_type == MATCH_CBEGROUP)
742: {
743: newptrb.epb_saved_eptr = eptr;
744: newptrb.epb_prev = eptrb;
745: eptrb = &newptrb;
746: md->match_function_type = 0;
747: }
748:
749: /* Now start processing the opcodes. */
750:
751: for (;;)
752: {
753: minimize = possessive = FALSE;
754: op = *ecode;
755:
756: switch(op)
757: {
758: case OP_MARK:
759: md->nomatch_mark = ecode + 2;
760: md->mark = NULL; /* In case previously set by assertion */
1.1.1.2 misho 761: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
1.1 misho 762: eptrb, RM55);
763: if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
764: md->mark == NULL) md->mark = ecode + 2;
765:
766: /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
767: argument, and we must check whether that argument matches this MARK's
768: argument. It is passed back in md->start_match_ptr (an overloading of that
769: variable). If it does match, we reset that variable to the current subject
770: position and return MATCH_SKIP. Otherwise, pass back the return code
771: unaltered. */
772:
773: else if (rrc == MATCH_SKIP_ARG &&
1.1.1.4 ! misho 774: STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
1.1 misho 775: {
776: md->start_match_ptr = eptr;
777: RRETURN(MATCH_SKIP);
778: }
779: RRETURN(rrc);
780:
781: case OP_FAIL:
782: RRETURN(MATCH_NOMATCH);
783:
784: case OP_COMMIT:
1.1.1.2 misho 785: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1.1 misho 786: eptrb, RM52);
1.1.1.4 ! misho 787: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1 misho 788: RRETURN(MATCH_COMMIT);
789:
790: case OP_PRUNE:
1.1.1.2 misho 791: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1.1 misho 792: eptrb, RM51);
1.1.1.4 ! misho 793: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1 misho 794: RRETURN(MATCH_PRUNE);
795:
796: case OP_PRUNE_ARG:
797: md->nomatch_mark = ecode + 2;
798: md->mark = NULL; /* In case previously set by assertion */
1.1.1.2 misho 799: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
1.1 misho 800: eptrb, RM56);
801: if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
802: md->mark == NULL) md->mark = ecode + 2;
1.1.1.4 ! misho 803: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1 misho 804: RRETURN(MATCH_PRUNE);
805:
806: case OP_SKIP:
1.1.1.2 misho 807: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1.1 misho 808: eptrb, RM53);
1.1.1.4 ! misho 809: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1 misho 810: md->start_match_ptr = eptr; /* Pass back current position */
811: RRETURN(MATCH_SKIP);
812:
813: /* Note that, for Perl compatibility, SKIP with an argument does NOT set
1.1.1.4 ! misho 814: nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
! 815: not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
! 816: that failed and any that precede it (either they also failed, or were not
! 817: triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
! 818: SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg
! 819: set to the count of the one that failed. */
1.1 misho 820:
821: case OP_SKIP_ARG:
1.1.1.4 ! misho 822: md->skip_arg_count++;
! 823: if (md->skip_arg_count <= md->ignore_skip_arg)
1.1 misho 824: {
1.1.1.2 misho 825: ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
1.1 misho 826: break;
827: }
1.1.1.2 misho 828: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
1.1 misho 829: eptrb, RM57);
1.1.1.4 ! misho 830: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1 misho 831:
832: /* Pass back the current skip name by overloading md->start_match_ptr and
833: returning the special MATCH_SKIP_ARG return code. This will either be
834: caught by a matching MARK, or get to the top, where it causes a rematch
1.1.1.4 ! misho 835: with md->ignore_skip_arg set to the value of md->skip_arg_count. */
1.1 misho 836:
837: md->start_match_ptr = ecode + 2;
838: RRETURN(MATCH_SKIP_ARG);
839:
840: /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
841: the branch in which it occurs can be determined. Overload the start of
842: match pointer to do this. */
843:
844: case OP_THEN:
1.1.1.2 misho 845: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1.1 misho 846: eptrb, RM54);
847: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
848: md->start_match_ptr = ecode;
849: RRETURN(MATCH_THEN);
850:
851: case OP_THEN_ARG:
852: md->nomatch_mark = ecode + 2;
853: md->mark = NULL; /* In case previously set by assertion */
1.1.1.2 misho 854: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
1.1 misho 855: md, eptrb, RM58);
856: if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
857: md->mark == NULL) md->mark = ecode + 2;
858: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
859: md->start_match_ptr = ecode;
860: RRETURN(MATCH_THEN);
861:
862: /* Handle an atomic group that does not contain any capturing parentheses.
863: This can be handled like an assertion. Prior to 8.13, all atomic groups
864: were handled this way. In 8.13, the code was changed as below for ONCE, so
865: that backups pass through the group and thereby reset captured values.
866: However, this uses a lot more stack, so in 8.20, atomic groups that do not
867: contain any captures generate OP_ONCE_NC, which can be handled in the old,
868: less stack intensive way.
869:
870: Check the alternative branches in turn - the matching won't pass the KET
871: for this kind of subpattern. If any one branch matches, we carry on as at
872: the end of a normal bracket, leaving the subject pointer, but resetting
873: the start-of-match value in case it was changed by \K. */
874:
875: case OP_ONCE_NC:
876: prev = ecode;
877: saved_eptr = eptr;
1.1.1.2 misho 878: save_mark = md->mark;
1.1 misho 879: do
880: {
881: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
882: if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
883: {
884: mstart = md->start_match_ptr;
885: break;
886: }
887: if (rrc == MATCH_THEN)
888: {
889: next = ecode + GET(ecode,1);
890: if (md->start_match_ptr < next &&
891: (*ecode == OP_ALT || *next == OP_ALT))
892: rrc = MATCH_NOMATCH;
893: }
894:
895: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
896: ecode += GET(ecode,1);
1.1.1.2 misho 897: md->mark = save_mark;
1.1 misho 898: }
899: while (*ecode == OP_ALT);
900:
901: /* If hit the end of the group (which could be repeated), fail */
902:
903: if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
904:
905: /* Continue as from after the group, updating the offsets high water
906: mark, since extracts may have been taken. */
907:
908: do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
909:
910: offset_top = md->end_offset_top;
911: eptr = md->end_match_ptr;
912:
913: /* For a non-repeating ket, just continue at this level. This also
914: happens for a repeating ket if no characters were matched in the group.
915: This is the forcible breaking of infinite loops as implemented in Perl
916: 5.005. */
917:
918: if (*ecode == OP_KET || eptr == saved_eptr)
919: {
920: ecode += 1+LINK_SIZE;
921: break;
922: }
923:
924: /* The repeating kets try the rest of the pattern or restart from the
925: preceding bracket, in the appropriate order. The second "call" of match()
926: uses tail recursion, to avoid using another stack frame. */
927:
928: if (*ecode == OP_KETRMIN)
929: {
930: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
931: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
932: ecode = prev;
933: goto TAIL_RECURSE;
934: }
935: else /* OP_KETRMAX */
936: {
937: RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
938: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
939: ecode += 1 + LINK_SIZE;
940: goto TAIL_RECURSE;
941: }
942: /* Control never gets here */
943:
944: /* Handle a capturing bracket, other than those that are possessive with an
945: unlimited repeat. If there is space in the offset vector, save the current
946: subject position in the working slot at the top of the vector. We mustn't
947: change the current values of the data slot, because they may be set from a
948: previous iteration of this group, and be referred to by a reference inside
949: the group. A failure to match might occur after the group has succeeded,
950: if something later on doesn't match. For this reason, we need to restore
951: the working value and also the values of the final offsets, in case they
952: were set by a previous iteration of the same bracket.
953:
954: If there isn't enough space in the offset vector, treat this as if it were
955: a non-capturing bracket. Don't worry about setting the flag for the error
956: case here; that is handled in the code for KET. */
957:
958: case OP_CBRA:
959: case OP_SCBRA:
960: number = GET2(ecode, 1+LINK_SIZE);
961: offset = number << 1;
962:
963: #ifdef PCRE_DEBUG
964: printf("start bracket %d\n", number);
965: printf("subject=");
966: pchars(eptr, 16, TRUE, md);
967: printf("\n");
968: #endif
969:
970: if (offset < md->offset_max)
971: {
972: save_offset1 = md->offset_vector[offset];
973: save_offset2 = md->offset_vector[offset+1];
974: save_offset3 = md->offset_vector[md->offset_end - number];
975: save_capture_last = md->capture_last;
1.1.1.2 misho 976: save_mark = md->mark;
1.1 misho 977:
978: DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
979: md->offset_vector[md->offset_end - number] =
980: (int)(eptr - md->start_subject);
981:
982: for (;;)
983: {
984: if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1.1.1.2 misho 985: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1.1 misho 986: eptrb, RM1);
987: if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
988:
989: /* If we backed up to a THEN, check whether it is within the current
990: branch by comparing the address of the THEN that is passed back with
991: the end of the branch. If it is within the current branch, and the
992: branch is one of two or more alternatives (it either starts or ends
993: with OP_ALT), we have reached the limit of THEN's action, so convert
994: the return code to NOMATCH, which will cause normal backtracking to
995: happen from now on. Otherwise, THEN is passed back to an outer
996: alternative. This implements Perl's treatment of parenthesized groups,
997: where a group not containing | does not affect the current alternative,
998: that is, (X) is NOT the same as (X|(*F)). */
999:
1000: if (rrc == MATCH_THEN)
1001: {
1002: next = ecode + GET(ecode,1);
1003: if (md->start_match_ptr < next &&
1004: (*ecode == OP_ALT || *next == OP_ALT))
1005: rrc = MATCH_NOMATCH;
1006: }
1007:
1008: /* Anything other than NOMATCH is passed back. */
1009:
1010: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1011: md->capture_last = save_capture_last;
1012: ecode += GET(ecode, 1);
1.1.1.2 misho 1013: md->mark = save_mark;
1.1 misho 1014: if (*ecode != OP_ALT) break;
1015: }
1016:
1017: DPRINTF(("bracket %d failed\n", number));
1018: md->offset_vector[offset] = save_offset1;
1019: md->offset_vector[offset+1] = save_offset2;
1020: md->offset_vector[md->offset_end - number] = save_offset3;
1021:
1022: /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1023:
1024: RRETURN(rrc);
1025: }
1026:
1027: /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1028: as a non-capturing bracket. */
1029:
1030: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1031: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1032:
1033: DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1034:
1035: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1036: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1037:
1038: /* Non-capturing or atomic group, except for possessive with unlimited
1039: repeat and ONCE group with no captures. Loop for all the alternatives.
1040:
1041: When we get to the final alternative within the brackets, we used to return
1042: the result of a recursive call to match() whatever happened so it was
1043: possible to reduce stack usage by turning this into a tail recursion,
1044: except in the case of a possibly empty group. However, now that there is
1045: the possibility of (*THEN) occurring in the final alternative, this
1046: optimization is no longer always possible.
1047:
1048: We can optimize if we know there are no (*THEN)s in the pattern; at present
1049: this is the best that can be done.
1050:
1051: MATCH_ONCE is returned when the end of an atomic group is successfully
1052: reached, but subsequent matching fails. It passes back up the tree (causing
1053: captured values to be reset) until the original atomic group level is
1054: reached. This is tested by comparing md->once_target with the start of the
1055: group. At this point, the return is converted into MATCH_NOMATCH so that
1056: previous backup points can be taken. */
1057:
1058: case OP_ONCE:
1059: case OP_BRA:
1060: case OP_SBRA:
1061: DPRINTF(("start non-capturing bracket\n"));
1062:
1063: for (;;)
1064: {
1.1.1.3 misho 1065: if (op >= OP_SBRA || op == OP_ONCE)
1066: md->match_function_type = MATCH_CBEGROUP;
1.1 misho 1067:
1068: /* If this is not a possibly empty group, and there are no (*THEN)s in
1069: the pattern, and this is the final alternative, optimize as described
1070: above. */
1071:
1072: else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1073: {
1.1.1.2 misho 1074: ecode += PRIV(OP_lengths)[*ecode];
1.1 misho 1075: goto TAIL_RECURSE;
1076: }
1077:
1078: /* In all other cases, we have to make another call to match(). */
1079:
1.1.1.2 misho 1080: save_mark = md->mark;
1.1.1.4 ! misho 1081: save_capture_last = md->capture_last;
1.1.1.2 misho 1082: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1.1 misho 1083: RM2);
1084:
1085: /* See comment in the code for capturing groups above about handling
1086: THEN. */
1087:
1088: if (rrc == MATCH_THEN)
1089: {
1090: next = ecode + GET(ecode,1);
1091: if (md->start_match_ptr < next &&
1092: (*ecode == OP_ALT || *next == OP_ALT))
1093: rrc = MATCH_NOMATCH;
1094: }
1095:
1096: if (rrc != MATCH_NOMATCH)
1097: {
1098: if (rrc == MATCH_ONCE)
1099: {
1.1.1.2 misho 1100: const pcre_uchar *scode = ecode;
1.1 misho 1101: if (*scode != OP_ONCE) /* If not at start, find it */
1102: {
1103: while (*scode == OP_ALT) scode += GET(scode, 1);
1104: scode -= GET(scode, 1);
1105: }
1106: if (md->once_target == scode) rrc = MATCH_NOMATCH;
1107: }
1108: RRETURN(rrc);
1109: }
1110: ecode += GET(ecode, 1);
1.1.1.2 misho 1111: md->mark = save_mark;
1.1 misho 1112: if (*ecode != OP_ALT) break;
1.1.1.4 ! misho 1113: md->capture_last = save_capture_last;
1.1 misho 1114: }
1115:
1116: RRETURN(MATCH_NOMATCH);
1117:
1118: /* Handle possessive capturing brackets with an unlimited repeat. We come
1119: here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1120: handled similarly to the normal case above. However, the matching is
1121: different. The end of these brackets will always be OP_KETRPOS, which
1122: returns MATCH_KETRPOS without going further in the pattern. By this means
1123: we can handle the group by iteration rather than recursion, thereby
1124: reducing the amount of stack needed. */
1125:
1126: case OP_CBRAPOS:
1127: case OP_SCBRAPOS:
1128: allow_zero = FALSE;
1129:
1130: POSSESSIVE_CAPTURE:
1131: number = GET2(ecode, 1+LINK_SIZE);
1132: offset = number << 1;
1133:
1134: #ifdef PCRE_DEBUG
1135: printf("start possessive bracket %d\n", number);
1136: printf("subject=");
1137: pchars(eptr, 16, TRUE, md);
1138: printf("\n");
1139: #endif
1140:
1141: if (offset < md->offset_max)
1142: {
1143: matched_once = FALSE;
1144: code_offset = (int)(ecode - md->start_code);
1145:
1146: save_offset1 = md->offset_vector[offset];
1147: save_offset2 = md->offset_vector[offset+1];
1148: save_offset3 = md->offset_vector[md->offset_end - number];
1149: save_capture_last = md->capture_last;
1150:
1151: DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1152:
1153: /* Each time round the loop, save the current subject position for use
1154: when the group matches. For MATCH_MATCH, the group has matched, so we
1155: restart it with a new subject starting position, remembering that we had
1156: at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1157: usual. If we haven't matched any alternatives in any iteration, check to
1158: see if a previous iteration matched. If so, the group has matched;
1159: continue from afterwards. Otherwise it has failed; restore the previous
1160: capture values before returning NOMATCH. */
1161:
1162: for (;;)
1163: {
1164: md->offset_vector[md->offset_end - number] =
1165: (int)(eptr - md->start_subject);
1166: if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1.1.1.2 misho 1167: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1.1 misho 1168: eptrb, RM63);
1169: if (rrc == MATCH_KETRPOS)
1170: {
1171: offset_top = md->end_offset_top;
1172: eptr = md->end_match_ptr;
1173: ecode = md->start_code + code_offset;
1174: save_capture_last = md->capture_last;
1175: matched_once = TRUE;
1176: continue;
1177: }
1178:
1179: /* See comment in the code for capturing groups above about handling
1180: THEN. */
1181:
1182: if (rrc == MATCH_THEN)
1183: {
1184: next = ecode + GET(ecode,1);
1185: if (md->start_match_ptr < next &&
1186: (*ecode == OP_ALT || *next == OP_ALT))
1187: rrc = MATCH_NOMATCH;
1188: }
1189:
1190: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1191: md->capture_last = save_capture_last;
1192: ecode += GET(ecode, 1);
1193: if (*ecode != OP_ALT) break;
1194: }
1195:
1196: if (!matched_once)
1197: {
1198: md->offset_vector[offset] = save_offset1;
1199: md->offset_vector[offset+1] = save_offset2;
1200: md->offset_vector[md->offset_end - number] = save_offset3;
1201: }
1202:
1203: if (allow_zero || matched_once)
1204: {
1205: ecode += 1 + LINK_SIZE;
1206: break;
1207: }
1208:
1209: RRETURN(MATCH_NOMATCH);
1210: }
1211:
1212: /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1213: as a non-capturing bracket. */
1214:
1215: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1216: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1217:
1218: DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1219:
1220: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1221: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1222:
1223: /* Non-capturing possessive bracket with unlimited repeat. We come here
1224: from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1225: without the capturing complication. It is written out separately for speed
1226: and cleanliness. */
1227:
1228: case OP_BRAPOS:
1229: case OP_SBRAPOS:
1230: allow_zero = FALSE;
1231:
1232: POSSESSIVE_NON_CAPTURE:
1233: matched_once = FALSE;
1234: code_offset = (int)(ecode - md->start_code);
1.1.1.4 ! misho 1235: save_capture_last = md->capture_last;
1.1 misho 1236:
1237: for (;;)
1238: {
1239: if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1.1.1.2 misho 1240: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1.1 misho 1241: eptrb, RM48);
1242: if (rrc == MATCH_KETRPOS)
1243: {
1244: offset_top = md->end_offset_top;
1245: eptr = md->end_match_ptr;
1246: ecode = md->start_code + code_offset;
1247: matched_once = TRUE;
1248: continue;
1249: }
1250:
1251: /* See comment in the code for capturing groups above about handling
1252: THEN. */
1253:
1254: if (rrc == MATCH_THEN)
1255: {
1256: next = ecode + GET(ecode,1);
1257: if (md->start_match_ptr < next &&
1258: (*ecode == OP_ALT || *next == OP_ALT))
1259: rrc = MATCH_NOMATCH;
1260: }
1261:
1262: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1263: ecode += GET(ecode, 1);
1264: if (*ecode != OP_ALT) break;
1.1.1.4 ! misho 1265: md->capture_last = save_capture_last;
1.1 misho 1266: }
1267:
1268: if (matched_once || allow_zero)
1269: {
1270: ecode += 1 + LINK_SIZE;
1271: break;
1272: }
1273: RRETURN(MATCH_NOMATCH);
1274:
1275: /* Control never reaches here. */
1276:
1277: /* Conditional group: compilation checked that there are no more than
1278: two branches. If the condition is false, skipping the first branch takes us
1279: past the end if there is only one branch, but that's OK because that is
1280: exactly what going to the ket would do. */
1281:
1282: case OP_COND:
1283: case OP_SCOND:
1284: codelink = GET(ecode, 1);
1285:
1286: /* Because of the way auto-callout works during compile, a callout item is
1287: inserted between OP_COND and an assertion condition. */
1288:
1289: if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1290: {
1.1.1.2 misho 1291: if (PUBL(callout) != NULL)
1.1 misho 1292: {
1.1.1.2 misho 1293: PUBL(callout_block) cb;
1.1 misho 1294: cb.version = 2; /* Version 1 of the callout block */
1295: cb.callout_number = ecode[LINK_SIZE+2];
1296: cb.offset_vector = md->offset_vector;
1.1.1.4 ! misho 1297: #if defined COMPILE_PCRE8
1.1 misho 1298: cb.subject = (PCRE_SPTR)md->start_subject;
1.1.1.4 ! misho 1299: #elif defined COMPILE_PCRE16
1.1.1.2 misho 1300: cb.subject = (PCRE_SPTR16)md->start_subject;
1.1.1.4 ! misho 1301: #elif defined COMPILE_PCRE32
! 1302: cb.subject = (PCRE_SPTR32)md->start_subject;
1.1.1.2 misho 1303: #endif
1.1 misho 1304: cb.subject_length = (int)(md->end_subject - md->start_subject);
1305: cb.start_match = (int)(mstart - md->start_subject);
1306: cb.current_position = (int)(eptr - md->start_subject);
1307: cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1308: cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1309: cb.capture_top = offset_top/2;
1.1.1.4 ! misho 1310: cb.capture_last = md->capture_last & CAPLMASK;
! 1311: /* Internal change requires this for API compatibility. */
! 1312: if (cb.capture_last == 0) cb.capture_last = -1;
1.1 misho 1313: cb.callout_data = md->callout_data;
1314: cb.mark = md->nomatch_mark;
1.1.1.2 misho 1315: if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1.1 misho 1316: if (rrc < 0) RRETURN(rrc);
1317: }
1.1.1.2 misho 1318: ecode += PRIV(OP_lengths)[OP_CALLOUT];
1.1.1.4 ! misho 1319: codelink -= PRIV(OP_lengths)[OP_CALLOUT];
1.1 misho 1320: }
1321:
1322: condcode = ecode[LINK_SIZE+1];
1323:
1324: /* Now see what the actual condition is */
1325:
1326: if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1327: {
1328: if (md->recursive == NULL) /* Not recursing => FALSE */
1329: {
1330: condition = FALSE;
1331: ecode += GET(ecode, 1);
1332: }
1333: else
1334: {
1.1.1.4 ! misho 1335: unsigned int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1.1 misho 1336: condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1337:
1338: /* If the test is for recursion into a specific subpattern, and it is
1339: false, but the test was set up by name, scan the table to see if the
1340: name refers to any other numbers, and test them. The condition is true
1341: if any one is set. */
1342:
1343: if (!condition && condcode == OP_NRREF)
1344: {
1.1.1.2 misho 1345: pcre_uchar *slotA = md->name_table;
1.1 misho 1346: for (i = 0; i < md->name_count; i++)
1347: {
1348: if (GET2(slotA, 0) == recno) break;
1349: slotA += md->name_entry_size;
1350: }
1351:
1352: /* Found a name for the number - there can be only one; duplicate
1353: names for different numbers are allowed, but not vice versa. First
1354: scan down for duplicates. */
1355:
1356: if (i < md->name_count)
1357: {
1.1.1.2 misho 1358: pcre_uchar *slotB = slotA;
1.1 misho 1359: while (slotB > md->name_table)
1360: {
1361: slotB -= md->name_entry_size;
1.1.1.2 misho 1362: if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1.1 misho 1363: {
1364: condition = GET2(slotB, 0) == md->recursive->group_num;
1365: if (condition) break;
1366: }
1367: else break;
1368: }
1369:
1370: /* Scan up for duplicates */
1371:
1372: if (!condition)
1373: {
1374: slotB = slotA;
1375: for (i++; i < md->name_count; i++)
1376: {
1377: slotB += md->name_entry_size;
1.1.1.2 misho 1378: if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1.1 misho 1379: {
1380: condition = GET2(slotB, 0) == md->recursive->group_num;
1381: if (condition) break;
1382: }
1383: else break;
1384: }
1385: }
1386: }
1387: }
1388:
1389: /* Choose branch according to the condition */
1390:
1.1.1.2 misho 1391: ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1.1 misho 1392: }
1393: }
1394:
1395: else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1396: {
1397: offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1398: condition = offset < offset_top && md->offset_vector[offset] >= 0;
1399:
1400: /* If the numbered capture is unset, but the reference was by name,
1401: scan the table to see if the name refers to any other numbers, and test
1402: them. The condition is true if any one is set. This is tediously similar
1403: to the code above, but not close enough to try to amalgamate. */
1404:
1405: if (!condition && condcode == OP_NCREF)
1406: {
1.1.1.4 ! misho 1407: unsigned int refno = offset >> 1;
1.1.1.2 misho 1408: pcre_uchar *slotA = md->name_table;
1.1 misho 1409:
1410: for (i = 0; i < md->name_count; i++)
1411: {
1412: if (GET2(slotA, 0) == refno) break;
1413: slotA += md->name_entry_size;
1414: }
1415:
1416: /* Found a name for the number - there can be only one; duplicate names
1417: for different numbers are allowed, but not vice versa. First scan down
1418: for duplicates. */
1419:
1420: if (i < md->name_count)
1421: {
1.1.1.2 misho 1422: pcre_uchar *slotB = slotA;
1.1 misho 1423: while (slotB > md->name_table)
1424: {
1425: slotB -= md->name_entry_size;
1.1.1.2 misho 1426: if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1.1 misho 1427: {
1428: offset = GET2(slotB, 0) << 1;
1429: condition = offset < offset_top &&
1430: md->offset_vector[offset] >= 0;
1431: if (condition) break;
1432: }
1433: else break;
1434: }
1435:
1436: /* Scan up for duplicates */
1437:
1438: if (!condition)
1439: {
1440: slotB = slotA;
1441: for (i++; i < md->name_count; i++)
1442: {
1443: slotB += md->name_entry_size;
1.1.1.2 misho 1444: if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1.1 misho 1445: {
1446: offset = GET2(slotB, 0) << 1;
1447: condition = offset < offset_top &&
1448: md->offset_vector[offset] >= 0;
1449: if (condition) break;
1450: }
1451: else break;
1452: }
1453: }
1454: }
1455: }
1456:
1457: /* Choose branch according to the condition */
1458:
1.1.1.2 misho 1459: ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1.1 misho 1460: }
1461:
1462: else if (condcode == OP_DEF) /* DEFINE - always false */
1463: {
1464: condition = FALSE;
1465: ecode += GET(ecode, 1);
1466: }
1467:
1468: /* The condition is an assertion. Call match() to evaluate it - setting
1469: md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1470: an assertion. */
1471:
1472: else
1473: {
1474: md->match_function_type = MATCH_CONDASSERT;
1475: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1476: if (rrc == MATCH_MATCH)
1477: {
1478: if (md->end_offset_top > offset_top)
1479: offset_top = md->end_offset_top; /* Captures may have happened */
1480: condition = TRUE;
1481: ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1482: while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1483: }
1484:
1485: /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1486: assertion; it is therefore treated as NOMATCH. */
1487:
1488: else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1489: {
1490: RRETURN(rrc); /* Need braces because of following else */
1491: }
1492: else
1493: {
1494: condition = FALSE;
1495: ecode += codelink;
1496: }
1497: }
1498:
1499: /* We are now at the branch that is to be obeyed. As there is only one, can
1500: use tail recursion to avoid using another stack frame, except when there is
1501: unlimited repeat of a possibly empty group. In the latter case, a recursive
1502: call to match() is always required, unless the second alternative doesn't
1503: exist, in which case we can just plough on. Note that, for compatibility
1504: with Perl, the | in a conditional group is NOT treated as creating two
1505: alternatives. If a THEN is encountered in the branch, it propagates out to
1506: the enclosing alternative (unless nested in a deeper set of alternatives,
1507: of course). */
1508:
1509: if (condition || *ecode == OP_ALT)
1510: {
1511: if (op != OP_SCOND)
1512: {
1513: ecode += 1 + LINK_SIZE;
1514: goto TAIL_RECURSE;
1515: }
1516:
1517: md->match_function_type = MATCH_CBEGROUP;
1518: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1519: RRETURN(rrc);
1520: }
1521:
1522: /* Condition false & no alternative; continue after the group. */
1523:
1524: else
1525: {
1526: ecode += 1 + LINK_SIZE;
1527: }
1528: break;
1529:
1530:
1531: /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1532: to close any currently open capturing brackets. */
1533:
1534: case OP_CLOSE:
1.1.1.4 ! misho 1535: number = GET2(ecode, 1); /* Must be less than 65536 */
1.1 misho 1536: offset = number << 1;
1537:
1538: #ifdef PCRE_DEBUG
1539: printf("end bracket %d at *ACCEPT", number);
1540: printf("\n");
1541: #endif
1542:
1.1.1.4 ! misho 1543: md->capture_last = (md->capture_last & OVFLMASK) | number;
! 1544: if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1.1 misho 1545: {
1546: md->offset_vector[offset] =
1547: md->offset_vector[md->offset_end - number];
1548: md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1549: if (offset_top <= offset) offset_top = offset + 2;
1550: }
1.1.1.2 misho 1551: ecode += 1 + IMM2_SIZE;
1.1 misho 1552: break;
1553:
1554:
1555: /* End of the pattern, either real or forced. */
1556:
1557: case OP_END:
1558: case OP_ACCEPT:
1559: case OP_ASSERT_ACCEPT:
1560:
1561: /* If we have matched an empty string, fail if not in an assertion and not
1562: in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1563: is set and we have matched at the start of the subject. In both cases,
1564: backtracking will then try other alternatives, if any. */
1565:
1566: if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1567: md->recursive == NULL &&
1568: (md->notempty ||
1569: (md->notempty_atstart &&
1570: mstart == md->start_subject + md->start_offset)))
1571: RRETURN(MATCH_NOMATCH);
1572:
1573: /* Otherwise, we have a match. */
1574:
1575: md->end_match_ptr = eptr; /* Record where we ended */
1576: md->end_offset_top = offset_top; /* and how many extracts were taken */
1577: md->start_match_ptr = mstart; /* and the start (\K can modify) */
1578:
1579: /* For some reason, the macros don't work properly if an expression is
1580: given as the argument to RRETURN when the heap is in use. */
1581:
1582: rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1583: RRETURN(rrc);
1584:
1585: /* Assertion brackets. Check the alternative branches in turn - the
1586: matching won't pass the KET for an assertion. If any one branch matches,
1587: the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1588: start of each branch to move the current point backwards, so the code at
1589: this level is identical to the lookahead case. When the assertion is part
1590: of a condition, we want to return immediately afterwards. The caller of
1591: this incarnation of the match() function will have set MATCH_CONDASSERT in
1592: md->match_function_type, and one of these opcodes will be the first opcode
1593: that is processed. We use a local variable that is preserved over calls to
1594: match() to remember this case. */
1595:
1596: case OP_ASSERT:
1597: case OP_ASSERTBACK:
1.1.1.2 misho 1598: save_mark = md->mark;
1.1 misho 1599: if (md->match_function_type == MATCH_CONDASSERT)
1600: {
1601: condassert = TRUE;
1602: md->match_function_type = 0;
1603: }
1604: else condassert = FALSE;
1605:
1.1.1.4 ! misho 1606: /* Loop for each branch */
! 1607:
1.1 misho 1608: do
1609: {
1610: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1.1.1.4 ! misho 1611:
! 1612: /* A match means that the assertion is true; break out of the loop
! 1613: that matches its alternatives. */
! 1614:
1.1 misho 1615: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1616: {
1617: mstart = md->start_match_ptr; /* In case \K reset it */
1618: break;
1619: }
1.1.1.4 ! misho 1620:
! 1621: /* If not matched, restore the previous mark setting. */
! 1622:
1.1.1.3 misho 1623: md->mark = save_mark;
1.1 misho 1624:
1.1.1.4 ! misho 1625: /* See comment in the code for capturing groups above about handling
! 1626: THEN. */
1.1.1.3 misho 1627:
1.1.1.4 ! misho 1628: if (rrc == MATCH_THEN)
! 1629: {
! 1630: next = ecode + GET(ecode,1);
! 1631: if (md->start_match_ptr < next &&
! 1632: (*ecode == OP_ALT || *next == OP_ALT))
! 1633: rrc = MATCH_NOMATCH;
! 1634: }
1.1.1.3 misho 1635:
1.1.1.4 ! misho 1636: /* Anything other than NOMATCH causes the entire assertion to fail,
! 1637: passing back the return code. This includes COMMIT, SKIP, PRUNE and an
! 1638: uncaptured THEN, which means they take their normal effect. This
! 1639: consistent approach does not always have exactly the same effect as in
! 1640: Perl. */
1.1 misho 1641:
1.1.1.4 ! misho 1642: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1 misho 1643: ecode += GET(ecode, 1);
1644: }
1.1.1.4 ! misho 1645: while (*ecode == OP_ALT); /* Continue for next alternative */
! 1646:
! 1647: /* If we have tried all the alternative branches, the assertion has
! 1648: failed. If not, we broke out after a match. */
1.1 misho 1649:
1650: if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1651:
1652: /* If checking an assertion for a condition, return MATCH_MATCH. */
1653:
1654: if (condassert) RRETURN(MATCH_MATCH);
1655:
1.1.1.4 ! misho 1656: /* Continue from after a successful assertion, updating the offsets high
! 1657: water mark, since extracts may have been taken during the assertion. */
1.1 misho 1658:
1659: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1660: ecode += 1 + LINK_SIZE;
1661: offset_top = md->end_offset_top;
1662: continue;
1663:
1.1.1.4 ! misho 1664: /* Negative assertion: all branches must fail to match for the assertion to
! 1665: succeed. */
1.1 misho 1666:
1667: case OP_ASSERT_NOT:
1668: case OP_ASSERTBACK_NOT:
1.1.1.2 misho 1669: save_mark = md->mark;
1.1 misho 1670: if (md->match_function_type == MATCH_CONDASSERT)
1671: {
1672: condassert = TRUE;
1673: md->match_function_type = 0;
1674: }
1675: else condassert = FALSE;
1676:
1.1.1.4 ! misho 1677: /* Loop for each alternative branch. */
! 1678:
1.1 misho 1679: do
1680: {
1681: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1.1.1.4 ! misho 1682: md->mark = save_mark; /* Always restore the mark setting */
! 1683:
! 1684: switch(rrc)
1.1 misho 1685: {
1.1.1.4 ! misho 1686: case MATCH_MATCH: /* A successful match means */
! 1687: case MATCH_ACCEPT: /* the assertion has failed. */
! 1688: RRETURN(MATCH_NOMATCH);
! 1689:
! 1690: case MATCH_NOMATCH: /* Carry on with next branch */
1.1 misho 1691: break;
1.1.1.4 ! misho 1692:
! 1693: /* See comment in the code for capturing groups above about handling
! 1694: THEN. */
! 1695:
! 1696: case MATCH_THEN:
! 1697: next = ecode + GET(ecode,1);
! 1698: if (md->start_match_ptr < next &&
! 1699: (*ecode == OP_ALT || *next == OP_ALT))
! 1700: {
! 1701: rrc = MATCH_NOMATCH;
! 1702: break;
! 1703: }
! 1704: /* Otherwise fall through. */
! 1705:
! 1706: /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole
! 1707: assertion to fail to match, without considering any more alternatives.
! 1708: Failing to match means the assertion is true. This is a consistent
! 1709: approach, but does not always have the same effect as in Perl. */
! 1710:
! 1711: case MATCH_COMMIT:
! 1712: case MATCH_SKIP:
! 1713: case MATCH_SKIP_ARG:
! 1714: case MATCH_PRUNE:
! 1715: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
! 1716: goto NEG_ASSERT_TRUE; /* Break out of alternation loop */
! 1717:
! 1718: /* Anything else is an error */
! 1719:
! 1720: default:
! 1721: RRETURN(rrc);
1.1 misho 1722: }
1723:
1.1.1.4 ! misho 1724: /* Continue with next branch */
1.1 misho 1725:
1726: ecode += GET(ecode,1);
1727: }
1728: while (*ecode == OP_ALT);
1729:
1.1.1.4 ! misho 1730: /* All branches in the assertion failed to match. */
1.1 misho 1731:
1.1.1.4 ! misho 1732: NEG_ASSERT_TRUE:
! 1733: if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
! 1734: ecode += 1 + LINK_SIZE; /* Continue with current branch */
1.1 misho 1735: continue;
1736:
1737: /* Move the subject pointer back. This occurs only at the start of
1738: each branch of a lookbehind assertion. If we are too close to the start to
1739: move back, this match function fails. When working with UTF-8 we move
1740: back a number of characters, not bytes. */
1741:
1742: case OP_REVERSE:
1.1.1.2 misho 1743: #ifdef SUPPORT_UTF
1744: if (utf)
1.1 misho 1745: {
1746: i = GET(ecode, 1);
1747: while (i-- > 0)
1748: {
1749: eptr--;
1750: if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1751: BACKCHAR(eptr);
1752: }
1753: }
1754: else
1755: #endif
1756:
1757: /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1758:
1759: {
1760: eptr -= GET(ecode, 1);
1761: if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1762: }
1763:
1764: /* Save the earliest consulted character, then skip to next op code */
1765:
1766: if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1767: ecode += 1 + LINK_SIZE;
1768: break;
1769:
1770: /* The callout item calls an external function, if one is provided, passing
1771: details of the match so far. This is mainly for debugging, though the
1772: function is able to force a failure. */
1773:
1774: case OP_CALLOUT:
1.1.1.2 misho 1775: if (PUBL(callout) != NULL)
1.1 misho 1776: {
1.1.1.2 misho 1777: PUBL(callout_block) cb;
1.1 misho 1778: cb.version = 2; /* Version 2 of the callout block */
1779: cb.callout_number = ecode[1];
1780: cb.offset_vector = md->offset_vector;
1.1.1.4 ! misho 1781: #if defined COMPILE_PCRE8
1.1 misho 1782: cb.subject = (PCRE_SPTR)md->start_subject;
1.1.1.4 ! misho 1783: #elif defined COMPILE_PCRE16
1.1.1.2 misho 1784: cb.subject = (PCRE_SPTR16)md->start_subject;
1.1.1.4 ! misho 1785: #elif defined COMPILE_PCRE32
! 1786: cb.subject = (PCRE_SPTR32)md->start_subject;
1.1.1.2 misho 1787: #endif
1.1 misho 1788: cb.subject_length = (int)(md->end_subject - md->start_subject);
1789: cb.start_match = (int)(mstart - md->start_subject);
1790: cb.current_position = (int)(eptr - md->start_subject);
1791: cb.pattern_position = GET(ecode, 2);
1792: cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1793: cb.capture_top = offset_top/2;
1.1.1.4 ! misho 1794: cb.capture_last = md->capture_last & CAPLMASK;
! 1795: /* Internal change requires this for API compatibility. */
! 1796: if (cb.capture_last == 0) cb.capture_last = -1;
1.1 misho 1797: cb.callout_data = md->callout_data;
1798: cb.mark = md->nomatch_mark;
1.1.1.2 misho 1799: if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1.1 misho 1800: if (rrc < 0) RRETURN(rrc);
1801: }
1802: ecode += 2 + 2*LINK_SIZE;
1803: break;
1804:
1805: /* Recursion either matches the current regex, or some subexpression. The
1806: offset data is the offset to the starting bracket from the start of the
1807: whole pattern. (This is so that it works from duplicated subpatterns.)
1808:
1809: The state of the capturing groups is preserved over recursion, and
1810: re-instated afterwards. We don't know how many are started and not yet
1811: finished (offset_top records the completed total) so we just have to save
1812: all the potential data. There may be up to 65535 such values, which is too
1813: large to put on the stack, but using malloc for small numbers seems
1814: expensive. As a compromise, the stack is used when there are no more than
1815: REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1816:
1817: There are also other values that have to be saved. We use a chained
1818: sequence of blocks that actually live on the stack. Thanks to Robin Houston
1819: for the original version of this logic. It has, however, been hacked around
1820: a lot, so he is not to blame for the current way it works. */
1821:
1822: case OP_RECURSE:
1823: {
1824: recursion_info *ri;
1.1.1.4 ! misho 1825: unsigned int recno;
1.1 misho 1826:
1827: callpat = md->start_code + GET(ecode, 1);
1828: recno = (callpat == md->start_code)? 0 :
1829: GET2(callpat, 1 + LINK_SIZE);
1830:
1831: /* Check for repeating a recursion without advancing the subject pointer.
1832: This should catch convoluted mutual recursions. (Some simple cases are
1833: caught at compile time.) */
1834:
1835: for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1836: if (recno == ri->group_num && eptr == ri->subject_position)
1837: RRETURN(PCRE_ERROR_RECURSELOOP);
1838:
1839: /* Add to "recursing stack" */
1840:
1841: new_recursive.group_num = recno;
1.1.1.4 ! misho 1842: new_recursive.saved_capture_last = md->capture_last;
1.1 misho 1843: new_recursive.subject_position = eptr;
1844: new_recursive.prevrec = md->recursive;
1845: md->recursive = &new_recursive;
1846:
1847: /* Where to continue from afterwards */
1848:
1849: ecode += 1 + LINK_SIZE;
1850:
1851: /* Now save the offset data */
1852:
1853: new_recursive.saved_max = md->offset_end;
1854: if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1855: new_recursive.offset_save = stacksave;
1856: else
1857: {
1858: new_recursive.offset_save =
1.1.1.2 misho 1859: (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1.1 misho 1860: if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1861: }
1862: memcpy(new_recursive.offset_save, md->offset_vector,
1863: new_recursive.saved_max * sizeof(int));
1864:
1865: /* OK, now we can do the recursion. After processing each alternative,
1.1.1.4 ! misho 1866: restore the offset data and the last captured value. If there were nested
! 1867: recursions, md->recursive might be changed, so reset it before looping.
! 1868: */
1.1 misho 1869:
1870: DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1871: cbegroup = (*callpat >= OP_SBRA);
1872: do
1873: {
1874: if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1.1.1.2 misho 1875: RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1.1 misho 1876: md, eptrb, RM6);
1877: memcpy(md->offset_vector, new_recursive.offset_save,
1878: new_recursive.saved_max * sizeof(int));
1.1.1.4 ! misho 1879: md->capture_last = new_recursive.saved_capture_last;
1.1 misho 1880: md->recursive = new_recursive.prevrec;
1881: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1882: {
1883: DPRINTF(("Recursion matched\n"));
1884: if (new_recursive.offset_save != stacksave)
1.1.1.2 misho 1885: (PUBL(free))(new_recursive.offset_save);
1.1 misho 1886:
1887: /* Set where we got to in the subject, and reset the start in case
1888: it was changed by \K. This *is* propagated back out of a recursion,
1889: for Perl compatibility. */
1890:
1891: eptr = md->end_match_ptr;
1892: mstart = md->start_match_ptr;
1893: goto RECURSION_MATCHED; /* Exit loop; end processing */
1894: }
1895:
1.1.1.4 ! misho 1896: /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
! 1897: recursion; they cause a NOMATCH for the entire recursion. These codes
! 1898: are defined in a range that can be tested for. */
! 1899:
! 1900: if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
! 1901: RRETURN(MATCH_NOMATCH);
! 1902:
! 1903: /* Any return code other than NOMATCH is an error. */
1.1 misho 1904:
1.1.1.4 ! misho 1905: if (rrc != MATCH_NOMATCH)
1.1 misho 1906: {
1907: DPRINTF(("Recursion gave error %d\n", rrc));
1908: if (new_recursive.offset_save != stacksave)
1.1.1.2 misho 1909: (PUBL(free))(new_recursive.offset_save);
1.1 misho 1910: RRETURN(rrc);
1911: }
1912:
1913: md->recursive = &new_recursive;
1914: callpat += GET(callpat, 1);
1915: }
1916: while (*callpat == OP_ALT);
1917:
1918: DPRINTF(("Recursion didn't match\n"));
1919: md->recursive = new_recursive.prevrec;
1920: if (new_recursive.offset_save != stacksave)
1.1.1.2 misho 1921: (PUBL(free))(new_recursive.offset_save);
1.1 misho 1922: RRETURN(MATCH_NOMATCH);
1923: }
1924:
1925: RECURSION_MATCHED:
1926: break;
1927:
1928: /* An alternation is the end of a branch; scan along to find the end of the
1929: bracketed group and go to there. */
1930:
1931: case OP_ALT:
1932: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1933: break;
1934:
1935: /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1936: indicating that it may occur zero times. It may repeat infinitely, or not
1937: at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1938: with fixed upper repeat limits are compiled as a number of copies, with the
1939: optional ones preceded by BRAZERO or BRAMINZERO. */
1940:
1941: case OP_BRAZERO:
1942: next = ecode + 1;
1943: RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1944: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1945: do next += GET(next, 1); while (*next == OP_ALT);
1946: ecode = next + 1 + LINK_SIZE;
1947: break;
1948:
1949: case OP_BRAMINZERO:
1950: next = ecode + 1;
1951: do next += GET(next, 1); while (*next == OP_ALT);
1952: RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1953: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1954: ecode++;
1955: break;
1956:
1957: case OP_SKIPZERO:
1958: next = ecode+1;
1959: do next += GET(next,1); while (*next == OP_ALT);
1960: ecode = next + 1 + LINK_SIZE;
1961: break;
1962:
1963: /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1964: here; just jump to the group, with allow_zero set TRUE. */
1965:
1966: case OP_BRAPOSZERO:
1967: op = *(++ecode);
1968: allow_zero = TRUE;
1969: if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1970: goto POSSESSIVE_NON_CAPTURE;
1971:
1972: /* End of a group, repeated or non-repeating. */
1973:
1974: case OP_KET:
1975: case OP_KETRMIN:
1976: case OP_KETRMAX:
1977: case OP_KETRPOS:
1978: prev = ecode - GET(ecode, 1);
1979:
1980: /* If this was a group that remembered the subject start, in order to break
1981: infinite repeats of empty string matches, retrieve the subject start from
1982: the chain. Otherwise, set it NULL. */
1983:
1984: if (*prev >= OP_SBRA || *prev == OP_ONCE)
1985: {
1986: saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1987: eptrb = eptrb->epb_prev; /* Backup to previous group */
1988: }
1989: else saved_eptr = NULL;
1990:
1991: /* If we are at the end of an assertion group or a non-capturing atomic
1992: group, stop matching and return MATCH_MATCH, but record the current high
1993: water mark for use by positive assertions. We also need to record the match
1994: start in case it was changed by \K. */
1995:
1996: if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1997: *prev == OP_ONCE_NC)
1998: {
1999: md->end_match_ptr = eptr; /* For ONCE_NC */
2000: md->end_offset_top = offset_top;
2001: md->start_match_ptr = mstart;
2002: RRETURN(MATCH_MATCH); /* Sets md->mark */
2003: }
2004:
2005: /* For capturing groups we have to check the group number back at the start
2006: and if necessary complete handling an extraction by setting the offsets and
2007: bumping the high water mark. Whole-pattern recursion is coded as a recurse
2008: into group 0, so it won't be picked up here. Instead, we catch it when the
2009: OP_END is reached. Other recursion is handled here. We just have to record
2010: the current subject position and start match pointer and give a MATCH
2011: return. */
2012:
2013: if (*prev == OP_CBRA || *prev == OP_SCBRA ||
2014: *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
2015: {
2016: number = GET2(prev, 1+LINK_SIZE);
2017: offset = number << 1;
2018:
2019: #ifdef PCRE_DEBUG
2020: printf("end bracket %d", number);
2021: printf("\n");
2022: #endif
2023:
2024: /* Handle a recursively called group. */
2025:
2026: if (md->recursive != NULL && md->recursive->group_num == number)
2027: {
2028: md->end_match_ptr = eptr;
2029: md->start_match_ptr = mstart;
2030: RRETURN(MATCH_MATCH);
2031: }
2032:
2033: /* Deal with capturing */
2034:
1.1.1.4 ! misho 2035: md->capture_last = (md->capture_last & OVFLMASK) | number;
! 2036: if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1.1 misho 2037: {
2038: /* If offset is greater than offset_top, it means that we are
2039: "skipping" a capturing group, and that group's offsets must be marked
2040: unset. In earlier versions of PCRE, all the offsets were unset at the
2041: start of matching, but this doesn't work because atomic groups and
2042: assertions can cause a value to be set that should later be unset.
2043: Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
2044: part of the atomic group, but this is not on the final matching path,
2045: so must be unset when 2 is set. (If there is no group 2, there is no
2046: problem, because offset_top will then be 2, indicating no capture.) */
2047:
2048: if (offset > offset_top)
2049: {
2050: register int *iptr = md->offset_vector + offset_top;
2051: register int *iend = md->offset_vector + offset;
2052: while (iptr < iend) *iptr++ = -1;
2053: }
2054:
2055: /* Now make the extraction */
2056:
2057: md->offset_vector[offset] =
2058: md->offset_vector[md->offset_end - number];
2059: md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
2060: if (offset_top <= offset) offset_top = offset + 2;
2061: }
2062: }
2063:
2064: /* For an ordinary non-repeating ket, just continue at this level. This
2065: also happens for a repeating ket if no characters were matched in the
2066: group. This is the forcible breaking of infinite loops as implemented in
2067: Perl 5.005. For a non-repeating atomic group that includes captures,
2068: establish a backup point by processing the rest of the pattern at a lower
2069: level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
2070: original OP_ONCE level, thereby bypassing intermediate backup points, but
2071: resetting any captures that happened along the way. */
2072:
2073: if (*ecode == OP_KET || eptr == saved_eptr)
2074: {
2075: if (*prev == OP_ONCE)
2076: {
2077: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
2078: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2079: md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2080: RRETURN(MATCH_ONCE);
2081: }
2082: ecode += 1 + LINK_SIZE; /* Carry on at this level */
2083: break;
2084: }
2085:
2086: /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2087: and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2088: at a time from the outer level, thus saving stack. */
2089:
2090: if (*ecode == OP_KETRPOS)
2091: {
2092: md->end_match_ptr = eptr;
2093: md->end_offset_top = offset_top;
2094: RRETURN(MATCH_KETRPOS);
2095: }
2096:
2097: /* The normal repeating kets try the rest of the pattern or restart from
2098: the preceding bracket, in the appropriate order. In the second case, we can
2099: use tail recursion to avoid using another stack frame, unless we have an
2100: atomic group or an unlimited repeat of a group that can match an empty
2101: string. */
2102:
2103: if (*ecode == OP_KETRMIN)
2104: {
2105: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2106: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2107: if (*prev == OP_ONCE)
2108: {
2109: RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2110: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2111: md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2112: RRETURN(MATCH_ONCE);
2113: }
2114: if (*prev >= OP_SBRA) /* Could match an empty string */
2115: {
2116: RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2117: RRETURN(rrc);
2118: }
2119: ecode = prev;
2120: goto TAIL_RECURSE;
2121: }
2122: else /* OP_KETRMAX */
2123: {
2124: RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2125: if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2126: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2127: if (*prev == OP_ONCE)
2128: {
2129: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2130: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2131: md->once_target = prev;
2132: RRETURN(MATCH_ONCE);
2133: }
2134: ecode += 1 + LINK_SIZE;
2135: goto TAIL_RECURSE;
2136: }
2137: /* Control never gets here */
2138:
2139: /* Not multiline mode: start of subject assertion, unless notbol. */
2140:
2141: case OP_CIRC:
2142: if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2143:
2144: /* Start of subject assertion */
2145:
2146: case OP_SOD:
2147: if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2148: ecode++;
2149: break;
2150:
2151: /* Multiline mode: start of subject unless notbol, or after any newline. */
2152:
2153: case OP_CIRCM:
2154: if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2155: if (eptr != md->start_subject &&
2156: (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2157: RRETURN(MATCH_NOMATCH);
2158: ecode++;
2159: break;
2160:
2161: /* Start of match assertion */
2162:
2163: case OP_SOM:
2164: if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2165: ecode++;
2166: break;
2167:
2168: /* Reset the start of match point */
2169:
2170: case OP_SET_SOM:
2171: mstart = eptr;
2172: ecode++;
2173: break;
2174:
2175: /* Multiline mode: assert before any newline, or before end of subject
2176: unless noteol is set. */
2177:
2178: case OP_DOLLM:
2179: if (eptr < md->end_subject)
1.1.1.3 misho 2180: {
2181: if (!IS_NEWLINE(eptr))
2182: {
2183: if (md->partial != 0 &&
2184: eptr + 1 >= md->end_subject &&
2185: NLBLOCK->nltype == NLTYPE_FIXED &&
2186: NLBLOCK->nllen == 2 &&
1.1.1.4 ! misho 2187: RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
1.1.1.3 misho 2188: {
2189: md->hitend = TRUE;
2190: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2191: }
2192: RRETURN(MATCH_NOMATCH);
2193: }
2194: }
1.1 misho 2195: else
2196: {
2197: if (md->noteol) RRETURN(MATCH_NOMATCH);
2198: SCHECK_PARTIAL();
2199: }
2200: ecode++;
2201: break;
2202:
2203: /* Not multiline mode: assert before a terminating newline or before end of
2204: subject unless noteol is set. */
2205:
2206: case OP_DOLL:
2207: if (md->noteol) RRETURN(MATCH_NOMATCH);
2208: if (!md->endonly) goto ASSERT_NL_OR_EOS;
2209:
2210: /* ... else fall through for endonly */
2211:
2212: /* End of subject assertion (\z) */
2213:
2214: case OP_EOD:
2215: if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2216: SCHECK_PARTIAL();
2217: ecode++;
2218: break;
2219:
2220: /* End of subject or ending \n assertion (\Z) */
2221:
2222: case OP_EODN:
2223: ASSERT_NL_OR_EOS:
2224: if (eptr < md->end_subject &&
2225: (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1.1.1.3 misho 2226: {
2227: if (md->partial != 0 &&
2228: eptr + 1 >= md->end_subject &&
2229: NLBLOCK->nltype == NLTYPE_FIXED &&
2230: NLBLOCK->nllen == 2 &&
1.1.1.4 ! misho 2231: RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
1.1.1.3 misho 2232: {
2233: md->hitend = TRUE;
2234: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2235: }
1.1 misho 2236: RRETURN(MATCH_NOMATCH);
1.1.1.3 misho 2237: }
1.1 misho 2238:
2239: /* Either at end of string or \n before end. */
2240:
2241: SCHECK_PARTIAL();
2242: ecode++;
2243: break;
2244:
2245: /* Word boundary assertions */
2246:
2247: case OP_NOT_WORD_BOUNDARY:
2248: case OP_WORD_BOUNDARY:
2249: {
2250:
2251: /* Find out if the previous and current characters are "word" characters.
2252: It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2253: be "non-word" characters. Remember the earliest consulted character for
2254: partial matching. */
2255:
1.1.1.2 misho 2256: #ifdef SUPPORT_UTF
2257: if (utf)
1.1 misho 2258: {
2259: /* Get status of previous character */
2260:
2261: if (eptr == md->start_subject) prev_is_word = FALSE; else
2262: {
1.1.1.2 misho 2263: PCRE_PUCHAR lastptr = eptr - 1;
2264: BACKCHAR(lastptr);
1.1 misho 2265: if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2266: GETCHAR(c, lastptr);
2267: #ifdef SUPPORT_UCP
2268: if (md->use_ucp)
2269: {
2270: if (c == '_') prev_is_word = TRUE; else
2271: {
2272: int cat = UCD_CATEGORY(c);
2273: prev_is_word = (cat == ucp_L || cat == ucp_N);
2274: }
2275: }
2276: else
2277: #endif
2278: prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2279: }
2280:
2281: /* Get status of next character */
2282:
2283: if (eptr >= md->end_subject)
2284: {
2285: SCHECK_PARTIAL();
2286: cur_is_word = FALSE;
2287: }
2288: else
2289: {
2290: GETCHAR(c, eptr);
2291: #ifdef SUPPORT_UCP
2292: if (md->use_ucp)
2293: {
2294: if (c == '_') cur_is_word = TRUE; else
2295: {
2296: int cat = UCD_CATEGORY(c);
2297: cur_is_word = (cat == ucp_L || cat == ucp_N);
2298: }
2299: }
2300: else
2301: #endif
2302: cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2303: }
2304: }
2305: else
2306: #endif
2307:
2308: /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2309: consistency with the behaviour of \w we do use it in this case. */
2310:
2311: {
2312: /* Get status of previous character */
2313:
2314: if (eptr == md->start_subject) prev_is_word = FALSE; else
2315: {
2316: if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2317: #ifdef SUPPORT_UCP
2318: if (md->use_ucp)
2319: {
2320: c = eptr[-1];
2321: if (c == '_') prev_is_word = TRUE; else
2322: {
2323: int cat = UCD_CATEGORY(c);
2324: prev_is_word = (cat == ucp_L || cat == ucp_N);
2325: }
2326: }
2327: else
2328: #endif
1.1.1.2 misho 2329: prev_is_word = MAX_255(eptr[-1])
2330: && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1.1 misho 2331: }
2332:
2333: /* Get status of next character */
2334:
2335: if (eptr >= md->end_subject)
2336: {
2337: SCHECK_PARTIAL();
2338: cur_is_word = FALSE;
2339: }
2340: else
2341: #ifdef SUPPORT_UCP
2342: if (md->use_ucp)
2343: {
2344: c = *eptr;
2345: if (c == '_') cur_is_word = TRUE; else
2346: {
2347: int cat = UCD_CATEGORY(c);
2348: cur_is_word = (cat == ucp_L || cat == ucp_N);
2349: }
2350: }
2351: else
2352: #endif
1.1.1.2 misho 2353: cur_is_word = MAX_255(*eptr)
2354: && ((md->ctypes[*eptr] & ctype_word) != 0);
1.1 misho 2355: }
2356:
2357: /* Now see if the situation is what we want */
2358:
2359: if ((*ecode++ == OP_WORD_BOUNDARY)?
2360: cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2361: RRETURN(MATCH_NOMATCH);
2362: }
2363: break;
2364:
1.1.1.3 misho 2365: /* Match any single character type except newline; have to take care with
2366: CRLF newlines and partial matching. */
1.1 misho 2367:
2368: case OP_ANY:
2369: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1.1.1.3 misho 2370: if (md->partial != 0 &&
2371: eptr + 1 >= md->end_subject &&
2372: NLBLOCK->nltype == NLTYPE_FIXED &&
2373: NLBLOCK->nllen == 2 &&
1.1.1.4 ! misho 2374: RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
1.1.1.3 misho 2375: {
2376: md->hitend = TRUE;
2377: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2378: }
2379:
1.1 misho 2380: /* Fall through */
2381:
1.1.1.3 misho 2382: /* Match any single character whatsoever. */
2383:
1.1 misho 2384: case OP_ALLANY:
2385: if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2386: { /* not be updated before SCHECK_PARTIAL. */
2387: SCHECK_PARTIAL();
2388: RRETURN(MATCH_NOMATCH);
2389: }
2390: eptr++;
1.1.1.2 misho 2391: #ifdef SUPPORT_UTF
2392: if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2393: #endif
1.1 misho 2394: ecode++;
2395: break;
2396:
2397: /* Match a single byte, even in UTF-8 mode. This opcode really does match
2398: any byte, even newline, independent of the setting of PCRE_DOTALL. */
2399:
2400: case OP_ANYBYTE:
2401: if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2402: { /* not be updated before SCHECK_PARTIAL. */
2403: SCHECK_PARTIAL();
2404: RRETURN(MATCH_NOMATCH);
2405: }
2406: eptr++;
2407: ecode++;
2408: break;
2409:
2410: case OP_NOT_DIGIT:
2411: if (eptr >= md->end_subject)
2412: {
2413: SCHECK_PARTIAL();
2414: RRETURN(MATCH_NOMATCH);
2415: }
2416: GETCHARINCTEST(c, eptr);
2417: if (
1.1.1.2 misho 2418: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
1.1 misho 2419: c < 256 &&
2420: #endif
2421: (md->ctypes[c] & ctype_digit) != 0
2422: )
2423: RRETURN(MATCH_NOMATCH);
2424: ecode++;
2425: break;
2426:
2427: case OP_DIGIT:
2428: if (eptr >= md->end_subject)
2429: {
2430: SCHECK_PARTIAL();
2431: RRETURN(MATCH_NOMATCH);
2432: }
2433: GETCHARINCTEST(c, eptr);
2434: if (
1.1.1.2 misho 2435: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2436: c > 255 ||
1.1 misho 2437: #endif
2438: (md->ctypes[c] & ctype_digit) == 0
2439: )
2440: RRETURN(MATCH_NOMATCH);
2441: ecode++;
2442: break;
2443:
2444: case OP_NOT_WHITESPACE:
2445: if (eptr >= md->end_subject)
2446: {
2447: SCHECK_PARTIAL();
2448: RRETURN(MATCH_NOMATCH);
2449: }
2450: GETCHARINCTEST(c, eptr);
2451: if (
1.1.1.2 misho 2452: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
1.1 misho 2453: c < 256 &&
2454: #endif
2455: (md->ctypes[c] & ctype_space) != 0
2456: )
2457: RRETURN(MATCH_NOMATCH);
2458: ecode++;
2459: break;
2460:
2461: case OP_WHITESPACE:
2462: if (eptr >= md->end_subject)
2463: {
2464: SCHECK_PARTIAL();
2465: RRETURN(MATCH_NOMATCH);
2466: }
2467: GETCHARINCTEST(c, eptr);
2468: if (
1.1.1.2 misho 2469: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2470: c > 255 ||
1.1 misho 2471: #endif
2472: (md->ctypes[c] & ctype_space) == 0
2473: )
2474: RRETURN(MATCH_NOMATCH);
2475: ecode++;
2476: break;
2477:
2478: case OP_NOT_WORDCHAR:
2479: if (eptr >= md->end_subject)
2480: {
2481: SCHECK_PARTIAL();
2482: RRETURN(MATCH_NOMATCH);
2483: }
2484: GETCHARINCTEST(c, eptr);
2485: if (
1.1.1.2 misho 2486: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
1.1 misho 2487: c < 256 &&
2488: #endif
2489: (md->ctypes[c] & ctype_word) != 0
2490: )
2491: RRETURN(MATCH_NOMATCH);
2492: ecode++;
2493: break;
2494:
2495: case OP_WORDCHAR:
2496: if (eptr >= md->end_subject)
2497: {
2498: SCHECK_PARTIAL();
2499: RRETURN(MATCH_NOMATCH);
2500: }
2501: GETCHARINCTEST(c, eptr);
2502: if (
1.1.1.2 misho 2503: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2504: c > 255 ||
1.1 misho 2505: #endif
2506: (md->ctypes[c] & ctype_word) == 0
2507: )
2508: RRETURN(MATCH_NOMATCH);
2509: ecode++;
2510: break;
2511:
2512: case OP_ANYNL:
2513: if (eptr >= md->end_subject)
2514: {
2515: SCHECK_PARTIAL();
2516: RRETURN(MATCH_NOMATCH);
2517: }
2518: GETCHARINCTEST(c, eptr);
2519: switch(c)
2520: {
2521: default: RRETURN(MATCH_NOMATCH);
2522:
1.1.1.4 ! misho 2523: case CHAR_CR:
1.1.1.3 misho 2524: if (eptr >= md->end_subject)
2525: {
2526: SCHECK_PARTIAL();
2527: }
1.1.1.4 ! misho 2528: else if (RAWUCHARTEST(eptr) == CHAR_LF) eptr++;
1.1 misho 2529: break;
2530:
1.1.1.4 ! misho 2531: case CHAR_LF:
1.1 misho 2532: break;
2533:
1.1.1.4 ! misho 2534: case CHAR_VT:
! 2535: case CHAR_FF:
! 2536: case CHAR_NEL:
! 2537: #ifndef EBCDIC
1.1 misho 2538: case 0x2028:
2539: case 0x2029:
1.1.1.4 ! misho 2540: #endif /* Not EBCDIC */
1.1 misho 2541: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2542: break;
2543: }
2544: ecode++;
2545: break;
2546:
2547: case OP_NOT_HSPACE:
2548: if (eptr >= md->end_subject)
2549: {
2550: SCHECK_PARTIAL();
2551: RRETURN(MATCH_NOMATCH);
2552: }
2553: GETCHARINCTEST(c, eptr);
2554: switch(c)
2555: {
1.1.1.4 ! misho 2556: HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
1.1 misho 2557: default: break;
2558: }
2559: ecode++;
2560: break;
2561:
2562: case OP_HSPACE:
2563: if (eptr >= md->end_subject)
2564: {
2565: SCHECK_PARTIAL();
2566: RRETURN(MATCH_NOMATCH);
2567: }
2568: GETCHARINCTEST(c, eptr);
2569: switch(c)
2570: {
1.1.1.4 ! misho 2571: HSPACE_CASES: break; /* Byte and multibyte cases */
1.1 misho 2572: default: RRETURN(MATCH_NOMATCH);
2573: }
2574: ecode++;
2575: break;
2576:
2577: case OP_NOT_VSPACE:
2578: if (eptr >= md->end_subject)
2579: {
2580: SCHECK_PARTIAL();
2581: RRETURN(MATCH_NOMATCH);
2582: }
2583: GETCHARINCTEST(c, eptr);
2584: switch(c)
2585: {
1.1.1.4 ! misho 2586: VSPACE_CASES: RRETURN(MATCH_NOMATCH);
1.1 misho 2587: default: break;
2588: }
2589: ecode++;
2590: break;
2591:
2592: case OP_VSPACE:
2593: if (eptr >= md->end_subject)
2594: {
2595: SCHECK_PARTIAL();
2596: RRETURN(MATCH_NOMATCH);
2597: }
2598: GETCHARINCTEST(c, eptr);
2599: switch(c)
2600: {
1.1.1.4 ! misho 2601: VSPACE_CASES: break;
1.1 misho 2602: default: RRETURN(MATCH_NOMATCH);
2603: }
2604: ecode++;
2605: break;
2606:
2607: #ifdef SUPPORT_UCP
2608: /* Check the next character by Unicode property. We will get here only
2609: if the support is in the binary; otherwise a compile-time error occurs. */
2610:
2611: case OP_PROP:
2612: case OP_NOTPROP:
2613: if (eptr >= md->end_subject)
2614: {
2615: SCHECK_PARTIAL();
2616: RRETURN(MATCH_NOMATCH);
2617: }
2618: GETCHARINCTEST(c, eptr);
2619: {
1.1.1.4 ! misho 2620: const pcre_uint32 *cp;
1.1 misho 2621: const ucd_record *prop = GET_UCD(c);
2622:
2623: switch(ecode[1])
2624: {
2625: case PT_ANY:
2626: if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2627: break;
2628:
2629: case PT_LAMP:
2630: if ((prop->chartype == ucp_Lu ||
2631: prop->chartype == ucp_Ll ||
2632: prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2633: RRETURN(MATCH_NOMATCH);
2634: break;
2635:
2636: case PT_GC:
1.1.1.2 misho 2637: if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
1.1 misho 2638: RRETURN(MATCH_NOMATCH);
2639: break;
2640:
2641: case PT_PC:
2642: if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2643: RRETURN(MATCH_NOMATCH);
2644: break;
2645:
2646: case PT_SC:
2647: if ((ecode[2] != prop->script) == (op == OP_PROP))
2648: RRETURN(MATCH_NOMATCH);
2649: break;
2650:
2651: /* These are specials */
2652:
2653: case PT_ALNUM:
1.1.1.2 misho 2654: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2655: PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
1.1 misho 2656: RRETURN(MATCH_NOMATCH);
2657: break;
2658:
2659: case PT_SPACE: /* Perl space */
1.1.1.2 misho 2660: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1 misho 2661: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2662: == (op == OP_NOTPROP))
2663: RRETURN(MATCH_NOMATCH);
2664: break;
2665:
2666: case PT_PXSPACE: /* POSIX space */
1.1.1.2 misho 2667: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1 misho 2668: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2669: c == CHAR_FF || c == CHAR_CR)
2670: == (op == OP_NOTPROP))
2671: RRETURN(MATCH_NOMATCH);
2672: break;
2673:
2674: case PT_WORD:
1.1.1.2 misho 2675: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2676: PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1.1 misho 2677: c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2678: RRETURN(MATCH_NOMATCH);
2679: break;
2680:
1.1.1.4 ! misho 2681: case PT_CLIST:
! 2682: cp = PRIV(ucd_caseless_sets) + ecode[2];
! 2683: for (;;)
! 2684: {
! 2685: if (c < *cp)
! 2686: { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
! 2687: if (c == *cp++)
! 2688: { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
! 2689: }
! 2690: break;
! 2691:
! 2692: case PT_UCNC:
! 2693: if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
! 2694: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
! 2695: c >= 0xe000) == (op == OP_NOTPROP))
! 2696: RRETURN(MATCH_NOMATCH);
! 2697: break;
! 2698:
1.1 misho 2699: /* This should never occur */
2700:
2701: default:
2702: RRETURN(PCRE_ERROR_INTERNAL);
2703: }
2704:
2705: ecode += 3;
2706: }
2707: break;
2708:
2709: /* Match an extended Unicode sequence. We will get here only if the support
2710: is in the binary; otherwise a compile-time error occurs. */
2711:
2712: case OP_EXTUNI:
2713: if (eptr >= md->end_subject)
2714: {
2715: SCHECK_PARTIAL();
2716: RRETURN(MATCH_NOMATCH);
2717: }
1.1.1.4 ! misho 2718: else
1.1 misho 2719: {
1.1.1.4 ! misho 2720: int lgb, rgb;
! 2721: GETCHARINCTEST(c, eptr);
! 2722: lgb = UCD_GRAPHBREAK(c);
! 2723: while (eptr < md->end_subject)
! 2724: {
! 2725: int len = 1;
! 2726: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
! 2727: rgb = UCD_GRAPHBREAK(c);
! 2728: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
! 2729: lgb = rgb;
! 2730: eptr += len;
! 2731: }
1.1 misho 2732: }
1.1.1.3 misho 2733: CHECK_PARTIAL();
1.1 misho 2734: ecode++;
2735: break;
1.1.1.4 ! misho 2736: #endif /* SUPPORT_UCP */
1.1 misho 2737:
2738:
2739: /* Match a back reference, possibly repeatedly. Look past the end of the
2740: item to see if there is repeat information following. The code is similar
2741: to that for character classes, but repeated for efficiency. Then obey
2742: similar code to character type repeats - written out again for speed.
2743: However, if the referenced string is the empty string, always treat
2744: it as matched, any number of times (otherwise there could be infinite
2745: loops). */
2746:
2747: case OP_REF:
2748: case OP_REFI:
2749: caseless = op == OP_REFI;
2750: offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1.1.1.2 misho 2751: ecode += 1 + IMM2_SIZE;
1.1 misho 2752:
2753: /* If the reference is unset, there are two possibilities:
2754:
2755: (a) In the default, Perl-compatible state, set the length negative;
2756: this ensures that every attempt at a match fails. We can't just fail
2757: here, because of the possibility of quantifiers with zero minima.
2758:
2759: (b) If the JavaScript compatibility flag is set, set the length to zero
2760: so that the back reference matches an empty string.
2761:
2762: Otherwise, set the length to the length of what was matched by the
2763: referenced subpattern. */
2764:
2765: if (offset >= offset_top || md->offset_vector[offset] < 0)
2766: length = (md->jscript_compat)? 0 : -1;
2767: else
2768: length = md->offset_vector[offset+1] - md->offset_vector[offset];
2769:
2770: /* Set up for repetition, or handle the non-repeated case */
2771:
2772: switch (*ecode)
2773: {
2774: case OP_CRSTAR:
2775: case OP_CRMINSTAR:
2776: case OP_CRPLUS:
2777: case OP_CRMINPLUS:
2778: case OP_CRQUERY:
2779: case OP_CRMINQUERY:
2780: c = *ecode++ - OP_CRSTAR;
2781: minimize = (c & 1) != 0;
2782: min = rep_min[c]; /* Pick up values from tables; */
2783: max = rep_max[c]; /* zero for max => infinity */
2784: if (max == 0) max = INT_MAX;
2785: break;
2786:
2787: case OP_CRRANGE:
2788: case OP_CRMINRANGE:
2789: minimize = (*ecode == OP_CRMINRANGE);
2790: min = GET2(ecode, 1);
1.1.1.2 misho 2791: max = GET2(ecode, 1 + IMM2_SIZE);
1.1 misho 2792: if (max == 0) max = INT_MAX;
1.1.1.2 misho 2793: ecode += 1 + 2 * IMM2_SIZE;
1.1 misho 2794: break;
2795:
2796: default: /* No repeat follows */
2797: if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2798: {
1.1.1.3 misho 2799: if (length == -2) eptr = md->end_subject; /* Partial match */
1.1 misho 2800: CHECK_PARTIAL();
2801: RRETURN(MATCH_NOMATCH);
2802: }
2803: eptr += length;
2804: continue; /* With the main loop */
2805: }
2806:
2807: /* Handle repeated back references. If the length of the reference is
1.1.1.2 misho 2808: zero, just continue with the main loop. If the length is negative, it
2809: means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2810: zero, we can continue at the same level without recursion. For any other
2811: minimum, carrying on will result in NOMATCH. */
1.1 misho 2812:
2813: if (length == 0) continue;
1.1.1.2 misho 2814: if (length < 0 && min == 0) continue;
1.1 misho 2815:
2816: /* First, ensure the minimum number of matches are present. We get back
2817: the length of the reference string explicitly rather than passing the
2818: address of eptr, so that eptr can be a register variable. */
2819:
2820: for (i = 1; i <= min; i++)
2821: {
2822: int slength;
2823: if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2824: {
1.1.1.3 misho 2825: if (slength == -2) eptr = md->end_subject; /* Partial match */
1.1 misho 2826: CHECK_PARTIAL();
2827: RRETURN(MATCH_NOMATCH);
2828: }
2829: eptr += slength;
2830: }
2831:
2832: /* If min = max, continue at the same level without recursion.
2833: They are not both allowed to be zero. */
2834:
2835: if (min == max) continue;
2836:
2837: /* If minimizing, keep trying and advancing the pointer */
2838:
2839: if (minimize)
2840: {
2841: for (fi = min;; fi++)
2842: {
2843: int slength;
2844: RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2845: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2846: if (fi >= max) RRETURN(MATCH_NOMATCH);
2847: if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2848: {
1.1.1.3 misho 2849: if (slength == -2) eptr = md->end_subject; /* Partial match */
1.1 misho 2850: CHECK_PARTIAL();
2851: RRETURN(MATCH_NOMATCH);
2852: }
2853: eptr += slength;
2854: }
2855: /* Control never gets here */
2856: }
2857:
2858: /* If maximizing, find the longest string and work backwards */
2859:
2860: else
2861: {
2862: pp = eptr;
2863: for (i = min; i < max; i++)
2864: {
2865: int slength;
2866: if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2867: {
1.1.1.3 misho 2868: /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2869: the soft partial matching case. */
2870:
2871: if (slength == -2 && md->partial != 0 &&
2872: md->end_subject > md->start_used_ptr)
2873: {
2874: md->hitend = TRUE;
2875: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2876: }
1.1 misho 2877: break;
2878: }
2879: eptr += slength;
2880: }
1.1.1.3 misho 2881:
1.1 misho 2882: while (eptr >= pp)
2883: {
2884: RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2885: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2886: eptr -= length;
2887: }
2888: RRETURN(MATCH_NOMATCH);
2889: }
2890: /* Control never gets here */
2891:
2892: /* Match a bit-mapped character class, possibly repeatedly. This op code is
2893: used when all the characters in the class have values in the range 0-255,
2894: and either the matching is caseful, or the characters are in the range
2895: 0-127 when UTF-8 processing is enabled. The only difference between
2896: OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2897: encountered.
2898:
2899: First, look past the end of the item to see if there is repeat information
2900: following. Then obey similar code to character type repeats - written out
2901: again for speed. */
2902:
2903: case OP_NCLASS:
2904: case OP_CLASS:
2905: {
1.1.1.2 misho 2906: /* The data variable is saved across frames, so the byte map needs to
2907: be stored there. */
2908: #define BYTE_MAP ((pcre_uint8 *)data)
1.1 misho 2909: data = ecode + 1; /* Save for matching */
1.1.1.2 misho 2910: ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
1.1 misho 2911:
2912: switch (*ecode)
2913: {
2914: case OP_CRSTAR:
2915: case OP_CRMINSTAR:
2916: case OP_CRPLUS:
2917: case OP_CRMINPLUS:
2918: case OP_CRQUERY:
2919: case OP_CRMINQUERY:
2920: c = *ecode++ - OP_CRSTAR;
2921: minimize = (c & 1) != 0;
2922: min = rep_min[c]; /* Pick up values from tables; */
2923: max = rep_max[c]; /* zero for max => infinity */
2924: if (max == 0) max = INT_MAX;
2925: break;
2926:
2927: case OP_CRRANGE:
2928: case OP_CRMINRANGE:
2929: minimize = (*ecode == OP_CRMINRANGE);
2930: min = GET2(ecode, 1);
1.1.1.2 misho 2931: max = GET2(ecode, 1 + IMM2_SIZE);
1.1 misho 2932: if (max == 0) max = INT_MAX;
1.1.1.2 misho 2933: ecode += 1 + 2 * IMM2_SIZE;
1.1 misho 2934: break;
2935:
2936: default: /* No repeat follows */
2937: min = max = 1;
2938: break;
2939: }
2940:
2941: /* First, ensure the minimum number of matches are present. */
2942:
1.1.1.2 misho 2943: #ifdef SUPPORT_UTF
2944: if (utf)
1.1 misho 2945: {
2946: for (i = 1; i <= min; i++)
2947: {
2948: if (eptr >= md->end_subject)
2949: {
2950: SCHECK_PARTIAL();
2951: RRETURN(MATCH_NOMATCH);
2952: }
2953: GETCHARINC(c, eptr);
2954: if (c > 255)
2955: {
2956: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2957: }
2958: else
1.1.1.2 misho 2959: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1.1 misho 2960: }
2961: }
2962: else
2963: #endif
1.1.1.2 misho 2964: /* Not UTF mode */
1.1 misho 2965: {
2966: for (i = 1; i <= min; i++)
2967: {
2968: if (eptr >= md->end_subject)
2969: {
2970: SCHECK_PARTIAL();
2971: RRETURN(MATCH_NOMATCH);
2972: }
2973: c = *eptr++;
1.1.1.2 misho 2974: #ifndef COMPILE_PCRE8
2975: if (c > 255)
2976: {
2977: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2978: }
2979: else
2980: #endif
2981: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1.1 misho 2982: }
2983: }
2984:
2985: /* If max == min we can continue with the main loop without the
2986: need to recurse. */
2987:
2988: if (min == max) continue;
2989:
2990: /* If minimizing, keep testing the rest of the expression and advancing
2991: the pointer while it matches the class. */
2992:
2993: if (minimize)
2994: {
1.1.1.2 misho 2995: #ifdef SUPPORT_UTF
2996: if (utf)
1.1 misho 2997: {
2998: for (fi = min;; fi++)
2999: {
3000: RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
3001: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3002: if (fi >= max) RRETURN(MATCH_NOMATCH);
3003: if (eptr >= md->end_subject)
3004: {
3005: SCHECK_PARTIAL();
3006: RRETURN(MATCH_NOMATCH);
3007: }
3008: GETCHARINC(c, eptr);
3009: if (c > 255)
3010: {
3011: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3012: }
3013: else
1.1.1.2 misho 3014: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1.1 misho 3015: }
3016: }
3017: else
3018: #endif
1.1.1.2 misho 3019: /* Not UTF mode */
1.1 misho 3020: {
3021: for (fi = min;; fi++)
3022: {
3023: RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
3024: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3025: if (fi >= max) RRETURN(MATCH_NOMATCH);
3026: if (eptr >= md->end_subject)
3027: {
3028: SCHECK_PARTIAL();
3029: RRETURN(MATCH_NOMATCH);
3030: }
3031: c = *eptr++;
1.1.1.2 misho 3032: #ifndef COMPILE_PCRE8
3033: if (c > 255)
3034: {
3035: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3036: }
3037: else
3038: #endif
3039: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1.1 misho 3040: }
3041: }
3042: /* Control never gets here */
3043: }
3044:
3045: /* If maximizing, find the longest possible run, then work backwards. */
3046:
3047: else
3048: {
3049: pp = eptr;
3050:
1.1.1.2 misho 3051: #ifdef SUPPORT_UTF
3052: if (utf)
1.1 misho 3053: {
3054: for (i = min; i < max; i++)
3055: {
3056: int len = 1;
3057: if (eptr >= md->end_subject)
3058: {
3059: SCHECK_PARTIAL();
3060: break;
3061: }
3062: GETCHARLEN(c, eptr, len);
3063: if (c > 255)
3064: {
3065: if (op == OP_CLASS) break;
3066: }
3067: else
1.1.1.2 misho 3068: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
1.1 misho 3069: eptr += len;
3070: }
3071: for (;;)
3072: {
3073: RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3074: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3075: if (eptr-- == pp) break; /* Stop if tried at original pos */
3076: BACKCHAR(eptr);
3077: }
3078: }
3079: else
3080: #endif
1.1.1.2 misho 3081: /* Not UTF mode */
1.1 misho 3082: {
3083: for (i = min; i < max; i++)
3084: {
3085: if (eptr >= md->end_subject)
3086: {
3087: SCHECK_PARTIAL();
3088: break;
3089: }
3090: c = *eptr;
1.1.1.2 misho 3091: #ifndef COMPILE_PCRE8
3092: if (c > 255)
3093: {
3094: if (op == OP_CLASS) break;
3095: }
3096: else
3097: #endif
3098: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
1.1 misho 3099: eptr++;
3100: }
3101: while (eptr >= pp)
3102: {
3103: RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3104: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3105: eptr--;
3106: }
3107: }
3108:
3109: RRETURN(MATCH_NOMATCH);
3110: }
1.1.1.2 misho 3111: #undef BYTE_MAP
1.1 misho 3112: }
3113: /* Control never gets here */
3114:
3115:
3116: /* Match an extended character class. This opcode is encountered only
3117: when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3118: mode, because Unicode properties are supported in non-UTF-8 mode. */
3119:
1.1.1.2 misho 3120: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1.1 misho 3121: case OP_XCLASS:
3122: {
3123: data = ecode + 1 + LINK_SIZE; /* Save for matching */
3124: ecode += GET(ecode, 1); /* Advance past the item */
3125:
3126: switch (*ecode)
3127: {
3128: case OP_CRSTAR:
3129: case OP_CRMINSTAR:
3130: case OP_CRPLUS:
3131: case OP_CRMINPLUS:
3132: case OP_CRQUERY:
3133: case OP_CRMINQUERY:
3134: c = *ecode++ - OP_CRSTAR;
3135: minimize = (c & 1) != 0;
3136: min = rep_min[c]; /* Pick up values from tables; */
3137: max = rep_max[c]; /* zero for max => infinity */
3138: if (max == 0) max = INT_MAX;
3139: break;
3140:
3141: case OP_CRRANGE:
3142: case OP_CRMINRANGE:
3143: minimize = (*ecode == OP_CRMINRANGE);
3144: min = GET2(ecode, 1);
1.1.1.2 misho 3145: max = GET2(ecode, 1 + IMM2_SIZE);
1.1 misho 3146: if (max == 0) max = INT_MAX;
1.1.1.2 misho 3147: ecode += 1 + 2 * IMM2_SIZE;
1.1 misho 3148: break;
3149:
3150: default: /* No repeat follows */
3151: min = max = 1;
3152: break;
3153: }
3154:
3155: /* First, ensure the minimum number of matches are present. */
3156:
3157: for (i = 1; i <= min; i++)
3158: {
3159: if (eptr >= md->end_subject)
3160: {
3161: SCHECK_PARTIAL();
3162: RRETURN(MATCH_NOMATCH);
3163: }
3164: GETCHARINCTEST(c, eptr);
1.1.1.2 misho 3165: if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
1.1 misho 3166: }
3167:
3168: /* If max == min we can continue with the main loop without the
3169: need to recurse. */
3170:
3171: if (min == max) continue;
3172:
3173: /* If minimizing, keep testing the rest of the expression and advancing
3174: the pointer while it matches the class. */
3175:
3176: if (minimize)
3177: {
3178: for (fi = min;; fi++)
3179: {
3180: RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3181: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3182: if (fi >= max) RRETURN(MATCH_NOMATCH);
3183: if (eptr >= md->end_subject)
3184: {
3185: SCHECK_PARTIAL();
3186: RRETURN(MATCH_NOMATCH);
3187: }
3188: GETCHARINCTEST(c, eptr);
1.1.1.2 misho 3189: if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
1.1 misho 3190: }
3191: /* Control never gets here */
3192: }
3193:
3194: /* If maximizing, find the longest possible run, then work backwards. */
3195:
3196: else
3197: {
3198: pp = eptr;
3199: for (i = min; i < max; i++)
3200: {
3201: int len = 1;
3202: if (eptr >= md->end_subject)
3203: {
3204: SCHECK_PARTIAL();
3205: break;
3206: }
1.1.1.2 misho 3207: #ifdef SUPPORT_UTF
1.1 misho 3208: GETCHARLENTEST(c, eptr, len);
1.1.1.2 misho 3209: #else
3210: c = *eptr;
3211: #endif
3212: if (!PRIV(xclass)(c, data, utf)) break;
1.1 misho 3213: eptr += len;
3214: }
3215: for(;;)
3216: {
3217: RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3218: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3219: if (eptr-- == pp) break; /* Stop if tried at original pos */
1.1.1.2 misho 3220: #ifdef SUPPORT_UTF
3221: if (utf) BACKCHAR(eptr);
3222: #endif
1.1 misho 3223: }
3224: RRETURN(MATCH_NOMATCH);
3225: }
3226:
3227: /* Control never gets here */
3228: }
3229: #endif /* End of XCLASS */
3230:
3231: /* Match a single character, casefully */
3232:
3233: case OP_CHAR:
1.1.1.2 misho 3234: #ifdef SUPPORT_UTF
3235: if (utf)
1.1 misho 3236: {
3237: length = 1;
3238: ecode++;
3239: GETCHARLEN(fc, ecode, length);
3240: if (length > md->end_subject - eptr)
3241: {
3242: CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3243: RRETURN(MATCH_NOMATCH);
3244: }
1.1.1.4 ! misho 3245: while (length-- > 0) if (*ecode++ != RAWUCHARINC(eptr)) RRETURN(MATCH_NOMATCH);
1.1 misho 3246: }
3247: else
3248: #endif
1.1.1.2 misho 3249: /* Not UTF mode */
1.1 misho 3250: {
3251: if (md->end_subject - eptr < 1)
3252: {
3253: SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3254: RRETURN(MATCH_NOMATCH);
3255: }
3256: if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3257: ecode += 2;
3258: }
3259: break;
3260:
3261: /* Match a single character, caselessly. If we are at the end of the
3262: subject, give up immediately. */
3263:
3264: case OP_CHARI:
3265: if (eptr >= md->end_subject)
3266: {
3267: SCHECK_PARTIAL();
3268: RRETURN(MATCH_NOMATCH);
3269: }
3270:
1.1.1.2 misho 3271: #ifdef SUPPORT_UTF
3272: if (utf)
1.1 misho 3273: {
3274: length = 1;
3275: ecode++;
3276: GETCHARLEN(fc, ecode, length);
3277:
3278: /* If the pattern character's value is < 128, we have only one byte, and
3279: we know that its other case must also be one byte long, so we can use the
3280: fast lookup table. We know that there is at least one byte left in the
3281: subject. */
3282:
3283: if (fc < 128)
3284: {
1.1.1.4 ! misho 3285: pcre_uint32 cc = RAWUCHAR(eptr);
! 3286: if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 3287: ecode++;
3288: eptr++;
1.1 misho 3289: }
3290:
3291: /* Otherwise we must pick up the subject character. Note that we cannot
3292: use the value of "length" to check for sufficient bytes left, because the
3293: other case of the character may have more or fewer bytes. */
3294:
3295: else
3296: {
1.1.1.4 ! misho 3297: pcre_uint32 dc;
1.1 misho 3298: GETCHARINC(dc, eptr);
3299: ecode += length;
3300:
3301: /* If we have Unicode property support, we can use it to test the other
3302: case of the character, if there is one. */
3303:
3304: if (fc != dc)
3305: {
3306: #ifdef SUPPORT_UCP
3307: if (dc != UCD_OTHERCASE(fc))
3308: #endif
3309: RRETURN(MATCH_NOMATCH);
3310: }
3311: }
3312: }
3313: else
1.1.1.2 misho 3314: #endif /* SUPPORT_UTF */
1.1 misho 3315:
1.1.1.2 misho 3316: /* Not UTF mode */
1.1 misho 3317: {
1.1.1.2 misho 3318: if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3319: != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3320: eptr++;
1.1 misho 3321: ecode += 2;
3322: }
3323: break;
3324:
3325: /* Match a single character repeatedly. */
3326:
3327: case OP_EXACT:
3328: case OP_EXACTI:
3329: min = max = GET2(ecode, 1);
1.1.1.2 misho 3330: ecode += 1 + IMM2_SIZE;
1.1 misho 3331: goto REPEATCHAR;
3332:
3333: case OP_POSUPTO:
3334: case OP_POSUPTOI:
3335: possessive = TRUE;
3336: /* Fall through */
3337:
3338: case OP_UPTO:
3339: case OP_UPTOI:
3340: case OP_MINUPTO:
3341: case OP_MINUPTOI:
3342: min = 0;
3343: max = GET2(ecode, 1);
3344: minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
1.1.1.2 misho 3345: ecode += 1 + IMM2_SIZE;
1.1 misho 3346: goto REPEATCHAR;
3347:
3348: case OP_POSSTAR:
3349: case OP_POSSTARI:
3350: possessive = TRUE;
3351: min = 0;
3352: max = INT_MAX;
3353: ecode++;
3354: goto REPEATCHAR;
3355:
3356: case OP_POSPLUS:
3357: case OP_POSPLUSI:
3358: possessive = TRUE;
3359: min = 1;
3360: max = INT_MAX;
3361: ecode++;
3362: goto REPEATCHAR;
3363:
3364: case OP_POSQUERY:
3365: case OP_POSQUERYI:
3366: possessive = TRUE;
3367: min = 0;
3368: max = 1;
3369: ecode++;
3370: goto REPEATCHAR;
3371:
3372: case OP_STAR:
3373: case OP_STARI:
3374: case OP_MINSTAR:
3375: case OP_MINSTARI:
3376: case OP_PLUS:
3377: case OP_PLUSI:
3378: case OP_MINPLUS:
3379: case OP_MINPLUSI:
3380: case OP_QUERY:
3381: case OP_QUERYI:
3382: case OP_MINQUERY:
3383: case OP_MINQUERYI:
3384: c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3385: minimize = (c & 1) != 0;
3386: min = rep_min[c]; /* Pick up values from tables; */
3387: max = rep_max[c]; /* zero for max => infinity */
3388: if (max == 0) max = INT_MAX;
3389:
1.1.1.4 ! misho 3390: /* Common code for all repeated single-character matches. We first check
! 3391: for the minimum number of characters. If the minimum equals the maximum, we
! 3392: are done. Otherwise, if minimizing, check the rest of the pattern for a
! 3393: match; if there isn't one, advance up to the maximum, one character at a
! 3394: time.
! 3395:
! 3396: If maximizing, advance up to the maximum number of matching characters,
! 3397: until eptr is past the end of the maximum run. If possessive, we are
! 3398: then done (no backing up). Otherwise, match at this position; anything
! 3399: other than no match is immediately returned. For nomatch, back up one
! 3400: character, unless we are matching \R and the last thing matched was
! 3401: \r\n, in which case, back up two bytes. When we reach the first optional
! 3402: character position, we can save stack by doing a tail recurse.
! 3403:
! 3404: The various UTF/non-UTF and caseful/caseless cases are handled separately,
! 3405: for speed. */
1.1 misho 3406:
3407: REPEATCHAR:
1.1.1.2 misho 3408: #ifdef SUPPORT_UTF
3409: if (utf)
1.1 misho 3410: {
3411: length = 1;
3412: charptr = ecode;
3413: GETCHARLEN(fc, ecode, length);
3414: ecode += length;
3415:
3416: /* Handle multibyte character matching specially here. There is
3417: support for caseless matching if UCP support is present. */
3418:
3419: if (length > 1)
3420: {
3421: #ifdef SUPPORT_UCP
1.1.1.4 ! misho 3422: pcre_uint32 othercase;
1.1 misho 3423: if (op >= OP_STARI && /* Caseless */
3424: (othercase = UCD_OTHERCASE(fc)) != fc)
1.1.1.2 misho 3425: oclength = PRIV(ord2utf)(othercase, occhars);
1.1 misho 3426: else oclength = 0;
3427: #endif /* SUPPORT_UCP */
3428:
3429: for (i = 1; i <= min; i++)
3430: {
3431: if (eptr <= md->end_subject - length &&
1.1.1.2 misho 3432: memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
1.1 misho 3433: #ifdef SUPPORT_UCP
3434: else if (oclength > 0 &&
3435: eptr <= md->end_subject - oclength &&
1.1.1.2 misho 3436: memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
1.1 misho 3437: #endif /* SUPPORT_UCP */
3438: else
3439: {
3440: CHECK_PARTIAL();
3441: RRETURN(MATCH_NOMATCH);
3442: }
3443: }
3444:
3445: if (min == max) continue;
3446:
3447: if (minimize)
3448: {
3449: for (fi = min;; fi++)
3450: {
3451: RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3452: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3453: if (fi >= max) RRETURN(MATCH_NOMATCH);
3454: if (eptr <= md->end_subject - length &&
1.1.1.2 misho 3455: memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
1.1 misho 3456: #ifdef SUPPORT_UCP
3457: else if (oclength > 0 &&
3458: eptr <= md->end_subject - oclength &&
1.1.1.2 misho 3459: memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
1.1 misho 3460: #endif /* SUPPORT_UCP */
3461: else
3462: {
3463: CHECK_PARTIAL();
3464: RRETURN(MATCH_NOMATCH);
3465: }
3466: }
3467: /* Control never gets here */
3468: }
3469:
3470: else /* Maximize */
3471: {
3472: pp = eptr;
3473: for (i = min; i < max; i++)
3474: {
3475: if (eptr <= md->end_subject - length &&
1.1.1.2 misho 3476: memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
1.1 misho 3477: #ifdef SUPPORT_UCP
3478: else if (oclength > 0 &&
3479: eptr <= md->end_subject - oclength &&
1.1.1.2 misho 3480: memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
1.1 misho 3481: #endif /* SUPPORT_UCP */
3482: else
3483: {
3484: CHECK_PARTIAL();
3485: break;
3486: }
3487: }
3488:
1.1.1.4 ! misho 3489: if (possessive) continue; /* No backtracking */
1.1 misho 3490: for(;;)
3491: {
1.1.1.4 ! misho 3492: if (eptr == pp) goto TAIL_RECURSE;
1.1 misho 3493: RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3494: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3495: #ifdef SUPPORT_UCP
3496: eptr--;
3497: BACKCHAR(eptr);
3498: #else /* without SUPPORT_UCP */
3499: eptr -= length;
3500: #endif /* SUPPORT_UCP */
3501: }
3502: }
3503: /* Control never gets here */
3504: }
3505:
3506: /* If the length of a UTF-8 character is 1, we fall through here, and
3507: obey the code as for non-UTF-8 characters below, though in this case the
3508: value of fc will always be < 128. */
3509: }
3510: else
1.1.1.2 misho 3511: #endif /* SUPPORT_UTF */
3512: /* When not in UTF-8 mode, load a single-byte character. */
3513: fc = *ecode++;
1.1 misho 3514:
1.1.1.2 misho 3515: /* The value of fc at this point is always one character, though we may
3516: or may not be in UTF mode. The code is duplicated for the caseless and
1.1 misho 3517: caseful cases, for speed, since matching characters is likely to be quite
3518: common. First, ensure the minimum number of matches are present. If min =
3519: max, continue at the same level without recursing. Otherwise, if
3520: minimizing, keep trying the rest of the expression and advancing one
3521: matching character if failing, up to the maximum. Alternatively, if
3522: maximizing, find the maximum number of characters and work backwards. */
3523:
3524: DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
1.1.1.3 misho 3525: max, (char *)eptr));
1.1 misho 3526:
3527: if (op >= OP_STARI) /* Caseless */
3528: {
1.1.1.2 misho 3529: #ifdef COMPILE_PCRE8
3530: /* fc must be < 128 if UTF is enabled. */
3531: foc = md->fcc[fc];
3532: #else
3533: #ifdef SUPPORT_UTF
3534: #ifdef SUPPORT_UCP
3535: if (utf && fc > 127)
3536: foc = UCD_OTHERCASE(fc);
3537: #else
3538: if (utf && fc > 127)
3539: foc = fc;
3540: #endif /* SUPPORT_UCP */
3541: else
3542: #endif /* SUPPORT_UTF */
3543: foc = TABLE_GET(fc, md->fcc, fc);
3544: #endif /* COMPILE_PCRE8 */
3545:
1.1 misho 3546: for (i = 1; i <= min; i++)
3547: {
1.1.1.4 ! misho 3548: pcre_uint32 cc; /* Faster than pcre_uchar */
1.1 misho 3549: if (eptr >= md->end_subject)
3550: {
3551: SCHECK_PARTIAL();
3552: RRETURN(MATCH_NOMATCH);
3553: }
1.1.1.4 ! misho 3554: cc = RAWUCHARTEST(eptr);
! 3555: if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 3556: eptr++;
1.1 misho 3557: }
3558: if (min == max) continue;
3559: if (minimize)
3560: {
3561: for (fi = min;; fi++)
3562: {
1.1.1.4 ! misho 3563: pcre_uint32 cc; /* Faster than pcre_uchar */
1.1 misho 3564: RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3565: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3566: if (fi >= max) RRETURN(MATCH_NOMATCH);
3567: if (eptr >= md->end_subject)
3568: {
3569: SCHECK_PARTIAL();
3570: RRETURN(MATCH_NOMATCH);
3571: }
1.1.1.4 ! misho 3572: cc = RAWUCHARTEST(eptr);
! 3573: if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 3574: eptr++;
1.1 misho 3575: }
3576: /* Control never gets here */
3577: }
3578: else /* Maximize */
3579: {
3580: pp = eptr;
3581: for (i = min; i < max; i++)
3582: {
1.1.1.4 ! misho 3583: pcre_uint32 cc; /* Faster than pcre_uchar */
1.1 misho 3584: if (eptr >= md->end_subject)
3585: {
3586: SCHECK_PARTIAL();
3587: break;
3588: }
1.1.1.4 ! misho 3589: cc = RAWUCHARTEST(eptr);
! 3590: if (fc != cc && foc != cc) break;
1.1 misho 3591: eptr++;
3592: }
3593:
1.1.1.4 ! misho 3594: if (possessive) continue; /* No backtracking */
! 3595: for (;;)
1.1 misho 3596: {
1.1.1.4 ! misho 3597: if (eptr == pp) goto TAIL_RECURSE;
1.1 misho 3598: RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3599: eptr--;
3600: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3601: }
3602: RRETURN(MATCH_NOMATCH);
3603: }
3604: /* Control never gets here */
3605: }
3606:
3607: /* Caseful comparisons (includes all multi-byte characters) */
3608:
3609: else
3610: {
3611: for (i = 1; i <= min; i++)
3612: {
3613: if (eptr >= md->end_subject)
3614: {
3615: SCHECK_PARTIAL();
3616: RRETURN(MATCH_NOMATCH);
3617: }
1.1.1.4 ! misho 3618: if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
1.1 misho 3619: }
3620:
3621: if (min == max) continue;
3622:
3623: if (minimize)
3624: {
3625: for (fi = min;; fi++)
3626: {
3627: RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3628: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3629: if (fi >= max) RRETURN(MATCH_NOMATCH);
3630: if (eptr >= md->end_subject)
3631: {
3632: SCHECK_PARTIAL();
3633: RRETURN(MATCH_NOMATCH);
3634: }
1.1.1.4 ! misho 3635: if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
1.1 misho 3636: }
3637: /* Control never gets here */
3638: }
3639: else /* Maximize */
3640: {
3641: pp = eptr;
3642: for (i = min; i < max; i++)
3643: {
3644: if (eptr >= md->end_subject)
3645: {
3646: SCHECK_PARTIAL();
3647: break;
3648: }
1.1.1.4 ! misho 3649: if (fc != RAWUCHARTEST(eptr)) break;
1.1 misho 3650: eptr++;
3651: }
1.1.1.4 ! misho 3652: if (possessive) continue; /* No backtracking */
! 3653: for (;;)
1.1 misho 3654: {
1.1.1.4 ! misho 3655: if (eptr == pp) goto TAIL_RECURSE;
1.1 misho 3656: RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3657: eptr--;
3658: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3659: }
3660: RRETURN(MATCH_NOMATCH);
3661: }
3662: }
3663: /* Control never gets here */
3664:
3665: /* Match a negated single one-byte character. The character we are
3666: checking can be multibyte. */
3667:
3668: case OP_NOT:
3669: case OP_NOTI:
3670: if (eptr >= md->end_subject)
3671: {
3672: SCHECK_PARTIAL();
3673: RRETURN(MATCH_NOMATCH);
3674: }
1.1.1.3 misho 3675: #ifdef SUPPORT_UTF
3676: if (utf)
1.1 misho 3677: {
1.1.1.4 ! misho 3678: register pcre_uint32 ch, och;
1.1.1.3 misho 3679:
3680: ecode++;
3681: GETCHARINC(ch, ecode);
3682: GETCHARINC(c, eptr);
3683:
3684: if (op == OP_NOT)
3685: {
3686: if (ch == c) RRETURN(MATCH_NOMATCH);
3687: }
3688: else
3689: {
1.1.1.2 misho 3690: #ifdef SUPPORT_UCP
1.1.1.3 misho 3691: if (ch > 127)
3692: och = UCD_OTHERCASE(ch);
1.1.1.2 misho 3693: #else
1.1.1.3 misho 3694: if (ch > 127)
3695: och = ch;
1.1.1.2 misho 3696: #endif /* SUPPORT_UCP */
1.1.1.3 misho 3697: else
3698: och = TABLE_GET(ch, md->fcc, ch);
3699: if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3700: }
1.1 misho 3701: }
1.1.1.3 misho 3702: else
3703: #endif
1.1 misho 3704: {
1.1.1.4 ! misho 3705: register pcre_uint32 ch = ecode[1];
1.1.1.3 misho 3706: c = *eptr++;
3707: if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3708: RRETURN(MATCH_NOMATCH);
3709: ecode += 2;
1.1 misho 3710: }
3711: break;
3712:
3713: /* Match a negated single one-byte character repeatedly. This is almost a
3714: repeat of the code for a repeated single character, but I haven't found a
3715: nice way of commoning these up that doesn't require a test of the
3716: positive/negative option for each character match. Maybe that wouldn't add
3717: very much to the time taken, but character matching *is* what this is all
3718: about... */
3719:
3720: case OP_NOTEXACT:
3721: case OP_NOTEXACTI:
3722: min = max = GET2(ecode, 1);
1.1.1.2 misho 3723: ecode += 1 + IMM2_SIZE;
1.1 misho 3724: goto REPEATNOTCHAR;
3725:
3726: case OP_NOTUPTO:
3727: case OP_NOTUPTOI:
3728: case OP_NOTMINUPTO:
3729: case OP_NOTMINUPTOI:
3730: min = 0;
3731: max = GET2(ecode, 1);
3732: minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
1.1.1.2 misho 3733: ecode += 1 + IMM2_SIZE;
1.1 misho 3734: goto REPEATNOTCHAR;
3735:
3736: case OP_NOTPOSSTAR:
3737: case OP_NOTPOSSTARI:
3738: possessive = TRUE;
3739: min = 0;
3740: max = INT_MAX;
3741: ecode++;
3742: goto REPEATNOTCHAR;
3743:
3744: case OP_NOTPOSPLUS:
3745: case OP_NOTPOSPLUSI:
3746: possessive = TRUE;
3747: min = 1;
3748: max = INT_MAX;
3749: ecode++;
3750: goto REPEATNOTCHAR;
3751:
3752: case OP_NOTPOSQUERY:
3753: case OP_NOTPOSQUERYI:
3754: possessive = TRUE;
3755: min = 0;
3756: max = 1;
3757: ecode++;
3758: goto REPEATNOTCHAR;
3759:
3760: case OP_NOTPOSUPTO:
3761: case OP_NOTPOSUPTOI:
3762: possessive = TRUE;
3763: min = 0;
3764: max = GET2(ecode, 1);
1.1.1.2 misho 3765: ecode += 1 + IMM2_SIZE;
1.1 misho 3766: goto REPEATNOTCHAR;
3767:
3768: case OP_NOTSTAR:
3769: case OP_NOTSTARI:
3770: case OP_NOTMINSTAR:
3771: case OP_NOTMINSTARI:
3772: case OP_NOTPLUS:
3773: case OP_NOTPLUSI:
3774: case OP_NOTMINPLUS:
3775: case OP_NOTMINPLUSI:
3776: case OP_NOTQUERY:
3777: case OP_NOTQUERYI:
3778: case OP_NOTMINQUERY:
3779: case OP_NOTMINQUERYI:
3780: c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3781: minimize = (c & 1) != 0;
3782: min = rep_min[c]; /* Pick up values from tables; */
3783: max = rep_max[c]; /* zero for max => infinity */
3784: if (max == 0) max = INT_MAX;
3785:
3786: /* Common code for all repeated single-byte matches. */
3787:
3788: REPEATNOTCHAR:
1.1.1.3 misho 3789: GETCHARINCTEST(fc, ecode);
1.1 misho 3790:
3791: /* The code is duplicated for the caseless and caseful cases, for speed,
3792: since matching characters is likely to be quite common. First, ensure the
3793: minimum number of matches are present. If min = max, continue at the same
3794: level without recursing. Otherwise, if minimizing, keep trying the rest of
3795: the expression and advancing one matching character if failing, up to the
3796: maximum. Alternatively, if maximizing, find the maximum number of
3797: characters and work backwards. */
3798:
3799: DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
1.1.1.3 misho 3800: max, (char *)eptr));
1.1 misho 3801:
3802: if (op >= OP_NOTSTARI) /* Caseless */
3803: {
1.1.1.2 misho 3804: #ifdef SUPPORT_UTF
3805: #ifdef SUPPORT_UCP
3806: if (utf && fc > 127)
3807: foc = UCD_OTHERCASE(fc);
3808: #else
3809: if (utf && fc > 127)
3810: foc = fc;
3811: #endif /* SUPPORT_UCP */
3812: else
3813: #endif /* SUPPORT_UTF */
3814: foc = TABLE_GET(fc, md->fcc, fc);
1.1 misho 3815:
1.1.1.2 misho 3816: #ifdef SUPPORT_UTF
3817: if (utf)
1.1 misho 3818: {
1.1.1.4 ! misho 3819: register pcre_uint32 d;
1.1 misho 3820: for (i = 1; i <= min; i++)
3821: {
3822: if (eptr >= md->end_subject)
3823: {
3824: SCHECK_PARTIAL();
3825: RRETURN(MATCH_NOMATCH);
3826: }
3827: GETCHARINC(d, eptr);
1.1.1.3 misho 3828: if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
1.1 misho 3829: }
3830: }
3831: else
1.1.1.4 ! misho 3832: #endif /* SUPPORT_UTF */
1.1.1.2 misho 3833: /* Not UTF mode */
1.1 misho 3834: {
3835: for (i = 1; i <= min; i++)
3836: {
3837: if (eptr >= md->end_subject)
3838: {
3839: SCHECK_PARTIAL();
3840: RRETURN(MATCH_NOMATCH);
3841: }
1.1.1.2 misho 3842: if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3843: eptr++;
1.1 misho 3844: }
3845: }
3846:
3847: if (min == max) continue;
3848:
3849: if (minimize)
3850: {
1.1.1.2 misho 3851: #ifdef SUPPORT_UTF
3852: if (utf)
1.1 misho 3853: {
1.1.1.4 ! misho 3854: register pcre_uint32 d;
1.1 misho 3855: for (fi = min;; fi++)
3856: {
3857: RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3858: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3859: if (fi >= max) RRETURN(MATCH_NOMATCH);
3860: if (eptr >= md->end_subject)
3861: {
3862: SCHECK_PARTIAL();
3863: RRETURN(MATCH_NOMATCH);
3864: }
3865: GETCHARINC(d, eptr);
1.1.1.2 misho 3866: if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
1.1 misho 3867: }
3868: }
3869: else
1.1.1.4 ! misho 3870: #endif /*SUPPORT_UTF */
1.1.1.2 misho 3871: /* Not UTF mode */
1.1 misho 3872: {
3873: for (fi = min;; fi++)
3874: {
3875: RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3876: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3877: if (fi >= max) RRETURN(MATCH_NOMATCH);
3878: if (eptr >= md->end_subject)
3879: {
3880: SCHECK_PARTIAL();
3881: RRETURN(MATCH_NOMATCH);
3882: }
1.1.1.2 misho 3883: if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3884: eptr++;
1.1 misho 3885: }
3886: }
3887: /* Control never gets here */
3888: }
3889:
3890: /* Maximize case */
3891:
3892: else
3893: {
3894: pp = eptr;
3895:
1.1.1.2 misho 3896: #ifdef SUPPORT_UTF
3897: if (utf)
1.1 misho 3898: {
1.1.1.4 ! misho 3899: register pcre_uint32 d;
1.1 misho 3900: for (i = min; i < max; i++)
3901: {
3902: int len = 1;
3903: if (eptr >= md->end_subject)
3904: {
3905: SCHECK_PARTIAL();
3906: break;
3907: }
3908: GETCHARLEN(d, eptr, len);
1.1.1.2 misho 3909: if (fc == d || (unsigned int)foc == d) break;
1.1 misho 3910: eptr += len;
3911: }
1.1.1.4 ! misho 3912: if (possessive) continue; /* No backtracking */
1.1.1.2 misho 3913: for(;;)
1.1 misho 3914: {
1.1.1.4 ! misho 3915: if (eptr == pp) goto TAIL_RECURSE;
1.1 misho 3916: RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3917: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1.1.4 ! misho 3918: eptr--;
1.1 misho 3919: BACKCHAR(eptr);
3920: }
3921: }
3922: else
1.1.1.4 ! misho 3923: #endif /* SUPPORT_UTF */
1.1.1.2 misho 3924: /* Not UTF mode */
1.1 misho 3925: {
3926: for (i = min; i < max; i++)
3927: {
3928: if (eptr >= md->end_subject)
3929: {
3930: SCHECK_PARTIAL();
3931: break;
3932: }
1.1.1.2 misho 3933: if (fc == *eptr || foc == *eptr) break;
1.1 misho 3934: eptr++;
3935: }
1.1.1.4 ! misho 3936: if (possessive) continue; /* No backtracking */
! 3937: for (;;)
1.1 misho 3938: {
1.1.1.4 ! misho 3939: if (eptr == pp) goto TAIL_RECURSE;
1.1 misho 3940: RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3941: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3942: eptr--;
3943: }
3944: }
3945:
3946: RRETURN(MATCH_NOMATCH);
3947: }
3948: /* Control never gets here */
3949: }
3950:
3951: /* Caseful comparisons */
3952:
3953: else
3954: {
1.1.1.2 misho 3955: #ifdef SUPPORT_UTF
3956: if (utf)
1.1 misho 3957: {
1.1.1.4 ! misho 3958: register pcre_uint32 d;
1.1 misho 3959: for (i = 1; i <= min; i++)
3960: {
3961: if (eptr >= md->end_subject)
3962: {
3963: SCHECK_PARTIAL();
3964: RRETURN(MATCH_NOMATCH);
3965: }
3966: GETCHARINC(d, eptr);
3967: if (fc == d) RRETURN(MATCH_NOMATCH);
3968: }
3969: }
3970: else
3971: #endif
1.1.1.2 misho 3972: /* Not UTF mode */
1.1 misho 3973: {
3974: for (i = 1; i <= min; i++)
3975: {
3976: if (eptr >= md->end_subject)
3977: {
3978: SCHECK_PARTIAL();
3979: RRETURN(MATCH_NOMATCH);
3980: }
3981: if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3982: }
3983: }
3984:
3985: if (min == max) continue;
3986:
3987: if (minimize)
3988: {
1.1.1.2 misho 3989: #ifdef SUPPORT_UTF
3990: if (utf)
1.1 misho 3991: {
1.1.1.4 ! misho 3992: register pcre_uint32 d;
1.1 misho 3993: for (fi = min;; fi++)
3994: {
3995: RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3996: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3997: if (fi >= max) RRETURN(MATCH_NOMATCH);
3998: if (eptr >= md->end_subject)
3999: {
4000: SCHECK_PARTIAL();
4001: RRETURN(MATCH_NOMATCH);
4002: }
4003: GETCHARINC(d, eptr);
4004: if (fc == d) RRETURN(MATCH_NOMATCH);
4005: }
4006: }
4007: else
4008: #endif
1.1.1.2 misho 4009: /* Not UTF mode */
1.1 misho 4010: {
4011: for (fi = min;; fi++)
4012: {
4013: RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
4014: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4015: if (fi >= max) RRETURN(MATCH_NOMATCH);
4016: if (eptr >= md->end_subject)
4017: {
4018: SCHECK_PARTIAL();
4019: RRETURN(MATCH_NOMATCH);
4020: }
4021: if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
4022: }
4023: }
4024: /* Control never gets here */
4025: }
4026:
4027: /* Maximize case */
4028:
4029: else
4030: {
4031: pp = eptr;
4032:
1.1.1.2 misho 4033: #ifdef SUPPORT_UTF
4034: if (utf)
1.1 misho 4035: {
1.1.1.4 ! misho 4036: register pcre_uint32 d;
1.1 misho 4037: for (i = min; i < max; i++)
4038: {
4039: int len = 1;
4040: if (eptr >= md->end_subject)
4041: {
4042: SCHECK_PARTIAL();
4043: break;
4044: }
4045: GETCHARLEN(d, eptr, len);
4046: if (fc == d) break;
4047: eptr += len;
4048: }
1.1.1.4 ! misho 4049: if (possessive) continue; /* No backtracking */
1.1 misho 4050: for(;;)
4051: {
1.1.1.4 ! misho 4052: if (eptr == pp) goto TAIL_RECURSE;
1.1 misho 4053: RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
4054: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1.1.4 ! misho 4055: eptr--;
1.1 misho 4056: BACKCHAR(eptr);
4057: }
4058: }
4059: else
4060: #endif
1.1.1.2 misho 4061: /* Not UTF mode */
1.1 misho 4062: {
4063: for (i = min; i < max; i++)
4064: {
4065: if (eptr >= md->end_subject)
4066: {
4067: SCHECK_PARTIAL();
4068: break;
4069: }
4070: if (fc == *eptr) break;
4071: eptr++;
4072: }
1.1.1.4 ! misho 4073: if (possessive) continue; /* No backtracking */
! 4074: for (;;)
1.1 misho 4075: {
1.1.1.4 ! misho 4076: if (eptr == pp) goto TAIL_RECURSE;
1.1 misho 4077: RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
4078: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4079: eptr--;
4080: }
4081: }
4082:
4083: RRETURN(MATCH_NOMATCH);
4084: }
4085: }
4086: /* Control never gets here */
4087:
4088: /* Match a single character type repeatedly; several different opcodes
4089: share code. This is very similar to the code for single characters, but we
4090: repeat it in the interests of efficiency. */
4091:
4092: case OP_TYPEEXACT:
4093: min = max = GET2(ecode, 1);
4094: minimize = TRUE;
1.1.1.2 misho 4095: ecode += 1 + IMM2_SIZE;
1.1 misho 4096: goto REPEATTYPE;
4097:
4098: case OP_TYPEUPTO:
4099: case OP_TYPEMINUPTO:
4100: min = 0;
4101: max = GET2(ecode, 1);
4102: minimize = *ecode == OP_TYPEMINUPTO;
1.1.1.2 misho 4103: ecode += 1 + IMM2_SIZE;
1.1 misho 4104: goto REPEATTYPE;
4105:
4106: case OP_TYPEPOSSTAR:
4107: possessive = TRUE;
4108: min = 0;
4109: max = INT_MAX;
4110: ecode++;
4111: goto REPEATTYPE;
4112:
4113: case OP_TYPEPOSPLUS:
4114: possessive = TRUE;
4115: min = 1;
4116: max = INT_MAX;
4117: ecode++;
4118: goto REPEATTYPE;
4119:
4120: case OP_TYPEPOSQUERY:
4121: possessive = TRUE;
4122: min = 0;
4123: max = 1;
4124: ecode++;
4125: goto REPEATTYPE;
4126:
4127: case OP_TYPEPOSUPTO:
4128: possessive = TRUE;
4129: min = 0;
4130: max = GET2(ecode, 1);
1.1.1.2 misho 4131: ecode += 1 + IMM2_SIZE;
1.1 misho 4132: goto REPEATTYPE;
4133:
4134: case OP_TYPESTAR:
4135: case OP_TYPEMINSTAR:
4136: case OP_TYPEPLUS:
4137: case OP_TYPEMINPLUS:
4138: case OP_TYPEQUERY:
4139: case OP_TYPEMINQUERY:
4140: c = *ecode++ - OP_TYPESTAR;
4141: minimize = (c & 1) != 0;
4142: min = rep_min[c]; /* Pick up values from tables; */
4143: max = rep_max[c]; /* zero for max => infinity */
4144: if (max == 0) max = INT_MAX;
4145:
4146: /* Common code for all repeated single character type matches. Note that
4147: in UTF-8 mode, '.' matches a character of any length, but for the other
4148: character types, the valid characters are all one-byte long. */
4149:
4150: REPEATTYPE:
4151: ctype = *ecode++; /* Code for the character type */
4152:
4153: #ifdef SUPPORT_UCP
4154: if (ctype == OP_PROP || ctype == OP_NOTPROP)
4155: {
4156: prop_fail_result = ctype == OP_NOTPROP;
4157: prop_type = *ecode++;
4158: prop_value = *ecode++;
4159: }
4160: else prop_type = -1;
4161: #endif
4162:
4163: /* First, ensure the minimum number of matches are present. Use inline
4164: code for maximizing the speed, and do the type test once at the start
4165: (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4166: is tidier. Also separate the UCP code, which can be the same for both UTF-8
4167: and single-bytes. */
4168:
4169: if (min > 0)
4170: {
4171: #ifdef SUPPORT_UCP
4172: if (prop_type >= 0)
4173: {
4174: switch(prop_type)
4175: {
4176: case PT_ANY:
4177: if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4178: for (i = 1; i <= min; i++)
4179: {
4180: if (eptr >= md->end_subject)
4181: {
4182: SCHECK_PARTIAL();
4183: RRETURN(MATCH_NOMATCH);
4184: }
4185: GETCHARINCTEST(c, eptr);
4186: }
4187: break;
4188:
4189: case PT_LAMP:
4190: for (i = 1; i <= min; i++)
4191: {
4192: int chartype;
4193: if (eptr >= md->end_subject)
4194: {
4195: SCHECK_PARTIAL();
4196: RRETURN(MATCH_NOMATCH);
4197: }
4198: GETCHARINCTEST(c, eptr);
4199: chartype = UCD_CHARTYPE(c);
4200: if ((chartype == ucp_Lu ||
4201: chartype == ucp_Ll ||
4202: chartype == ucp_Lt) == prop_fail_result)
4203: RRETURN(MATCH_NOMATCH);
4204: }
4205: break;
4206:
4207: case PT_GC:
4208: for (i = 1; i <= min; i++)
4209: {
4210: if (eptr >= md->end_subject)
4211: {
4212: SCHECK_PARTIAL();
4213: RRETURN(MATCH_NOMATCH);
4214: }
4215: GETCHARINCTEST(c, eptr);
4216: if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4217: RRETURN(MATCH_NOMATCH);
4218: }
4219: break;
4220:
4221: case PT_PC:
4222: for (i = 1; i <= min; i++)
4223: {
4224: if (eptr >= md->end_subject)
4225: {
4226: SCHECK_PARTIAL();
4227: RRETURN(MATCH_NOMATCH);
4228: }
4229: GETCHARINCTEST(c, eptr);
4230: if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4231: RRETURN(MATCH_NOMATCH);
4232: }
4233: break;
4234:
4235: case PT_SC:
4236: for (i = 1; i <= min; i++)
4237: {
4238: if (eptr >= md->end_subject)
4239: {
4240: SCHECK_PARTIAL();
4241: RRETURN(MATCH_NOMATCH);
4242: }
4243: GETCHARINCTEST(c, eptr);
4244: if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4245: RRETURN(MATCH_NOMATCH);
4246: }
4247: break;
4248:
4249: case PT_ALNUM:
4250: for (i = 1; i <= min; i++)
4251: {
4252: int category;
4253: if (eptr >= md->end_subject)
4254: {
4255: SCHECK_PARTIAL();
4256: RRETURN(MATCH_NOMATCH);
4257: }
4258: GETCHARINCTEST(c, eptr);
4259: category = UCD_CATEGORY(c);
4260: if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4261: RRETURN(MATCH_NOMATCH);
4262: }
4263: break;
4264:
4265: case PT_SPACE: /* Perl space */
4266: for (i = 1; i <= min; i++)
4267: {
4268: if (eptr >= md->end_subject)
4269: {
4270: SCHECK_PARTIAL();
4271: RRETURN(MATCH_NOMATCH);
4272: }
4273: GETCHARINCTEST(c, eptr);
4274: if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4275: c == CHAR_FF || c == CHAR_CR)
4276: == prop_fail_result)
4277: RRETURN(MATCH_NOMATCH);
4278: }
4279: break;
4280:
4281: case PT_PXSPACE: /* POSIX space */
4282: for (i = 1; i <= min; i++)
4283: {
4284: if (eptr >= md->end_subject)
4285: {
4286: SCHECK_PARTIAL();
4287: RRETURN(MATCH_NOMATCH);
4288: }
4289: GETCHARINCTEST(c, eptr);
4290: if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4291: c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4292: == prop_fail_result)
4293: RRETURN(MATCH_NOMATCH);
4294: }
4295: break;
4296:
4297: case PT_WORD:
4298: for (i = 1; i <= min; i++)
4299: {
4300: int category;
4301: if (eptr >= md->end_subject)
4302: {
4303: SCHECK_PARTIAL();
4304: RRETURN(MATCH_NOMATCH);
4305: }
4306: GETCHARINCTEST(c, eptr);
4307: category = UCD_CATEGORY(c);
4308: if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4309: == prop_fail_result)
4310: RRETURN(MATCH_NOMATCH);
4311: }
4312: break;
4313:
1.1.1.4 ! misho 4314: case PT_CLIST:
! 4315: for (i = 1; i <= min; i++)
! 4316: {
! 4317: const pcre_uint32 *cp;
! 4318: if (eptr >= md->end_subject)
! 4319: {
! 4320: SCHECK_PARTIAL();
! 4321: RRETURN(MATCH_NOMATCH);
! 4322: }
! 4323: GETCHARINCTEST(c, eptr);
! 4324: cp = PRIV(ucd_caseless_sets) + prop_value;
! 4325: for (;;)
! 4326: {
! 4327: if (c < *cp)
! 4328: { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
! 4329: if (c == *cp++)
! 4330: { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
! 4331: }
! 4332: }
! 4333: break;
! 4334:
! 4335: case PT_UCNC:
! 4336: for (i = 1; i <= min; i++)
! 4337: {
! 4338: if (eptr >= md->end_subject)
! 4339: {
! 4340: SCHECK_PARTIAL();
! 4341: RRETURN(MATCH_NOMATCH);
! 4342: }
! 4343: GETCHARINCTEST(c, eptr);
! 4344: if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
! 4345: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
! 4346: c >= 0xe000) == prop_fail_result)
! 4347: RRETURN(MATCH_NOMATCH);
! 4348: }
! 4349: break;
! 4350:
1.1 misho 4351: /* This should not occur */
4352:
4353: default:
4354: RRETURN(PCRE_ERROR_INTERNAL);
4355: }
4356: }
4357:
4358: /* Match extended Unicode sequences. We will get here only if the
4359: support is in the binary; otherwise a compile-time error occurs. */
4360:
4361: else if (ctype == OP_EXTUNI)
4362: {
4363: for (i = 1; i <= min; i++)
4364: {
4365: if (eptr >= md->end_subject)
4366: {
4367: SCHECK_PARTIAL();
4368: RRETURN(MATCH_NOMATCH);
4369: }
1.1.1.4 ! misho 4370: else
1.1 misho 4371: {
1.1.1.4 ! misho 4372: int lgb, rgb;
! 4373: GETCHARINCTEST(c, eptr);
! 4374: lgb = UCD_GRAPHBREAK(c);
! 4375: while (eptr < md->end_subject)
! 4376: {
! 4377: int len = 1;
! 4378: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
! 4379: rgb = UCD_GRAPHBREAK(c);
! 4380: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
! 4381: lgb = rgb;
! 4382: eptr += len;
! 4383: }
1.1 misho 4384: }
1.1.1.3 misho 4385: CHECK_PARTIAL();
1.1 misho 4386: }
4387: }
4388:
4389: else
4390: #endif /* SUPPORT_UCP */
4391:
4392: /* Handle all other cases when the coding is UTF-8 */
4393:
1.1.1.2 misho 4394: #ifdef SUPPORT_UTF
4395: if (utf) switch(ctype)
1.1 misho 4396: {
4397: case OP_ANY:
4398: for (i = 1; i <= min; i++)
4399: {
4400: if (eptr >= md->end_subject)
4401: {
4402: SCHECK_PARTIAL();
4403: RRETURN(MATCH_NOMATCH);
4404: }
4405: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1.1.1.3 misho 4406: if (md->partial != 0 &&
4407: eptr + 1 >= md->end_subject &&
4408: NLBLOCK->nltype == NLTYPE_FIXED &&
4409: NLBLOCK->nllen == 2 &&
1.1.1.4 ! misho 4410: RAWUCHAR(eptr) == NLBLOCK->nl[0])
1.1.1.3 misho 4411: {
4412: md->hitend = TRUE;
4413: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4414: }
1.1 misho 4415: eptr++;
1.1.1.2 misho 4416: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misho 4417: }
4418: break;
4419:
4420: case OP_ALLANY:
4421: for (i = 1; i <= min; i++)
4422: {
4423: if (eptr >= md->end_subject)
4424: {
4425: SCHECK_PARTIAL();
4426: RRETURN(MATCH_NOMATCH);
4427: }
4428: eptr++;
1.1.1.2 misho 4429: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misho 4430: }
4431: break;
4432:
4433: case OP_ANYBYTE:
4434: if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4435: eptr += min;
4436: break;
4437:
4438: case OP_ANYNL:
4439: for (i = 1; i <= min; i++)
4440: {
4441: if (eptr >= md->end_subject)
4442: {
4443: SCHECK_PARTIAL();
4444: RRETURN(MATCH_NOMATCH);
4445: }
4446: GETCHARINC(c, eptr);
4447: switch(c)
4448: {
4449: default: RRETURN(MATCH_NOMATCH);
4450:
1.1.1.4 ! misho 4451: case CHAR_CR:
! 4452: if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
1.1 misho 4453: break;
4454:
1.1.1.4 ! misho 4455: case CHAR_LF:
1.1 misho 4456: break;
4457:
1.1.1.4 ! misho 4458: case CHAR_VT:
! 4459: case CHAR_FF:
! 4460: case CHAR_NEL:
! 4461: #ifndef EBCDIC
1.1 misho 4462: case 0x2028:
4463: case 0x2029:
1.1.1.4 ! misho 4464: #endif /* Not EBCDIC */
1.1 misho 4465: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4466: break;
4467: }
4468: }
4469: break;
4470:
4471: case OP_NOT_HSPACE:
4472: for (i = 1; i <= min; i++)
4473: {
4474: if (eptr >= md->end_subject)
4475: {
4476: SCHECK_PARTIAL();
4477: RRETURN(MATCH_NOMATCH);
4478: }
4479: GETCHARINC(c, eptr);
4480: switch(c)
4481: {
1.1.1.4 ! misho 4482: HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
1.1 misho 4483: default: break;
4484: }
4485: }
4486: break;
4487:
4488: case OP_HSPACE:
4489: for (i = 1; i <= min; i++)
4490: {
4491: if (eptr >= md->end_subject)
4492: {
4493: SCHECK_PARTIAL();
4494: RRETURN(MATCH_NOMATCH);
4495: }
4496: GETCHARINC(c, eptr);
4497: switch(c)
4498: {
1.1.1.4 ! misho 4499: HSPACE_CASES: break; /* Byte and multibyte cases */
1.1 misho 4500: default: RRETURN(MATCH_NOMATCH);
4501: }
4502: }
4503: break;
4504:
4505: case OP_NOT_VSPACE:
4506: for (i = 1; i <= min; i++)
4507: {
4508: if (eptr >= md->end_subject)
4509: {
4510: SCHECK_PARTIAL();
4511: RRETURN(MATCH_NOMATCH);
4512: }
4513: GETCHARINC(c, eptr);
4514: switch(c)
4515: {
1.1.1.4 ! misho 4516: VSPACE_CASES: RRETURN(MATCH_NOMATCH);
1.1 misho 4517: default: break;
4518: }
4519: }
4520: break;
4521:
4522: case OP_VSPACE:
4523: for (i = 1; i <= min; i++)
4524: {
4525: if (eptr >= md->end_subject)
4526: {
4527: SCHECK_PARTIAL();
4528: RRETURN(MATCH_NOMATCH);
4529: }
4530: GETCHARINC(c, eptr);
4531: switch(c)
4532: {
1.1.1.4 ! misho 4533: VSPACE_CASES: break;
1.1 misho 4534: default: RRETURN(MATCH_NOMATCH);
4535: }
4536: }
4537: break;
4538:
4539: case OP_NOT_DIGIT:
4540: for (i = 1; i <= min; i++)
4541: {
4542: if (eptr >= md->end_subject)
4543: {
4544: SCHECK_PARTIAL();
4545: RRETURN(MATCH_NOMATCH);
4546: }
4547: GETCHARINC(c, eptr);
4548: if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4549: RRETURN(MATCH_NOMATCH);
4550: }
4551: break;
4552:
4553: case OP_DIGIT:
4554: for (i = 1; i <= min; i++)
4555: {
1.1.1.4 ! misho 4556: pcre_uint32 cc;
1.1 misho 4557: if (eptr >= md->end_subject)
4558: {
4559: SCHECK_PARTIAL();
4560: RRETURN(MATCH_NOMATCH);
4561: }
1.1.1.4 ! misho 4562: cc = RAWUCHAR(eptr);
! 4563: if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
1.1 misho 4564: RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 4565: eptr++;
1.1 misho 4566: /* No need to skip more bytes - we know it's a 1-byte character */
4567: }
4568: break;
4569:
4570: case OP_NOT_WHITESPACE:
4571: for (i = 1; i <= min; i++)
4572: {
1.1.1.4 ! misho 4573: pcre_uint32 cc;
1.1 misho 4574: if (eptr >= md->end_subject)
4575: {
4576: SCHECK_PARTIAL();
4577: RRETURN(MATCH_NOMATCH);
4578: }
1.1.1.4 ! misho 4579: cc = RAWUCHAR(eptr);
! 4580: if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
1.1 misho 4581: RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 4582: eptr++;
4583: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misho 4584: }
4585: break;
4586:
4587: case OP_WHITESPACE:
4588: for (i = 1; i <= min; i++)
4589: {
1.1.1.4 ! misho 4590: pcre_uint32 cc;
1.1 misho 4591: if (eptr >= md->end_subject)
4592: {
4593: SCHECK_PARTIAL();
4594: RRETURN(MATCH_NOMATCH);
4595: }
1.1.1.4 ! misho 4596: cc = RAWUCHAR(eptr);
! 4597: if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
1.1 misho 4598: RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 4599: eptr++;
1.1 misho 4600: /* No need to skip more bytes - we know it's a 1-byte character */
4601: }
4602: break;
4603:
4604: case OP_NOT_WORDCHAR:
4605: for (i = 1; i <= min; i++)
4606: {
1.1.1.4 ! misho 4607: pcre_uint32 cc;
1.1 misho 4608: if (eptr >= md->end_subject)
4609: {
4610: SCHECK_PARTIAL();
4611: RRETURN(MATCH_NOMATCH);
4612: }
1.1.1.4 ! misho 4613: cc = RAWUCHAR(eptr);
! 4614: if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
1.1 misho 4615: RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 4616: eptr++;
4617: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misho 4618: }
4619: break;
4620:
4621: case OP_WORDCHAR:
4622: for (i = 1; i <= min; i++)
4623: {
1.1.1.4 ! misho 4624: pcre_uint32 cc;
1.1 misho 4625: if (eptr >= md->end_subject)
4626: {
4627: SCHECK_PARTIAL();
4628: RRETURN(MATCH_NOMATCH);
4629: }
1.1.1.4 ! misho 4630: cc = RAWUCHAR(eptr);
! 4631: if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
1.1 misho 4632: RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 4633: eptr++;
1.1 misho 4634: /* No need to skip more bytes - we know it's a 1-byte character */
4635: }
4636: break;
4637:
4638: default:
4639: RRETURN(PCRE_ERROR_INTERNAL);
4640: } /* End switch(ctype) */
4641:
4642: else
1.1.1.2 misho 4643: #endif /* SUPPORT_UTF */
1.1 misho 4644:
4645: /* Code for the non-UTF-8 case for minimum matching of operators other
4646: than OP_PROP and OP_NOTPROP. */
4647:
4648: switch(ctype)
4649: {
4650: case OP_ANY:
4651: for (i = 1; i <= min; i++)
4652: {
4653: if (eptr >= md->end_subject)
4654: {
4655: SCHECK_PARTIAL();
4656: RRETURN(MATCH_NOMATCH);
4657: }
4658: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1.1.1.3 misho 4659: if (md->partial != 0 &&
4660: eptr + 1 >= md->end_subject &&
4661: NLBLOCK->nltype == NLTYPE_FIXED &&
4662: NLBLOCK->nllen == 2 &&
4663: *eptr == NLBLOCK->nl[0])
4664: {
4665: md->hitend = TRUE;
4666: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4667: }
1.1 misho 4668: eptr++;
4669: }
4670: break;
4671:
4672: case OP_ALLANY:
4673: if (eptr > md->end_subject - min)
4674: {
4675: SCHECK_PARTIAL();
4676: RRETURN(MATCH_NOMATCH);
4677: }
4678: eptr += min;
4679: break;
4680:
4681: case OP_ANYBYTE:
4682: if (eptr > md->end_subject - min)
4683: {
4684: SCHECK_PARTIAL();
4685: RRETURN(MATCH_NOMATCH);
4686: }
4687: eptr += min;
4688: break;
4689:
4690: case OP_ANYNL:
4691: for (i = 1; i <= min; i++)
4692: {
4693: if (eptr >= md->end_subject)
4694: {
4695: SCHECK_PARTIAL();
4696: RRETURN(MATCH_NOMATCH);
4697: }
4698: switch(*eptr++)
4699: {
4700: default: RRETURN(MATCH_NOMATCH);
4701:
1.1.1.4 ! misho 4702: case CHAR_CR:
! 4703: if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
1.1 misho 4704: break;
4705:
1.1.1.4 ! misho 4706: case CHAR_LF:
1.1 misho 4707: break;
4708:
1.1.1.4 ! misho 4709: case CHAR_VT:
! 4710: case CHAR_FF:
! 4711: case CHAR_NEL:
! 4712: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1.1.1.2 misho 4713: case 0x2028:
4714: case 0x2029:
4715: #endif
1.1 misho 4716: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4717: break;
4718: }
4719: }
4720: break;
4721:
4722: case OP_NOT_HSPACE:
4723: for (i = 1; i <= min; i++)
4724: {
4725: if (eptr >= md->end_subject)
4726: {
4727: SCHECK_PARTIAL();
4728: RRETURN(MATCH_NOMATCH);
4729: }
4730: switch(*eptr++)
4731: {
4732: default: break;
1.1.1.4 ! misho 4733: HSPACE_BYTE_CASES:
! 4734: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
! 4735: HSPACE_MULTIBYTE_CASES:
1.1.1.2 misho 4736: #endif
1.1 misho 4737: RRETURN(MATCH_NOMATCH);
4738: }
4739: }
4740: break;
4741:
4742: case OP_HSPACE:
4743: for (i = 1; i <= min; i++)
4744: {
4745: if (eptr >= md->end_subject)
4746: {
4747: SCHECK_PARTIAL();
4748: RRETURN(MATCH_NOMATCH);
4749: }
4750: switch(*eptr++)
4751: {
4752: default: RRETURN(MATCH_NOMATCH);
1.1.1.4 ! misho 4753: HSPACE_BYTE_CASES:
! 4754: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
! 4755: HSPACE_MULTIBYTE_CASES:
1.1.1.2 misho 4756: #endif
1.1 misho 4757: break;
4758: }
4759: }
4760: break;
4761:
4762: case OP_NOT_VSPACE:
4763: for (i = 1; i <= min; i++)
4764: {
4765: if (eptr >= md->end_subject)
4766: {
4767: SCHECK_PARTIAL();
4768: RRETURN(MATCH_NOMATCH);
4769: }
4770: switch(*eptr++)
4771: {
1.1.1.4 ! misho 4772: VSPACE_BYTE_CASES:
! 4773: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
! 4774: VSPACE_MULTIBYTE_CASES:
1.1.1.2 misho 4775: #endif
1.1 misho 4776: RRETURN(MATCH_NOMATCH);
1.1.1.4 ! misho 4777: default: break;
1.1 misho 4778: }
4779: }
4780: break;
4781:
4782: case OP_VSPACE:
4783: for (i = 1; i <= min; i++)
4784: {
4785: if (eptr >= md->end_subject)
4786: {
4787: SCHECK_PARTIAL();
4788: RRETURN(MATCH_NOMATCH);
4789: }
4790: switch(*eptr++)
4791: {
4792: default: RRETURN(MATCH_NOMATCH);
1.1.1.4 ! misho 4793: VSPACE_BYTE_CASES:
! 4794: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
! 4795: VSPACE_MULTIBYTE_CASES:
1.1.1.2 misho 4796: #endif
1.1 misho 4797: break;
4798: }
4799: }
4800: break;
4801:
4802: case OP_NOT_DIGIT:
4803: for (i = 1; i <= min; i++)
4804: {
4805: if (eptr >= md->end_subject)
4806: {
4807: SCHECK_PARTIAL();
4808: RRETURN(MATCH_NOMATCH);
4809: }
1.1.1.2 misho 4810: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4811: RRETURN(MATCH_NOMATCH);
4812: eptr++;
1.1 misho 4813: }
4814: break;
4815:
4816: case OP_DIGIT:
4817: for (i = 1; i <= min; i++)
4818: {
4819: if (eptr >= md->end_subject)
4820: {
4821: SCHECK_PARTIAL();
4822: RRETURN(MATCH_NOMATCH);
4823: }
1.1.1.2 misho 4824: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4825: RRETURN(MATCH_NOMATCH);
4826: eptr++;
1.1 misho 4827: }
4828: break;
4829:
4830: case OP_NOT_WHITESPACE:
4831: for (i = 1; i <= min; i++)
4832: {
4833: if (eptr >= md->end_subject)
4834: {
4835: SCHECK_PARTIAL();
4836: RRETURN(MATCH_NOMATCH);
4837: }
1.1.1.2 misho 4838: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4839: RRETURN(MATCH_NOMATCH);
4840: eptr++;
1.1 misho 4841: }
4842: break;
4843:
4844: case OP_WHITESPACE:
4845: for (i = 1; i <= min; i++)
4846: {
4847: if (eptr >= md->end_subject)
4848: {
4849: SCHECK_PARTIAL();
4850: RRETURN(MATCH_NOMATCH);
4851: }
1.1.1.2 misho 4852: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4853: RRETURN(MATCH_NOMATCH);
4854: eptr++;
1.1 misho 4855: }
4856: break;
4857:
4858: case OP_NOT_WORDCHAR:
4859: for (i = 1; i <= min; i++)
4860: {
4861: if (eptr >= md->end_subject)
4862: {
4863: SCHECK_PARTIAL();
4864: RRETURN(MATCH_NOMATCH);
4865: }
1.1.1.2 misho 4866: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
1.1 misho 4867: RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 4868: eptr++;
1.1 misho 4869: }
4870: break;
4871:
4872: case OP_WORDCHAR:
4873: for (i = 1; i <= min; i++)
4874: {
4875: if (eptr >= md->end_subject)
4876: {
4877: SCHECK_PARTIAL();
4878: RRETURN(MATCH_NOMATCH);
4879: }
1.1.1.2 misho 4880: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
1.1 misho 4881: RRETURN(MATCH_NOMATCH);
1.1.1.2 misho 4882: eptr++;
1.1 misho 4883: }
4884: break;
4885:
4886: default:
4887: RRETURN(PCRE_ERROR_INTERNAL);
4888: }
4889: }
4890:
4891: /* If min = max, continue at the same level without recursing */
4892:
4893: if (min == max) continue;
4894:
4895: /* If minimizing, we have to test the rest of the pattern before each
4896: subsequent match. Again, separate the UTF-8 case for speed, and also
4897: separate the UCP cases. */
4898:
4899: if (minimize)
4900: {
4901: #ifdef SUPPORT_UCP
4902: if (prop_type >= 0)
4903: {
4904: switch(prop_type)
4905: {
4906: case PT_ANY:
4907: for (fi = min;; fi++)
4908: {
4909: RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4910: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4911: if (fi >= max) RRETURN(MATCH_NOMATCH);
4912: if (eptr >= md->end_subject)
4913: {
4914: SCHECK_PARTIAL();
4915: RRETURN(MATCH_NOMATCH);
4916: }
4917: GETCHARINCTEST(c, eptr);
4918: if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4919: }
4920: /* Control never gets here */
4921:
4922: case PT_LAMP:
4923: for (fi = min;; fi++)
4924: {
4925: int chartype;
4926: RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4927: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4928: if (fi >= max) RRETURN(MATCH_NOMATCH);
4929: if (eptr >= md->end_subject)
4930: {
4931: SCHECK_PARTIAL();
4932: RRETURN(MATCH_NOMATCH);
4933: }
4934: GETCHARINCTEST(c, eptr);
4935: chartype = UCD_CHARTYPE(c);
4936: if ((chartype == ucp_Lu ||
4937: chartype == ucp_Ll ||
4938: chartype == ucp_Lt) == prop_fail_result)
4939: RRETURN(MATCH_NOMATCH);
4940: }
4941: /* Control never gets here */
4942:
4943: case PT_GC:
4944: for (fi = min;; fi++)
4945: {
4946: RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4947: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4948: if (fi >= max) RRETURN(MATCH_NOMATCH);
4949: if (eptr >= md->end_subject)
4950: {
4951: SCHECK_PARTIAL();
4952: RRETURN(MATCH_NOMATCH);
4953: }
4954: GETCHARINCTEST(c, eptr);
4955: if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4956: RRETURN(MATCH_NOMATCH);
4957: }
4958: /* Control never gets here */
4959:
4960: case PT_PC:
4961: for (fi = min;; fi++)
4962: {
4963: RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4964: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4965: if (fi >= max) RRETURN(MATCH_NOMATCH);
4966: if (eptr >= md->end_subject)
4967: {
4968: SCHECK_PARTIAL();
4969: RRETURN(MATCH_NOMATCH);
4970: }
4971: GETCHARINCTEST(c, eptr);
4972: if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4973: RRETURN(MATCH_NOMATCH);
4974: }
4975: /* Control never gets here */
4976:
4977: case PT_SC:
4978: for (fi = min;; fi++)
4979: {
4980: RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4981: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4982: if (fi >= max) RRETURN(MATCH_NOMATCH);
4983: if (eptr >= md->end_subject)
4984: {
4985: SCHECK_PARTIAL();
4986: RRETURN(MATCH_NOMATCH);
4987: }
4988: GETCHARINCTEST(c, eptr);
4989: if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4990: RRETURN(MATCH_NOMATCH);
4991: }
4992: /* Control never gets here */
4993:
4994: case PT_ALNUM:
4995: for (fi = min;; fi++)
4996: {
4997: int category;
4998: RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4999: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5000: if (fi >= max) RRETURN(MATCH_NOMATCH);
5001: if (eptr >= md->end_subject)
5002: {
5003: SCHECK_PARTIAL();
5004: RRETURN(MATCH_NOMATCH);
5005: }
5006: GETCHARINCTEST(c, eptr);
5007: category = UCD_CATEGORY(c);
5008: if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5009: RRETURN(MATCH_NOMATCH);
5010: }
5011: /* Control never gets here */
5012:
5013: case PT_SPACE: /* Perl space */
5014: for (fi = min;; fi++)
5015: {
5016: RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
5017: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5018: if (fi >= max) RRETURN(MATCH_NOMATCH);
5019: if (eptr >= md->end_subject)
5020: {
5021: SCHECK_PARTIAL();
5022: RRETURN(MATCH_NOMATCH);
5023: }
5024: GETCHARINCTEST(c, eptr);
5025: if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5026: c == CHAR_FF || c == CHAR_CR)
5027: == prop_fail_result)
5028: RRETURN(MATCH_NOMATCH);
5029: }
5030: /* Control never gets here */
5031:
5032: case PT_PXSPACE: /* POSIX space */
5033: for (fi = min;; fi++)
5034: {
5035: RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
5036: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5037: if (fi >= max) RRETURN(MATCH_NOMATCH);
5038: if (eptr >= md->end_subject)
5039: {
5040: SCHECK_PARTIAL();
5041: RRETURN(MATCH_NOMATCH);
5042: }
5043: GETCHARINCTEST(c, eptr);
5044: if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5045: c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5046: == prop_fail_result)
5047: RRETURN(MATCH_NOMATCH);
5048: }
5049: /* Control never gets here */
5050:
5051: case PT_WORD:
5052: for (fi = min;; fi++)
5053: {
5054: int category;
5055: RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
5056: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5057: if (fi >= max) RRETURN(MATCH_NOMATCH);
5058: if (eptr >= md->end_subject)
5059: {
5060: SCHECK_PARTIAL();
5061: RRETURN(MATCH_NOMATCH);
5062: }
5063: GETCHARINCTEST(c, eptr);
5064: category = UCD_CATEGORY(c);
5065: if ((category == ucp_L ||
5066: category == ucp_N ||
5067: c == CHAR_UNDERSCORE)
5068: == prop_fail_result)
5069: RRETURN(MATCH_NOMATCH);
5070: }
5071: /* Control never gets here */
5072:
1.1.1.4 ! misho 5073: case PT_CLIST:
! 5074: for (fi = min;; fi++)
! 5075: {
! 5076: const pcre_uint32 *cp;
! 5077: RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
! 5078: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 5079: if (fi >= max) RRETURN(MATCH_NOMATCH);
! 5080: if (eptr >= md->end_subject)
! 5081: {
! 5082: SCHECK_PARTIAL();
! 5083: RRETURN(MATCH_NOMATCH);
! 5084: }
! 5085: GETCHARINCTEST(c, eptr);
! 5086: cp = PRIV(ucd_caseless_sets) + prop_value;
! 5087: for (;;)
! 5088: {
! 5089: if (c < *cp)
! 5090: { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
! 5091: if (c == *cp++)
! 5092: { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
! 5093: }
! 5094: }
! 5095: /* Control never gets here */
1.1 misho 5096:
1.1.1.4 ! misho 5097: case PT_UCNC:
! 5098: for (fi = min;; fi++)
! 5099: {
! 5100: RMATCH(eptr, ecode, offset_top, md, eptrb, RM68);
! 5101: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 5102: if (fi >= max) RRETURN(MATCH_NOMATCH);
! 5103: if (eptr >= md->end_subject)
! 5104: {
! 5105: SCHECK_PARTIAL();
! 5106: RRETURN(MATCH_NOMATCH);
! 5107: }
! 5108: GETCHARINCTEST(c, eptr);
! 5109: if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
! 5110: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
! 5111: c >= 0xe000) == prop_fail_result)
! 5112: RRETURN(MATCH_NOMATCH);
! 5113: }
! 5114: /* Control never gets here */
! 5115:
! 5116: /* This should never occur */
1.1 misho 5117: default:
5118: RRETURN(PCRE_ERROR_INTERNAL);
5119: }
5120: }
5121:
5122: /* Match extended Unicode sequences. We will get here only if the
5123: support is in the binary; otherwise a compile-time error occurs. */
5124:
5125: else if (ctype == OP_EXTUNI)
5126: {
5127: for (fi = min;; fi++)
5128: {
5129: RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5130: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5131: if (fi >= max) RRETURN(MATCH_NOMATCH);
5132: if (eptr >= md->end_subject)
5133: {
5134: SCHECK_PARTIAL();
5135: RRETURN(MATCH_NOMATCH);
5136: }
1.1.1.4 ! misho 5137: else
1.1 misho 5138: {
1.1.1.4 ! misho 5139: int lgb, rgb;
! 5140: GETCHARINCTEST(c, eptr);
! 5141: lgb = UCD_GRAPHBREAK(c);
! 5142: while (eptr < md->end_subject)
! 5143: {
! 5144: int len = 1;
! 5145: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
! 5146: rgb = UCD_GRAPHBREAK(c);
! 5147: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
! 5148: lgb = rgb;
! 5149: eptr += len;
! 5150: }
1.1 misho 5151: }
1.1.1.3 misho 5152: CHECK_PARTIAL();
1.1 misho 5153: }
5154: }
5155: else
5156: #endif /* SUPPORT_UCP */
5157:
1.1.1.2 misho 5158: #ifdef SUPPORT_UTF
5159: if (utf)
1.1 misho 5160: {
5161: for (fi = min;; fi++)
5162: {
5163: RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5164: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5165: if (fi >= max) RRETURN(MATCH_NOMATCH);
5166: if (eptr >= md->end_subject)
5167: {
5168: SCHECK_PARTIAL();
5169: RRETURN(MATCH_NOMATCH);
5170: }
5171: if (ctype == OP_ANY && IS_NEWLINE(eptr))
5172: RRETURN(MATCH_NOMATCH);
5173: GETCHARINC(c, eptr);
5174: switch(ctype)
5175: {
1.1.1.3 misho 5176: case OP_ANY: /* This is the non-NL case */
5177: if (md->partial != 0 && /* Take care with CRLF partial */
5178: eptr >= md->end_subject &&
5179: NLBLOCK->nltype == NLTYPE_FIXED &&
5180: NLBLOCK->nllen == 2 &&
5181: c == NLBLOCK->nl[0])
5182: {
5183: md->hitend = TRUE;
5184: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5185: }
5186: break;
5187:
1.1 misho 5188: case OP_ALLANY:
5189: case OP_ANYBYTE:
5190: break;
5191:
5192: case OP_ANYNL:
5193: switch(c)
5194: {
5195: default: RRETURN(MATCH_NOMATCH);
1.1.1.4 ! misho 5196: case CHAR_CR:
! 5197: if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
1.1 misho 5198: break;
1.1.1.4 ! misho 5199:
! 5200: case CHAR_LF:
1.1 misho 5201: break;
5202:
1.1.1.4 ! misho 5203: case CHAR_VT:
! 5204: case CHAR_FF:
! 5205: case CHAR_NEL:
! 5206: #ifndef EBCDIC
1.1 misho 5207: case 0x2028:
5208: case 0x2029:
1.1.1.4 ! misho 5209: #endif /* Not EBCDIC */
1.1 misho 5210: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5211: break;
5212: }
5213: break;
5214:
5215: case OP_NOT_HSPACE:
5216: switch(c)
5217: {
1.1.1.4 ! misho 5218: HSPACE_CASES: RRETURN(MATCH_NOMATCH);
1.1 misho 5219: default: break;
5220: }
5221: break;
5222:
5223: case OP_HSPACE:
5224: switch(c)
5225: {
1.1.1.4 ! misho 5226: HSPACE_CASES: break;
1.1 misho 5227: default: RRETURN(MATCH_NOMATCH);
5228: }
5229: break;
5230:
5231: case OP_NOT_VSPACE:
5232: switch(c)
5233: {
1.1.1.4 ! misho 5234: VSPACE_CASES: RRETURN(MATCH_NOMATCH);
1.1 misho 5235: default: break;
5236: }
5237: break;
5238:
5239: case OP_VSPACE:
5240: switch(c)
5241: {
1.1.1.4 ! misho 5242: VSPACE_CASES: break;
1.1 misho 5243: default: RRETURN(MATCH_NOMATCH);
5244: }
5245: break;
5246:
5247: case OP_NOT_DIGIT:
5248: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5249: RRETURN(MATCH_NOMATCH);
5250: break;
5251:
5252: case OP_DIGIT:
5253: if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5254: RRETURN(MATCH_NOMATCH);
5255: break;
5256:
5257: case OP_NOT_WHITESPACE:
5258: if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5259: RRETURN(MATCH_NOMATCH);
5260: break;
5261:
5262: case OP_WHITESPACE:
1.1.1.2 misho 5263: if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
1.1 misho 5264: RRETURN(MATCH_NOMATCH);
5265: break;
5266:
5267: case OP_NOT_WORDCHAR:
5268: if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5269: RRETURN(MATCH_NOMATCH);
5270: break;
5271:
5272: case OP_WORDCHAR:
5273: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5274: RRETURN(MATCH_NOMATCH);
5275: break;
5276:
5277: default:
5278: RRETURN(PCRE_ERROR_INTERNAL);
5279: }
5280: }
5281: }
5282: else
5283: #endif
1.1.1.2 misho 5284: /* Not UTF mode */
1.1 misho 5285: {
5286: for (fi = min;; fi++)
5287: {
5288: RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5289: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5290: if (fi >= max) RRETURN(MATCH_NOMATCH);
5291: if (eptr >= md->end_subject)
5292: {
5293: SCHECK_PARTIAL();
5294: RRETURN(MATCH_NOMATCH);
5295: }
5296: if (ctype == OP_ANY && IS_NEWLINE(eptr))
5297: RRETURN(MATCH_NOMATCH);
5298: c = *eptr++;
5299: switch(ctype)
5300: {
1.1.1.3 misho 5301: case OP_ANY: /* This is the non-NL case */
5302: if (md->partial != 0 && /* Take care with CRLF partial */
5303: eptr >= md->end_subject &&
5304: NLBLOCK->nltype == NLTYPE_FIXED &&
5305: NLBLOCK->nllen == 2 &&
5306: c == NLBLOCK->nl[0])
5307: {
5308: md->hitend = TRUE;
5309: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5310: }
5311: break;
5312:
1.1 misho 5313: case OP_ALLANY:
5314: case OP_ANYBYTE:
5315: break;
5316:
5317: case OP_ANYNL:
5318: switch(c)
5319: {
5320: default: RRETURN(MATCH_NOMATCH);
1.1.1.4 ! misho 5321: case CHAR_CR:
! 5322: if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
1.1 misho 5323: break;
5324:
1.1.1.4 ! misho 5325: case CHAR_LF:
1.1 misho 5326: break;
5327:
1.1.1.4 ! misho 5328: case CHAR_VT:
! 5329: case CHAR_FF:
! 5330: case CHAR_NEL:
! 5331: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1.1.1.2 misho 5332: case 0x2028:
5333: case 0x2029:
5334: #endif
1.1 misho 5335: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5336: break;
5337: }
5338: break;
5339:
5340: case OP_NOT_HSPACE:
5341: switch(c)
5342: {
5343: default: break;
1.1.1.4 ! misho 5344: HSPACE_BYTE_CASES:
! 5345: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
! 5346: HSPACE_MULTIBYTE_CASES:
1.1.1.2 misho 5347: #endif
1.1 misho 5348: RRETURN(MATCH_NOMATCH);
5349: }
5350: break;
5351:
5352: case OP_HSPACE:
5353: switch(c)
5354: {
5355: default: RRETURN(MATCH_NOMATCH);
1.1.1.4 ! misho 5356: HSPACE_BYTE_CASES:
! 5357: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
! 5358: HSPACE_MULTIBYTE_CASES:
1.1.1.2 misho 5359: #endif
1.1 misho 5360: break;
5361: }
5362: break;
5363:
5364: case OP_NOT_VSPACE:
5365: switch(c)
5366: {
5367: default: break;
1.1.1.4 ! misho 5368: VSPACE_BYTE_CASES:
! 5369: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
! 5370: VSPACE_MULTIBYTE_CASES:
1.1.1.2 misho 5371: #endif
1.1 misho 5372: RRETURN(MATCH_NOMATCH);
5373: }
5374: break;
5375:
5376: case OP_VSPACE:
5377: switch(c)
5378: {
5379: default: RRETURN(MATCH_NOMATCH);
1.1.1.4 ! misho 5380: VSPACE_BYTE_CASES:
! 5381: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
! 5382: VSPACE_MULTIBYTE_CASES:
1.1.1.2 misho 5383: #endif
1.1 misho 5384: break;
5385: }
5386: break;
5387:
5388: case OP_NOT_DIGIT:
1.1.1.2 misho 5389: if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
1.1 misho 5390: break;
5391:
5392: case OP_DIGIT:
1.1.1.2 misho 5393: if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
1.1 misho 5394: break;
5395:
5396: case OP_NOT_WHITESPACE:
1.1.1.2 misho 5397: if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
1.1 misho 5398: break;
5399:
5400: case OP_WHITESPACE:
1.1.1.2 misho 5401: if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
1.1 misho 5402: break;
5403:
5404: case OP_NOT_WORDCHAR:
1.1.1.2 misho 5405: if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
1.1 misho 5406: break;
5407:
5408: case OP_WORDCHAR:
1.1.1.2 misho 5409: if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
1.1 misho 5410: break;
5411:
5412: default:
5413: RRETURN(PCRE_ERROR_INTERNAL);
5414: }
5415: }
5416: }
5417: /* Control never gets here */
5418: }
5419:
5420: /* If maximizing, it is worth using inline code for speed, doing the type
5421: test once at the start (i.e. keep it out of the loop). Again, keep the
5422: UTF-8 and UCP stuff separate. */
5423:
5424: else
5425: {
5426: pp = eptr; /* Remember where we started */
5427:
5428: #ifdef SUPPORT_UCP
5429: if (prop_type >= 0)
5430: {
5431: switch(prop_type)
5432: {
5433: case PT_ANY:
5434: for (i = min; i < max; i++)
5435: {
5436: int len = 1;
5437: if (eptr >= md->end_subject)
5438: {
5439: SCHECK_PARTIAL();
5440: break;
5441: }
5442: GETCHARLENTEST(c, eptr, len);
5443: if (prop_fail_result) break;
5444: eptr+= len;
5445: }
5446: break;
5447:
5448: case PT_LAMP:
5449: for (i = min; i < max; i++)
5450: {
5451: int chartype;
5452: int len = 1;
5453: if (eptr >= md->end_subject)
5454: {
5455: SCHECK_PARTIAL();
5456: break;
5457: }
5458: GETCHARLENTEST(c, eptr, len);
5459: chartype = UCD_CHARTYPE(c);
5460: if ((chartype == ucp_Lu ||
5461: chartype == ucp_Ll ||
5462: chartype == ucp_Lt) == prop_fail_result)
5463: break;
5464: eptr+= len;
5465: }
5466: break;
5467:
5468: case PT_GC:
5469: for (i = min; i < max; i++)
5470: {
5471: int len = 1;
5472: if (eptr >= md->end_subject)
5473: {
5474: SCHECK_PARTIAL();
5475: break;
5476: }
5477: GETCHARLENTEST(c, eptr, len);
5478: if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5479: eptr+= len;
5480: }
5481: break;
5482:
5483: case PT_PC:
5484: for (i = min; i < max; i++)
5485: {
5486: int len = 1;
5487: if (eptr >= md->end_subject)
5488: {
5489: SCHECK_PARTIAL();
5490: break;
5491: }
5492: GETCHARLENTEST(c, eptr, len);
5493: if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5494: eptr+= len;
5495: }
5496: break;
5497:
5498: case PT_SC:
5499: for (i = min; i < max; i++)
5500: {
5501: int len = 1;
5502: if (eptr >= md->end_subject)
5503: {
5504: SCHECK_PARTIAL();
5505: break;
5506: }
5507: GETCHARLENTEST(c, eptr, len);
5508: if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5509: eptr+= len;
5510: }
5511: break;
5512:
5513: case PT_ALNUM:
5514: for (i = min; i < max; i++)
5515: {
5516: int category;
5517: int len = 1;
5518: if (eptr >= md->end_subject)
5519: {
5520: SCHECK_PARTIAL();
5521: break;
5522: }
5523: GETCHARLENTEST(c, eptr, len);
5524: category = UCD_CATEGORY(c);
5525: if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5526: break;
5527: eptr+= len;
5528: }
5529: break;
5530:
5531: case PT_SPACE: /* Perl space */
5532: for (i = min; i < max; i++)
5533: {
5534: int len = 1;
5535: if (eptr >= md->end_subject)
5536: {
5537: SCHECK_PARTIAL();
5538: break;
5539: }
5540: GETCHARLENTEST(c, eptr, len);
5541: if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5542: c == CHAR_FF || c == CHAR_CR)
5543: == prop_fail_result)
5544: break;
5545: eptr+= len;
5546: }
5547: break;
5548:
5549: case PT_PXSPACE: /* POSIX space */
5550: for (i = min; i < max; i++)
5551: {
5552: int len = 1;
5553: if (eptr >= md->end_subject)
5554: {
5555: SCHECK_PARTIAL();
5556: break;
5557: }
5558: GETCHARLENTEST(c, eptr, len);
5559: if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5560: c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5561: == prop_fail_result)
5562: break;
5563: eptr+= len;
5564: }
5565: break;
5566:
5567: case PT_WORD:
5568: for (i = min; i < max; i++)
5569: {
5570: int category;
5571: int len = 1;
5572: if (eptr >= md->end_subject)
5573: {
5574: SCHECK_PARTIAL();
5575: break;
5576: }
5577: GETCHARLENTEST(c, eptr, len);
5578: category = UCD_CATEGORY(c);
5579: if ((category == ucp_L || category == ucp_N ||
5580: c == CHAR_UNDERSCORE) == prop_fail_result)
5581: break;
5582: eptr+= len;
5583: }
5584: break;
5585:
1.1.1.4 ! misho 5586: case PT_CLIST:
! 5587: for (i = min; i < max; i++)
! 5588: {
! 5589: const pcre_uint32 *cp;
! 5590: int len = 1;
! 5591: if (eptr >= md->end_subject)
! 5592: {
! 5593: SCHECK_PARTIAL();
! 5594: break;
! 5595: }
! 5596: GETCHARLENTEST(c, eptr, len);
! 5597: cp = PRIV(ucd_caseless_sets) + prop_value;
! 5598: for (;;)
! 5599: {
! 5600: if (c < *cp)
! 5601: { if (prop_fail_result) break; else goto GOT_MAX; }
! 5602: if (c == *cp++)
! 5603: { if (prop_fail_result) goto GOT_MAX; else break; }
! 5604: }
! 5605: eptr += len;
! 5606: }
! 5607: GOT_MAX:
! 5608: break;
! 5609:
! 5610: case PT_UCNC:
! 5611: for (i = min; i < max; i++)
! 5612: {
! 5613: int len = 1;
! 5614: if (eptr >= md->end_subject)
! 5615: {
! 5616: SCHECK_PARTIAL();
! 5617: break;
! 5618: }
! 5619: GETCHARLENTEST(c, eptr, len);
! 5620: if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
! 5621: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
! 5622: c >= 0xe000) == prop_fail_result)
! 5623: break;
! 5624: eptr += len;
! 5625: }
! 5626: break;
! 5627:
1.1 misho 5628: default:
5629: RRETURN(PCRE_ERROR_INTERNAL);
5630: }
5631:
5632: /* eptr is now past the end of the maximum run */
5633:
1.1.1.4 ! misho 5634: if (possessive) continue; /* No backtracking */
1.1 misho 5635: for(;;)
5636: {
1.1.1.4 ! misho 5637: if (eptr == pp) goto TAIL_RECURSE;
1.1 misho 5638: RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5639: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1.1.4 ! misho 5640: eptr--;
1.1.1.2 misho 5641: if (utf) BACKCHAR(eptr);
1.1 misho 5642: }
5643: }
5644:
5645: /* Match extended Unicode sequences. We will get here only if the
5646: support is in the binary; otherwise a compile-time error occurs. */
5647:
5648: else if (ctype == OP_EXTUNI)
5649: {
5650: for (i = min; i < max; i++)
5651: {
5652: if (eptr >= md->end_subject)
5653: {
5654: SCHECK_PARTIAL();
5655: break;
5656: }
1.1.1.4 ! misho 5657: else
1.1 misho 5658: {
1.1.1.4 ! misho 5659: int lgb, rgb;
! 5660: GETCHARINCTEST(c, eptr);
! 5661: lgb = UCD_GRAPHBREAK(c);
! 5662: while (eptr < md->end_subject)
! 5663: {
! 5664: int len = 1;
! 5665: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
! 5666: rgb = UCD_GRAPHBREAK(c);
! 5667: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
! 5668: lgb = rgb;
! 5669: eptr += len;
! 5670: }
1.1 misho 5671: }
1.1.1.3 misho 5672: CHECK_PARTIAL();
1.1 misho 5673: }
5674:
5675: /* eptr is now past the end of the maximum run */
5676:
1.1.1.4 ! misho 5677: if (possessive) continue; /* No backtracking */
1.1 misho 5678: for(;;)
5679: {
1.1.1.4 ! misho 5680: if (eptr == pp) goto TAIL_RECURSE;
1.1 misho 5681: RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5682: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1.1.4 ! misho 5683: eptr--;
1.1 misho 5684: for (;;) /* Move back over one extended */
5685: {
1.1.1.2 misho 5686: if (!utf) c = *eptr; else
1.1 misho 5687: {
5688: BACKCHAR(eptr);
5689: GETCHAR(c, eptr);
5690: }
5691: if (UCD_CATEGORY(c) != ucp_M) break;
5692: eptr--;
5693: }
5694: }
5695: }
5696:
5697: else
5698: #endif /* SUPPORT_UCP */
5699:
1.1.1.2 misho 5700: #ifdef SUPPORT_UTF
5701: if (utf)
1.1 misho 5702: {
5703: switch(ctype)
5704: {
5705: case OP_ANY:
5706: if (max < INT_MAX)
5707: {
5708: for (i = min; i < max; i++)
5709: {
5710: if (eptr >= md->end_subject)
5711: {
5712: SCHECK_PARTIAL();
5713: break;
5714: }
5715: if (IS_NEWLINE(eptr)) break;
1.1.1.3 misho 5716: if (md->partial != 0 && /* Take care with CRLF partial */
5717: eptr + 1 >= md->end_subject &&
5718: NLBLOCK->nltype == NLTYPE_FIXED &&
5719: NLBLOCK->nllen == 2 &&
1.1.1.4 ! misho 5720: RAWUCHAR(eptr) == NLBLOCK->nl[0])
1.1.1.3 misho 5721: {
5722: md->hitend = TRUE;
5723: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5724: }
1.1 misho 5725: eptr++;
1.1.1.2 misho 5726: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misho 5727: }
5728: }
5729:
5730: /* Handle unlimited UTF-8 repeat */
5731:
5732: else
5733: {
5734: for (i = min; i < max; i++)
5735: {
5736: if (eptr >= md->end_subject)
5737: {
5738: SCHECK_PARTIAL();
5739: break;
5740: }
5741: if (IS_NEWLINE(eptr)) break;
1.1.1.3 misho 5742: if (md->partial != 0 && /* Take care with CRLF partial */
5743: eptr + 1 >= md->end_subject &&
5744: NLBLOCK->nltype == NLTYPE_FIXED &&
5745: NLBLOCK->nllen == 2 &&
1.1.1.4 ! misho 5746: RAWUCHAR(eptr) == NLBLOCK->nl[0])
1.1.1.3 misho 5747: {
5748: md->hitend = TRUE;
5749: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5750: }
1.1 misho 5751: eptr++;
1.1.1.2 misho 5752: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misho 5753: }
5754: }
5755: break;
5756:
5757: case OP_ALLANY:
5758: if (max < INT_MAX)
5759: {
5760: for (i = min; i < max; i++)
5761: {
5762: if (eptr >= md->end_subject)
5763: {
5764: SCHECK_PARTIAL();
5765: break;
5766: }
5767: eptr++;
1.1.1.2 misho 5768: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misho 5769: }
5770: }
5771: else
5772: {
5773: eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5774: SCHECK_PARTIAL();
5775: }
5776: break;
5777:
5778: /* The byte case is the same as non-UTF8 */
5779:
5780: case OP_ANYBYTE:
5781: c = max - min;
5782: if (c > (unsigned int)(md->end_subject - eptr))
5783: {
5784: eptr = md->end_subject;
5785: SCHECK_PARTIAL();
5786: }
5787: else eptr += c;
5788: break;
5789:
5790: case OP_ANYNL:
5791: for (i = min; i < max; i++)
5792: {
5793: int len = 1;
5794: if (eptr >= md->end_subject)
5795: {
5796: SCHECK_PARTIAL();
5797: break;
5798: }
5799: GETCHARLEN(c, eptr, len);
1.1.1.4 ! misho 5800: if (c == CHAR_CR)
1.1 misho 5801: {
5802: if (++eptr >= md->end_subject) break;
1.1.1.4 ! misho 5803: if (RAWUCHAR(eptr) == CHAR_LF) eptr++;
1.1 misho 5804: }
5805: else
5806: {
1.1.1.4 ! misho 5807: if (c != CHAR_LF &&
1.1 misho 5808: (md->bsr_anycrlf ||
1.1.1.4 ! misho 5809: (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
! 5810: #ifndef EBCDIC
! 5811: && c != 0x2028 && c != 0x2029
! 5812: #endif /* Not EBCDIC */
! 5813: )))
1.1 misho 5814: break;
5815: eptr += len;
5816: }
5817: }
5818: break;
5819:
5820: case OP_NOT_HSPACE:
5821: case OP_HSPACE:
5822: for (i = min; i < max; i++)
5823: {
5824: BOOL gotspace;
5825: int len = 1;
5826: if (eptr >= md->end_subject)
5827: {
5828: SCHECK_PARTIAL();
5829: break;
5830: }
5831: GETCHARLEN(c, eptr, len);
5832: switch(c)
5833: {
1.1.1.4 ! misho 5834: HSPACE_CASES: gotspace = TRUE; break;
1.1 misho 5835: default: gotspace = FALSE; break;
5836: }
5837: if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5838: eptr += len;
5839: }
5840: break;
5841:
5842: case OP_NOT_VSPACE:
5843: case OP_VSPACE:
5844: for (i = min; i < max; i++)
5845: {
5846: BOOL gotspace;
5847: int len = 1;
5848: if (eptr >= md->end_subject)
5849: {
5850: SCHECK_PARTIAL();
5851: break;
5852: }
5853: GETCHARLEN(c, eptr, len);
5854: switch(c)
5855: {
1.1.1.4 ! misho 5856: VSPACE_CASES: gotspace = TRUE; break;
1.1 misho 5857: default: gotspace = FALSE; break;
5858: }
5859: if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5860: eptr += len;
5861: }
5862: break;
5863:
5864: case OP_NOT_DIGIT:
5865: for (i = min; i < max; i++)
5866: {
5867: int len = 1;
5868: if (eptr >= md->end_subject)
5869: {
5870: SCHECK_PARTIAL();
5871: break;
5872: }
5873: GETCHARLEN(c, eptr, len);
5874: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5875: eptr+= len;
5876: }
5877: break;
5878:
5879: case OP_DIGIT:
5880: for (i = min; i < max; i++)
5881: {
5882: int len = 1;
5883: if (eptr >= md->end_subject)
5884: {
5885: SCHECK_PARTIAL();
5886: break;
5887: }
5888: GETCHARLEN(c, eptr, len);
5889: if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5890: eptr+= len;
5891: }
5892: break;
5893:
5894: case OP_NOT_WHITESPACE:
5895: for (i = min; i < max; i++)
5896: {
5897: int len = 1;
5898: if (eptr >= md->end_subject)
5899: {
5900: SCHECK_PARTIAL();
5901: break;
5902: }
5903: GETCHARLEN(c, eptr, len);
5904: if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5905: eptr+= len;
5906: }
5907: break;
5908:
5909: case OP_WHITESPACE:
5910: for (i = min; i < max; i++)
5911: {
5912: int len = 1;
5913: if (eptr >= md->end_subject)
5914: {
5915: SCHECK_PARTIAL();
5916: break;
5917: }
5918: GETCHARLEN(c, eptr, len);
5919: if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5920: eptr+= len;
5921: }
5922: break;
5923:
5924: case OP_NOT_WORDCHAR:
5925: for (i = min; i < max; i++)
5926: {
5927: int len = 1;
5928: if (eptr >= md->end_subject)
5929: {
5930: SCHECK_PARTIAL();
5931: break;
5932: }
5933: GETCHARLEN(c, eptr, len);
5934: if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5935: eptr+= len;
5936: }
5937: break;
5938:
5939: case OP_WORDCHAR:
5940: for (i = min; i < max; i++)
5941: {
5942: int len = 1;
5943: if (eptr >= md->end_subject)
5944: {
5945: SCHECK_PARTIAL();
5946: break;
5947: }
5948: GETCHARLEN(c, eptr, len);
5949: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5950: eptr+= len;
5951: }
5952: break;
5953:
5954: default:
5955: RRETURN(PCRE_ERROR_INTERNAL);
5956: }
5957:
1.1.1.4 ! misho 5958: if (possessive) continue; /* No backtracking */
1.1 misho 5959: for(;;)
5960: {
1.1.1.4 ! misho 5961: if (eptr == pp) goto TAIL_RECURSE;
1.1 misho 5962: RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5963: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1.1.4 ! misho 5964: eptr--;
1.1 misho 5965: BACKCHAR(eptr);
1.1.1.4 ! misho 5966: if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL &&
! 5967: RAWUCHAR(eptr - 1) == CHAR_CR) eptr--;
1.1 misho 5968: }
5969: }
5970: else
1.1.1.2 misho 5971: #endif /* SUPPORT_UTF */
5972: /* Not UTF mode */
1.1 misho 5973: {
5974: switch(ctype)
5975: {
5976: case OP_ANY:
5977: for (i = min; i < max; i++)
5978: {
5979: if (eptr >= md->end_subject)
5980: {
5981: SCHECK_PARTIAL();
5982: break;
5983: }
5984: if (IS_NEWLINE(eptr)) break;
1.1.1.3 misho 5985: if (md->partial != 0 && /* Take care with CRLF partial */
5986: eptr + 1 >= md->end_subject &&
5987: NLBLOCK->nltype == NLTYPE_FIXED &&
5988: NLBLOCK->nllen == 2 &&
5989: *eptr == NLBLOCK->nl[0])
5990: {
5991: md->hitend = TRUE;
5992: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5993: }
1.1 misho 5994: eptr++;
5995: }
5996: break;
5997:
5998: case OP_ALLANY:
5999: case OP_ANYBYTE:
6000: c = max - min;
6001: if (c > (unsigned int)(md->end_subject - eptr))
6002: {
6003: eptr = md->end_subject;
6004: SCHECK_PARTIAL();
6005: }
6006: else eptr += c;
6007: break;
6008:
6009: case OP_ANYNL:
6010: for (i = min; i < max; i++)
6011: {
6012: if (eptr >= md->end_subject)
6013: {
6014: SCHECK_PARTIAL();
6015: break;
6016: }
6017: c = *eptr;
1.1.1.4 ! misho 6018: if (c == CHAR_CR)
1.1 misho 6019: {
6020: if (++eptr >= md->end_subject) break;
1.1.1.4 ! misho 6021: if (*eptr == CHAR_LF) eptr++;
1.1 misho 6022: }
6023: else
6024: {
1.1.1.4 ! misho 6025: if (c != CHAR_LF && (md->bsr_anycrlf ||
! 6026: (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
! 6027: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
! 6028: && c != 0x2028 && c != 0x2029
1.1.1.2 misho 6029: #endif
1.1.1.4 ! misho 6030: ))) break;
1.1 misho 6031: eptr++;
6032: }
6033: }
6034: break;
6035:
6036: case OP_NOT_HSPACE:
6037: for (i = min; i < max; i++)
6038: {
6039: if (eptr >= md->end_subject)
6040: {
6041: SCHECK_PARTIAL();
6042: break;
6043: }
1.1.1.4 ! misho 6044: switch(*eptr)
! 6045: {
! 6046: default: eptr++; break;
! 6047: HSPACE_BYTE_CASES:
! 6048: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
! 6049: HSPACE_MULTIBYTE_CASES:
1.1.1.2 misho 6050: #endif
1.1.1.4 ! misho 6051: goto ENDLOOP00;
! 6052: }
1.1 misho 6053: }
1.1.1.4 ! misho 6054: ENDLOOP00:
1.1 misho 6055: break;
6056:
6057: case OP_HSPACE:
6058: for (i = min; i < max; i++)
6059: {
6060: if (eptr >= md->end_subject)
6061: {
6062: SCHECK_PARTIAL();
6063: break;
6064: }
1.1.1.4 ! misho 6065: switch(*eptr)
! 6066: {
! 6067: default: goto ENDLOOP01;
! 6068: HSPACE_BYTE_CASES:
! 6069: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
! 6070: HSPACE_MULTIBYTE_CASES:
1.1.1.2 misho 6071: #endif
1.1.1.4 ! misho 6072: eptr++; break;
! 6073: }
1.1 misho 6074: }
1.1.1.4 ! misho 6075: ENDLOOP01:
1.1 misho 6076: break;
6077:
6078: case OP_NOT_VSPACE:
6079: for (i = min; i < max; i++)
6080: {
6081: if (eptr >= md->end_subject)
6082: {
6083: SCHECK_PARTIAL();
6084: break;
6085: }
1.1.1.4 ! misho 6086: switch(*eptr)
! 6087: {
! 6088: default: eptr++; break;
! 6089: VSPACE_BYTE_CASES:
! 6090: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
! 6091: VSPACE_MULTIBYTE_CASES:
1.1.1.2 misho 6092: #endif
1.1.1.4 ! misho 6093: goto ENDLOOP02;
! 6094: }
1.1 misho 6095: }
1.1.1.4 ! misho 6096: ENDLOOP02:
1.1 misho 6097: break;
6098:
6099: case OP_VSPACE:
6100: for (i = min; i < max; i++)
6101: {
6102: if (eptr >= md->end_subject)
6103: {
6104: SCHECK_PARTIAL();
6105: break;
6106: }
1.1.1.4 ! misho 6107: switch(*eptr)
! 6108: {
! 6109: default: goto ENDLOOP03;
! 6110: VSPACE_BYTE_CASES:
! 6111: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
! 6112: VSPACE_MULTIBYTE_CASES:
1.1.1.2 misho 6113: #endif
1.1.1.4 ! misho 6114: eptr++; break;
! 6115: }
1.1 misho 6116: }
1.1.1.4 ! misho 6117: ENDLOOP03:
1.1 misho 6118: break;
6119:
6120: case OP_NOT_DIGIT:
6121: for (i = min; i < max; i++)
6122: {
6123: if (eptr >= md->end_subject)
6124: {
6125: SCHECK_PARTIAL();
6126: break;
6127: }
1.1.1.2 misho 6128: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
1.1 misho 6129: eptr++;
6130: }
6131: break;
6132:
6133: case OP_DIGIT:
6134: for (i = min; i < max; i++)
6135: {
6136: if (eptr >= md->end_subject)
6137: {
6138: SCHECK_PARTIAL();
6139: break;
6140: }
1.1.1.2 misho 6141: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
1.1 misho 6142: eptr++;
6143: }
6144: break;
6145:
6146: case OP_NOT_WHITESPACE:
6147: for (i = min; i < max; i++)
6148: {
6149: if (eptr >= md->end_subject)
6150: {
6151: SCHECK_PARTIAL();
6152: break;
6153: }
1.1.1.2 misho 6154: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
1.1 misho 6155: eptr++;
6156: }
6157: break;
6158:
6159: case OP_WHITESPACE:
6160: for (i = min; i < max; i++)
6161: {
6162: if (eptr >= md->end_subject)
6163: {
6164: SCHECK_PARTIAL();
6165: break;
6166: }
1.1.1.2 misho 6167: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
1.1 misho 6168: eptr++;
6169: }
6170: break;
6171:
6172: case OP_NOT_WORDCHAR:
6173: for (i = min; i < max; i++)
6174: {
6175: if (eptr >= md->end_subject)
6176: {
6177: SCHECK_PARTIAL();
6178: break;
6179: }
1.1.1.2 misho 6180: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
1.1 misho 6181: eptr++;
6182: }
6183: break;
6184:
6185: case OP_WORDCHAR:
6186: for (i = min; i < max; i++)
6187: {
6188: if (eptr >= md->end_subject)
6189: {
6190: SCHECK_PARTIAL();
6191: break;
6192: }
1.1.1.2 misho 6193: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
1.1 misho 6194: eptr++;
6195: }
6196: break;
6197:
6198: default:
6199: RRETURN(PCRE_ERROR_INTERNAL);
6200: }
6201:
1.1.1.4 ! misho 6202: if (possessive) continue; /* No backtracking */
! 6203: for (;;)
1.1 misho 6204: {
1.1.1.4 ! misho 6205: if (eptr == pp) goto TAIL_RECURSE;
1.1 misho 6206: RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6207: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6208: eptr--;
1.1.1.4 ! misho 6209: if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
! 6210: eptr[-1] == CHAR_CR) eptr--;
1.1 misho 6211: }
6212: }
6213:
6214: /* Get here if we can't make it match with any permitted repetitions */
6215:
6216: RRETURN(MATCH_NOMATCH);
6217: }
6218: /* Control never gets here */
6219:
6220: /* There's been some horrible disaster. Arrival here can only mean there is
6221: something seriously wrong in the code above or the OP_xxx definitions. */
6222:
6223: default:
6224: DPRINTF(("Unknown opcode %d\n", *ecode));
6225: RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6226: }
6227:
6228: /* Do not stick any code in here without much thought; it is assumed
6229: that "continue" in the code above comes out to here to repeat the main
6230: loop. */
6231:
6232: } /* End of main loop */
6233: /* Control never reaches here */
6234:
6235:
6236: /* When compiling to use the heap rather than the stack for recursive calls to
6237: match(), the RRETURN() macro jumps here. The number that is saved in
6238: frame->Xwhere indicates which label we actually want to return to. */
6239:
6240: #ifdef NO_RECURSE
6241: #define LBL(val) case val: goto L_RM##val;
6242: HEAP_RETURN:
6243: switch (frame->Xwhere)
6244: {
6245: LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6246: LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6247: LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6248: LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6249: LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6250: LBL(65) LBL(66)
1.1.1.2 misho 6251: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6252: LBL(21)
6253: #endif
6254: #ifdef SUPPORT_UTF
6255: LBL(16) LBL(18) LBL(20)
6256: LBL(22) LBL(23) LBL(28) LBL(30)
1.1 misho 6257: LBL(32) LBL(34) LBL(42) LBL(46)
6258: #ifdef SUPPORT_UCP
6259: LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
1.1.1.4 ! misho 6260: LBL(59) LBL(60) LBL(61) LBL(62) LBL(67) LBL(68)
1.1 misho 6261: #endif /* SUPPORT_UCP */
1.1.1.2 misho 6262: #endif /* SUPPORT_UTF */
1.1 misho 6263: default:
6264: DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6265: return PCRE_ERROR_INTERNAL;
6266: }
6267: #undef LBL
6268: #endif /* NO_RECURSE */
6269: }
6270:
6271:
6272: /***************************************************************************
6273: ****************************************************************************
6274: RECURSION IN THE match() FUNCTION
6275:
6276: Undefine all the macros that were defined above to handle this. */
6277:
6278: #ifdef NO_RECURSE /* These names are macros only when match() uses heap frames instead of real recursion */
6279: #undef eptr
6280: #undef ecode
6281: #undef mstart
6282: #undef offset_top
6283: #undef eptrb
6284: #undef flags /* Reused further down as an ordinary local variable in pcre_exec() */
6285:
6286: #undef callpat
6287: #undef charptr
6288: #undef data
6289: #undef next
6290: #undef pp /* Likewise reused as an ordinary local variable in pcre_exec() */
6291: #undef prev
6292: #undef saved_eptr
6293:
6294: #undef new_recursive
6295:
6296: #undef cur_is_word
6297: #undef condition
6298: #undef prev_is_word
6299:
6300: #undef ctype
6301: #undef length /* pcre_exec() below has a parameter with this name */
6302: #undef max
6303: #undef min
6304: #undef number
6305: #undef offset
6306: #undef op
6307: #undef save_capture_last
6308: #undef save_offset1
6309: #undef save_offset2
6310: #undef save_offset3
6311: #undef stacksave
6312:
6313: #undef newptrb
6314:
6315: #endif /* NO_RECURSE */
6316:
6317: /* These two are defined as macros in both cases, so they are undefined unconditionally, outside the NO_RECURSE test */
6318:
6319: #undef fc
6320: #undef fi
6321:
6322: /***************************************************************************
6323: ***************************************************************************/
6324:
6325:
1.1.1.3 misho 6326: #ifdef NO_RECURSE
6327: /*************************************************
6328: * Release allocated heap frames *
6329: *************************************************/
6330:
6331: /* This function releases all the allocated frames. The base frame is on the
6332: machine stack, and so must not be freed.
6333:
6334: Argument: the address of the base frame
6335: Returns: nothing
6336: */
6337:
6338: static void
6339: release_match_heapframes (heapframe *frame_base)
6340: {
6341: heapframe *nextframe = frame_base->Xnextframe;
6342: while (nextframe != NULL)
6343: {
6344: heapframe *oldframe = nextframe;
6345: nextframe = nextframe->Xnextframe;
6346: (PUBL(stack_free))(oldframe);
6347: }
6348: }
6349: #endif
6350:
1.1 misho 6351:
6352: /*************************************************
6353: * Execute a Regular Expression *
6354: *************************************************/
6355:
6356: /* This function applies a compiled re to a subject string and picks out
6357: portions of the string if it matches. Two elements in the vector are set for
6358: each substring: the offsets to the start and end of the substring.
6359:
6360: Arguments:
6361: argument_re points to the compiled expression
6362: extra_data points to extra data or is NULL
6363: subject points to the subject string
6364: length length of subject string (may contain binary zeros)
6365: start_offset where to start in the subject string
6366: options option bits
6367: offsets points to a vector of ints to be filled in with offsets
6368: offsetcount the number of elements in the vector
6369:
6370: Returns: > 0 => success; value is the number of elements filled in
6371: = 0 => success, but offsets is not big enough
6372: -1 => failed to match
6373: < -1 => some kind of unexpected problem
6374: */
6375:
1.1.1.4 ! misho 6376: #if defined COMPILE_PCRE8
1.1 misho 6377: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6378: pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6379: PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6380: int offsetcount)
1.1.1.4 ! misho 6381: #elif defined COMPILE_PCRE16
1.1.1.2 misho 6382: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6383: pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6384: PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6385: int offsetcount)
1.1.1.4 ! misho 6386: #elif defined COMPILE_PCRE32
! 6387: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
! 6388: pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
! 6389: PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
! 6390: int offsetcount)
1.1.1.2 misho 6391: #endif
1.1 misho 6392: {
6393: int rc, ocount, arg_offset_max;
6394: int newline;
6395: BOOL using_temporary_offsets = FALSE;
6396: BOOL anchored;
6397: BOOL startline;
6398: BOOL firstline;
1.1.1.2 misho 6399: BOOL utf;
6400: BOOL has_first_char = FALSE;
6401: BOOL has_req_char = FALSE;
6402: pcre_uchar first_char = 0;
6403: pcre_uchar first_char2 = 0;
6404: pcre_uchar req_char = 0;
6405: pcre_uchar req_char2 = 0;
1.1 misho 6406: match_data match_block;
6407: match_data *md = &match_block;
1.1.1.2 misho 6408: const pcre_uint8 *tables;
6409: const pcre_uint8 *start_bits = NULL;
6410: PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6411: PCRE_PUCHAR end_subject;
6412: PCRE_PUCHAR start_partial = NULL;
1.1.1.4 ! misho 6413: PCRE_PUCHAR match_partial;
1.1.1.2 misho 6414: PCRE_PUCHAR req_char_ptr = start_match - 1;
1.1 misho 6415:
6416: const pcre_study_data *study;
1.1.1.2 misho 6417: const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
1.1 misho 6418:
1.1.1.3 misho 6419: #ifdef NO_RECURSE
6420: heapframe frame_zero;
6421: frame_zero.Xprevframe = NULL; /* Marks the top level */
6422: frame_zero.Xnextframe = NULL; /* None are allocated yet */
6423: md->match_frames_base = &frame_zero;
6424: #endif
6425:
1.1.1.2 misho 6426: /* Check for the special magic call that measures the size of the stack used
1.1.1.3 misho 6427: per recursive call of match(). Without the funny casting for sizeof, a Windows
6428: compiler gave this error: "unary minus operator applied to unsigned type,
6429: result still unsigned". Hopefully the cast fixes that. */
1.1.1.2 misho 6430:
6431: if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6432: start_offset == -999)
6433: #ifdef NO_RECURSE
1.1.1.3 misho 6434: return -((int)sizeof(heapframe));
1.1.1.2 misho 6435: #else
6436: return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6437: #endif
1.1 misho 6438:
6439: /* Plausibility checks */
6440:
6441: if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
1.1.1.2 misho 6442: if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6443: return PCRE_ERROR_NULL;
1.1 misho 6444: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
1.1.1.4 ! misho 6445: if (length < 0) return PCRE_ERROR_BADLENGTH;
1.1 misho 6446: if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6447:
1.1.1.2 misho 6448: /* Check that the first field in the block is the magic number. If it is not,
6449: return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6450: REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6451: means that the pattern is likely compiled with different endianness. */
6452:
6453: if (re->magic_number != MAGIC_NUMBER)
6454: return re->magic_number == REVERSED_MAGIC_NUMBER?
6455: PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6456: if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6457:
1.1 misho 6458: /* These two settings are used in the code for checking a UTF-8 string that
6459: follows immediately afterwards. Other values in the md block are used only
6460: during "normal" pcre_exec() processing, not when the JIT support is in use,
6461: so they are set up later. */
6462:
1.1.1.2 misho 6463: /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6464: utf = md->utf = (re->options & PCRE_UTF8) != 0;
1.1 misho 6465: md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6466: ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6467:
6468: /* Check a UTF-8 string if required. Pass back the character offset and error
6469: code for an invalid string if a results vector is available. */
6470:
1.1.1.2 misho 6471: #ifdef SUPPORT_UTF
6472: if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
1.1 misho 6473: {
6474: int erroroffset;
1.1.1.2 misho 6475: int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
1.1 misho 6476: if (errorcode != 0)
6477: {
6478: if (offsetcount >= 2)
6479: {
6480: offsets[0] = erroroffset;
6481: offsets[1] = errorcode;
6482: }
1.1.1.4 ! misho 6483: #if defined COMPILE_PCRE8
1.1 misho 6484: return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6485: PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
1.1.1.4 ! misho 6486: #elif defined COMPILE_PCRE16
! 6487: return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
! 6488: PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
! 6489: #elif defined COMPILE_PCRE32
! 6490: return PCRE_ERROR_BADUTF32;
1.1.1.2 misho 6491: #endif
1.1 misho 6492: }
1.1.1.4 ! misho 6493: #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
1.1.1.2 misho 6494: /* Check that a start_offset points to the start of a UTF character. */
1.1 misho 6495: if (start_offset > 0 && start_offset < length &&
1.1.1.2 misho 6496: NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
1.1 misho 6497: return PCRE_ERROR_BADUTF8_OFFSET;
1.1.1.4 ! misho 6498: #endif
1.1 misho 6499: }
6500: #endif
6501:
6502: /* If the pattern was successfully studied with JIT support, run the JIT
6503: executable instead of the rest of this function. Most options must be set at
6504: compile time for the JIT code to be usable. Fallback to the normal code path if
1.1.1.3 misho 6505: an unsupported flag is set. */
1.1 misho 6506:
6507: #ifdef SUPPORT_JIT
6508: if (extra_data != NULL
1.1.1.3 misho 6509: && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6510: PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
1.1 misho 6511: && extra_data->executable_jit != NULL
1.1.1.4 ! misho 6512: && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
1.1.1.3 misho 6513: {
1.1.1.4 ! misho 6514: rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
1.1.1.3 misho 6515: start_offset, options, offsets, offsetcount);
6516:
6517: /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial matching
6518: mode is not compiled. In this case we simply fall back to the interpreter. */
6519:
1.1.1.4 ! misho 6520: if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
1.1.1.3 misho 6521: }
1.1 misho 6522: #endif
6523:
6524: /* Carry on with non-JIT matching. This information is for finding all the
6525: numbers associated with a given name, for condition testing. */
6526:
1.1.1.2 misho 6527: md->name_table = (pcre_uchar *)re + re->name_table_offset;
1.1 misho 6528: md->name_count = re->name_count;
6529: md->name_entry_size = re->name_entry_size;
6530:
6531: /* Fish out the optional data from the extra_data structure, first setting
6532: the default values. */
6533:
6534: study = NULL;
6535: md->match_limit = MATCH_LIMIT;
6536: md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6537: md->callout_data = NULL;
6538:
6539: /* The table pointer is always in native byte order. */
6540:
1.1.1.2 misho 6541: tables = re->tables;
1.1 misho 6542:
1.1.1.4 ! misho 6543: /* The two limit values override the defaults, whatever their value. */
! 6544:
1.1 misho 6545: if (extra_data != NULL)
6546: {
6547: register unsigned int flags = extra_data->flags;
6548: if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6549: study = (const pcre_study_data *)extra_data->study_data;
6550: if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6551: md->match_limit = extra_data->match_limit;
6552: if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6553: md->match_limit_recursion = extra_data->match_limit_recursion;
6554: if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6555: md->callout_data = extra_data->callout_data;
6556: if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6557: }
6558:
1.1.1.4 ! misho 6559: /* Limits in the regex override only if they are smaller. */
! 6560:
! 6561: if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit)
! 6562: md->match_limit = re->limit_match;
! 6563:
! 6564: if ((re->flags & PCRE_RLSET) != 0 &&
! 6565: re->limit_recursion < md->match_limit_recursion)
! 6566: md->match_limit_recursion = re->limit_recursion;
! 6567:
1.1 misho 6568: /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6569: is a feature that makes it possible to save compiled regex and re-use them
6570: in other programs later. */
6571:
1.1.1.2 misho 6572: if (tables == NULL) tables = PRIV(default_tables);
1.1 misho 6573:
6574: /* Set up other data */
6575:
6576: anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6577: startline = (re->flags & PCRE_STARTLINE) != 0;
6578: firstline = (re->options & PCRE_FIRSTLINE) != 0;
6579:
6580: /* The code starts after the real_pcre block and the capture name table. */
6581:
1.1.1.2 misho 6582: md->start_code = (const pcre_uchar *)re + re->name_table_offset +
1.1 misho 6583: re->name_count * re->name_entry_size;
6584:
1.1.1.2 misho 6585: md->start_subject = (PCRE_PUCHAR)subject;
1.1 misho 6586: md->start_offset = start_offset;
6587: md->end_subject = md->start_subject + length;
6588: end_subject = md->end_subject;
6589:
6590: md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6591: md->use_ucp = (re->options & PCRE_UCP) != 0;
6592: md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
1.1.1.4 ! misho 6593: md->ignore_skip_arg = 0;
1.1 misho 6594:
6595: /* Some options are unpacked into BOOL variables in the hope that testing
6596: them will be faster than individual option bits. */
6597:
6598: md->notbol = (options & PCRE_NOTBOL) != 0;
6599: md->noteol = (options & PCRE_NOTEOL) != 0;
6600: md->notempty = (options & PCRE_NOTEMPTY) != 0;
6601: md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6602:
6603: md->hitend = FALSE;
6604: md->mark = md->nomatch_mark = NULL; /* In case never set */
6605:
6606: md->recursive = NULL; /* No recursion at top level */
6607: md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6608:
6609: md->lcc = tables + lcc_offset;
1.1.1.2 misho 6610: md->fcc = tables + fcc_offset;
1.1 misho 6611: md->ctypes = tables + ctypes_offset;
6612:
6613: /* Handle different \R options. */
6614:
6615: switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6616: {
6617: case 0:
6618: if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6619: md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6620: else
6621: #ifdef BSR_ANYCRLF
6622: md->bsr_anycrlf = TRUE;
6623: #else
6624: md->bsr_anycrlf = FALSE;
6625: #endif
6626: break;
6627:
6628: case PCRE_BSR_ANYCRLF:
6629: md->bsr_anycrlf = TRUE;
6630: break;
6631:
6632: case PCRE_BSR_UNICODE:
6633: md->bsr_anycrlf = FALSE;
6634: break;
6635:
6636: default: return PCRE_ERROR_BADNEWLINE;
6637: }
6638:
6639: /* Handle different types of newline. The three bits give eight cases. If
6640: nothing is set at run time, whatever was used at compile time applies. */
6641:
6642: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6643: (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6644: {
6645: case 0: newline = NEWLINE; break; /* Compile-time default */
6646: case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6647: case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6648: case PCRE_NEWLINE_CR+
6649: PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6650: case PCRE_NEWLINE_ANY: newline = -1; break;
6651: case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6652: default: return PCRE_ERROR_BADNEWLINE;
6653: }
6654:
6655: if (newline == -2)
6656: {
6657: md->nltype = NLTYPE_ANYCRLF;
6658: }
6659: else if (newline < 0)
6660: {
6661: md->nltype = NLTYPE_ANY;
6662: }
6663: else
6664: {
6665: md->nltype = NLTYPE_FIXED;
6666: if (newline > 255)
6667: {
6668: md->nllen = 2;
6669: md->nl[0] = (newline >> 8) & 255;
6670: md->nl[1] = newline & 255;
6671: }
6672: else
6673: {
6674: md->nllen = 1;
6675: md->nl[0] = newline;
6676: }
6677: }
6678:
6679: /* Partial matching was originally supported only for a restricted set of
6680: regexes; from release 8.00 there are no restrictions, but the bits are still
6681: defined (though never set). So there's no harm in leaving this code. */
6682:
6683: if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6684: return PCRE_ERROR_BADPARTIAL;
6685:
6686: /* If the expression has got more back references than the offsets supplied can
6687: hold, we get a temporary chunk of working store to use during the matching.
6688: Otherwise, we can use the vector supplied, rounding down its size to a multiple
6689: of 3. */
6690:
6691: ocount = offsetcount - (offsetcount % 3);
6692: arg_offset_max = (2*ocount)/3;
6693:
6694: if (re->top_backref > 0 && re->top_backref >= ocount/3)
6695: {
6696: ocount = re->top_backref * 3 + 3;
1.1.1.2 misho 6697: md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
1.1 misho 6698: if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6699: using_temporary_offsets = TRUE;
6700: DPRINTF(("Got memory to hold back references\n"));
6701: }
6702: else md->offset_vector = offsets;
6703: md->offset_end = ocount;
6704: md->offset_max = (2*ocount)/3;
1.1.1.4 ! misho 6705: md->capture_last = 0;
1.1 misho 6706:
6707: /* Reset the working variable associated with each extraction. These should
6708: never be used unless previously set, but they get saved and restored, and so we
6709: initialize them to avoid reading uninitialized locations. Also, unset the
6710: offsets for the matched string. This is really just for tidiness with callouts,
6711: in case they inspect these fields. */
6712:
6713: if (md->offset_vector != NULL)
6714: {
6715: register int *iptr = md->offset_vector + ocount;
6716: register int *iend = iptr - re->top_bracket;
6717: if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6718: while (--iptr >= iend) *iptr = -1;
6719: md->offset_vector[0] = md->offset_vector[1] = -1;
6720: }
6721:
1.1.1.2 misho 6722: /* Set up the first character to match, if available. The first_char value is
1.1 misho 6723: never set for an anchored regular expression, but the anchoring may be forced
6724: at run time, so we have to test for anchoring. The first char may be unset for
6725: an unanchored pattern, of course. If there's no first char and the pattern was
6726: studied, there may be a bitmap of possible first characters. */
6727:
6728: if (!anchored)
6729: {
6730: if ((re->flags & PCRE_FIRSTSET) != 0)
6731: {
1.1.1.2 misho 6732: has_first_char = TRUE;
6733: first_char = first_char2 = (pcre_uchar)(re->first_char);
6734: if ((re->flags & PCRE_FCH_CASELESS) != 0)
6735: {
6736: first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6737: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6738: if (utf && first_char > 127)
6739: first_char2 = UCD_OTHERCASE(first_char);
6740: #endif
6741: }
1.1 misho 6742: }
6743: else
6744: if (!startline && study != NULL &&
6745: (study->flags & PCRE_STUDY_MAPPED) != 0)
6746: start_bits = study->start_bits;
6747: }
6748:
6749: /* For anchored or unanchored matches, there may be a "last known required
6750: character" set. */
6751:
6752: if ((re->flags & PCRE_REQCHSET) != 0)
6753: {
1.1.1.2 misho 6754: has_req_char = TRUE;
6755: req_char = req_char2 = (pcre_uchar)(re->req_char);
6756: if ((re->flags & PCRE_RCH_CASELESS) != 0)
6757: {
6758: req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6759: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6760: if (utf && req_char > 127)
6761: req_char2 = UCD_OTHERCASE(req_char);
6762: #endif
6763: }
1.1 misho 6764: }
6765:
6766:
6767: /* ==========================================================================*/
6768:
6769: /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6770: the loop runs just once. */
6771:
6772: for(;;)
6773: {
1.1.1.2 misho 6774: PCRE_PUCHAR save_end_subject = end_subject;
6775: PCRE_PUCHAR new_start_match;
1.1 misho 6776:
6777: /* If firstline is TRUE, the start of the match is constrained to the first
6778: line of a multiline string. That is, the match must be before or at the first
6779: newline. Implement this by temporarily adjusting end_subject so that we stop
6780: scanning at a newline. If the match fails at the newline, later code breaks
6781: this loop. */
6782:
6783: if (firstline)
6784: {
1.1.1.2 misho 6785: PCRE_PUCHAR t = start_match;
6786: #ifdef SUPPORT_UTF
6787: if (utf)
1.1 misho 6788: {
6789: while (t < md->end_subject && !IS_NEWLINE(t))
6790: {
6791: t++;
1.1.1.2 misho 6792: ACROSSCHAR(t < end_subject, *t, t++);
1.1 misho 6793: }
6794: }
6795: else
6796: #endif
6797: while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6798: end_subject = t;
6799: }
6800:
6801: /* There are some optimizations that avoid running the match if a known
6802: starting point is not found, or if a known later character is not present.
6803: However, there is an option that disables these, for testing and for ensuring
6804: that all callouts do actually occur. The option can be set in the regex by
6805: (*NO_START_OPT) or passed in match-time options. */
6806:
6807: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6808: {
1.1.1.2 misho 6809: /* Advance to a unique first char if there is one. */
1.1 misho 6810:
1.1.1.2 misho 6811: if (has_first_char)
1.1 misho 6812: {
1.1.1.4 ! misho 6813: pcre_uchar smc;
! 6814:
1.1.1.2 misho 6815: if (first_char != first_char2)
6816: while (start_match < end_subject &&
1.1.1.4 ! misho 6817: (smc = RAWUCHARTEST(start_match)) != first_char && smc != first_char2)
1.1 misho 6818: start_match++;
6819: else
1.1.1.4 ! misho 6820: while (start_match < end_subject && RAWUCHARTEST(start_match) != first_char)
1.1 misho 6821: start_match++;
6822: }
6823:
6824: /* Or to just after a linebreak for a multiline match */
6825:
6826: else if (startline)
6827: {
6828: if (start_match > md->start_subject + start_offset)
6829: {
1.1.1.2 misho 6830: #ifdef SUPPORT_UTF
6831: if (utf)
1.1 misho 6832: {
6833: while (start_match < end_subject && !WAS_NEWLINE(start_match))
6834: {
6835: start_match++;
1.1.1.2 misho 6836: ACROSSCHAR(start_match < end_subject, *start_match,
6837: start_match++);
1.1 misho 6838: }
6839: }
6840: else
6841: #endif
6842: while (start_match < end_subject && !WAS_NEWLINE(start_match))
6843: start_match++;
6844:
6845: /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6846: and we are now at a LF, advance the match position by one more character.
6847: */
6848:
6849: if (start_match[-1] == CHAR_CR &&
6850: (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6851: start_match < end_subject &&
1.1.1.4 ! misho 6852: RAWUCHARTEST(start_match) == CHAR_NL)
1.1 misho 6853: start_match++;
6854: }
6855: }
6856:
6857: /* Or to a non-unique first byte after study */
6858:
6859: else if (start_bits != NULL)
6860: {
6861: while (start_match < end_subject)
6862: {
1.1.1.4 ! misho 6863: register pcre_uint32 c = RAWUCHARTEST(start_match);
1.1.1.2 misho 6864: #ifndef COMPILE_PCRE8
6865: if (c > 255) c = 255;
6866: #endif
1.1 misho 6867: if ((start_bits[c/8] & (1 << (c&7))) == 0)
6868: {
6869: start_match++;
1.1.1.2 misho 6870: #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6871: /* In non 8-bit mode, the iteration will stop for
6872: characters > 255 at the beginning or not stop at all. */
6873: if (utf)
6874: ACROSSCHAR(start_match < end_subject, *start_match,
6875: start_match++);
1.1 misho 6876: #endif
6877: }
6878: else break;
6879: }
6880: }
6881: } /* Starting optimizations */
6882:
6883: /* Restore fudged end_subject */
6884:
6885: end_subject = save_end_subject;
6886:
6887: /* The following two optimizations are disabled for partial matching or if
6888: disabling is explicitly requested. */
6889:
6890: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6891: {
6892: /* If the pattern was studied, a minimum subject length may be set. This is
6893: a lower bound; no actual string of that length may actually match the
6894: pattern. Although the value is, strictly, in characters, we treat it as
6895: bytes to avoid spending too much time in this optimization. */
6896:
6897: if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6898: (pcre_uint32)(end_subject - start_match) < study->minlength)
6899: {
6900: rc = MATCH_NOMATCH;
6901: break;
6902: }
6903:
1.1.1.2 misho 6904: /* If req_char is set, we know that that character must appear in the
6905: subject for the match to succeed. If the first character is set, req_char
1.1 misho 6906: must be later in the subject; otherwise the test starts at the match point.
6907: This optimization can save a huge amount of backtracking in patterns with
6908: nested unlimited repeats that aren't going to match. Writing separate code
6909: for cased/caseless versions makes it go faster, as does using an
6910: autoincrement and backing off on a match.
6911:
6912: HOWEVER: when the subject string is very, very long, searching to its end
6913: can take a long time, and give bad performance on quite ordinary patterns.
6914: This showed up when somebody was matching something like /^\d+C/ on a
6915: 32-megabyte string... so we don't do this when the string is sufficiently
6916: long. */
6917:
1.1.1.2 misho 6918: if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
1.1 misho 6919: {
1.1.1.2 misho 6920: register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
1.1 misho 6921:
6922: /* We don't need to repeat the search if we haven't yet reached the
6923: place we found it at last time. */
6924:
1.1.1.2 misho 6925: if (p > req_char_ptr)
1.1 misho 6926: {
1.1.1.2 misho 6927: if (req_char != req_char2)
1.1 misho 6928: {
6929: while (p < end_subject)
6930: {
1.1.1.4 ! misho 6931: register pcre_uint32 pp = RAWUCHARINCTEST(p);
1.1.1.2 misho 6932: if (pp == req_char || pp == req_char2) { p--; break; }
1.1 misho 6933: }
6934: }
6935: else
6936: {
6937: while (p < end_subject)
6938: {
1.1.1.4 ! misho 6939: if (RAWUCHARINCTEST(p) == req_char) { p--; break; }
1.1 misho 6940: }
6941: }
6942:
6943: /* If we can't find the required character, break the matching loop,
6944: forcing a match failure. */
6945:
6946: if (p >= end_subject)
6947: {
6948: rc = MATCH_NOMATCH;
6949: break;
6950: }
6951:
6952: /* If we have found the required character, save the point where we
6953: found it, so that we don't search again next time round the loop if
6954: the start hasn't passed this character yet. */
6955:
1.1.1.2 misho 6956: req_char_ptr = p;
1.1 misho 6957: }
6958: }
6959: }
6960:
6961: #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6962: printf(">>>> Match against: ");
6963: pchars(start_match, end_subject - start_match, TRUE, md);
6964: printf("\n");
6965: #endif
6966:
6967: /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6968: first starting point for which a partial match was found. */
6969:
6970: md->start_match_ptr = start_match;
6971: md->start_used_ptr = start_match;
6972: md->match_call_count = 0;
6973: md->match_function_type = 0;
6974: md->end_offset_top = 0;
1.1.1.4 ! misho 6975: md->skip_arg_count = 0;
1.1 misho 6976: rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
1.1.1.4 ! misho 6977: if (md->hitend && start_partial == NULL)
! 6978: {
! 6979: start_partial = md->start_used_ptr;
! 6980: match_partial = start_match;
! 6981: }
1.1 misho 6982:
6983: switch(rc)
6984: {
6985: /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6986: the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6987: entirely. The only way we can do that is to re-do the match at the same
6988: point, with a flag to force SKIP with an argument to be ignored. Just
6989: treating this case as NOMATCH does not work because it does not check other
6990: alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6991:
6992: case MATCH_SKIP_ARG:
6993: new_start_match = start_match;
1.1.1.4 ! misho 6994: md->ignore_skip_arg = md->skip_arg_count;
1.1 misho 6995: break;
6996:
1.1.1.4 ! misho 6997: /* SKIP passes back the next starting point explicitly, but if it is no
! 6998: greater than the match we have just done, treat it as NOMATCH. */
1.1 misho 6999:
7000: case MATCH_SKIP:
1.1.1.4 ! misho 7001: if (md->start_match_ptr > start_match)
1.1 misho 7002: {
7003: new_start_match = md->start_match_ptr;
7004: break;
7005: }
7006: /* Fall through */
7007:
7008: /* NOMATCH and PRUNE advance by one character. THEN at this level acts
1.1.1.4 ! misho 7009: exactly like PRUNE. Unset ignore SKIP-with-argument. */
1.1 misho 7010:
7011: case MATCH_NOMATCH:
7012: case MATCH_PRUNE:
7013: case MATCH_THEN:
1.1.1.4 ! misho 7014: md->ignore_skip_arg = 0;
1.1 misho 7015: new_start_match = start_match + 1;
1.1.1.2 misho 7016: #ifdef SUPPORT_UTF
7017: if (utf)
7018: ACROSSCHAR(new_start_match < end_subject, *new_start_match,
7019: new_start_match++);
1.1 misho 7020: #endif
7021: break;
7022:
7023: /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
7024:
7025: case MATCH_COMMIT:
7026: rc = MATCH_NOMATCH;
7027: goto ENDLOOP;
7028:
7029: /* Any other return is either a match, or some kind of error. */
7030:
7031: default:
7032: goto ENDLOOP;
7033: }
7034:
7035: /* Control reaches here for the various types of "no match at this point"
7036: result. Reset the code to MATCH_NOMATCH for subsequent checking. */
7037:
7038: rc = MATCH_NOMATCH;
7039:
7040: /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
7041: newline in the subject (though it may continue over the newline). Therefore,
7042: if we have just failed to match, starting at a newline, do not continue. */
7043:
7044: if (firstline && IS_NEWLINE(start_match)) break;
7045:
7046: /* Advance to new matching position */
7047:
7048: start_match = new_start_match;
7049:
7050: /* Break the loop if the pattern is anchored or if we have passed the end of
7051: the subject. */
7052:
7053: if (anchored || start_match > end_subject) break;
7054:
7055: /* If we have just passed a CR and we are now at a LF, and the pattern does
7056: not contain any explicit matches for \r or \n, and the newline option is CRLF
1.1.1.2 misho 7057: or ANY or ANYCRLF, advance the match position by one more character. In
7058: normal matching start_match will always be greater than the first position at
7059: this stage, but a failed *SKIP can cause a return at the same point, which is
7060: why the first test exists. */
1.1 misho 7061:
1.1.1.2 misho 7062: if (start_match > (PCRE_PUCHAR)subject + start_offset &&
7063: start_match[-1] == CHAR_CR &&
1.1 misho 7064: start_match < end_subject &&
7065: *start_match == CHAR_NL &&
7066: (re->flags & PCRE_HASCRORLF) == 0 &&
7067: (md->nltype == NLTYPE_ANY ||
7068: md->nltype == NLTYPE_ANYCRLF ||
7069: md->nllen == 2))
7070: start_match++;
7071:
7072: md->mark = NULL; /* Reset for start of next match attempt */
7073: } /* End of for(;;) "bumpalong" loop */
7074:
7075: /* ==========================================================================*/
7076:
7077: /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
7078: conditions is true:
7079:
7080: (1) The pattern is anchored or the match was failed by (*COMMIT);
7081:
7082: (2) We are past the end of the subject;
7083:
7084: (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
7085: this option requests that a match occur at or before the first newline in
7086: the subject.
7087:
7088: When we have a match and the offset vector is big enough to deal with any
7089: backreferences, captured substring offsets will already be set up. In the case
7090: where we had to get some local store to hold offsets for backreference
7091: processing, copy those that we can. In this case there need not be overflow if
7092: certain parts of the pattern were not used, even though there are more
7093: capturing parentheses than vector slots. */
7094:
7095: ENDLOOP:
7096:
7097: if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
7098: {
7099: if (using_temporary_offsets)
7100: {
7101: if (arg_offset_max >= 4)
7102: {
7103: memcpy(offsets + 2, md->offset_vector + 2,
7104: (arg_offset_max - 2) * sizeof(int));
7105: DPRINTF(("Copied offsets from temporary memory\n"));
7106: }
1.1.1.4 ! misho 7107: if (md->end_offset_top > arg_offset_max) md->capture_last |= OVFLBIT;
1.1 misho 7108: DPRINTF(("Freeing temporary memory\n"));
1.1.1.2 misho 7109: (PUBL(free))(md->offset_vector);
1.1 misho 7110: }
7111:
7112: /* Set the return code to the number of captured strings, or 0 if there were
7113: too many to fit into the vector. */
7114:
1.1.1.4 ! misho 7115: rc = ((md->capture_last & OVFLBIT) != 0 &&
! 7116: md->end_offset_top >= arg_offset_max)?
1.1 misho 7117: 0 : md->end_offset_top/2;
7118:
7119: /* If there is space in the offset vector, set any unused pairs at the end of
7120: the pattern to -1 for backwards compatibility. It is documented that this
7121: happens. In earlier versions, the whole set of potential capturing offsets
7122: was set to -1 each time round the loop, but this is handled differently now.
7123: "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
7124: those at the end that need unsetting here. We can't just unset them all at
7125: the start of the whole thing because they may get set in one branch that is
7126: not the final matching branch. */
7127:
7128: if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
7129: {
7130: register int *iptr, *iend;
7131: int resetcount = 2 + re->top_bracket * 2;
1.1.1.3 misho 7132: if (resetcount > offsetcount) resetcount = offsetcount;
1.1 misho 7133: iptr = offsets + md->end_offset_top;
7134: iend = offsets + resetcount;
7135: while (iptr < iend) *iptr++ = -1;
7136: }
7137:
7138: /* If there is space, set up the whole thing as substring 0. The value of
7139: md->start_match_ptr might be modified if \K was encountered on the
7140: successful matching path. */
7141:
7142: if (offsetcount < 2) rc = 0; else
7143: {
7144: offsets[0] = (int)(md->start_match_ptr - md->start_subject);
7145: offsets[1] = (int)(md->end_match_ptr - md->start_subject);
7146: }
7147:
7148: /* Return MARK data if requested */
7149:
7150: if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
1.1.1.2 misho 7151: *(extra_data->mark) = (pcre_uchar *)md->mark;
1.1 misho 7152: DPRINTF((">>>> returning %d\n", rc));
1.1.1.3 misho 7153: #ifdef NO_RECURSE
7154: release_match_heapframes(&frame_zero);
7155: #endif
1.1 misho 7156: return rc;
7157: }
7158:
7159: /* Control gets here if there has been an error, or if the overall match
7160: attempt has failed at all permitted starting positions. */
7161:
7162: if (using_temporary_offsets)
7163: {
7164: DPRINTF(("Freeing temporary memory\n"));
1.1.1.2 misho 7165: (PUBL(free))(md->offset_vector);
1.1 misho 7166: }
7167:
7168: /* For anything other than nomatch or partial match, just return the code. */
7169:
7170: if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
7171: {
7172: DPRINTF((">>>> error: returning %d\n", rc));
1.1.1.3 misho 7173: #ifdef NO_RECURSE
7174: release_match_heapframes(&frame_zero);
7175: #endif
1.1 misho 7176: return rc;
7177: }
7178:
7179: /* Handle partial matches - disable any mark data */
7180:
7181: if (start_partial != NULL)
7182: {
7183: DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
7184: md->mark = NULL;
7185: if (offsetcount > 1)
7186: {
1.1.1.2 misho 7187: offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject);
7188: offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
1.1.1.4 ! misho 7189: if (offsetcount > 2)
! 7190: offsets[2] = (int)(match_partial - (PCRE_PUCHAR)subject);
1.1 misho 7191: }
7192: rc = PCRE_ERROR_PARTIAL;
7193: }
7194:
7195: /* This is the classic nomatch case */
7196:
7197: else
7198: {
7199: DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
7200: rc = PCRE_ERROR_NOMATCH;
7201: }
7202:
7203: /* Return the MARK data if it has been requested. */
7204:
7205: if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
1.1.1.2 misho 7206: *(extra_data->mark) = (pcre_uchar *)md->nomatch_mark;
1.1.1.3 misho 7207: #ifdef NO_RECURSE
7208: release_match_heapframes(&frame_zero);
7209: #endif
1.1 misho 7210: return rc;
7211: }
7212:
7213: /* End of pcre_exec.c */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>