embedaddon/pcre/pcre_exec.c - view

File: [ELWIX - Embedded LightWeight unIX -] / embedaddon / pcre / pcre_exec.c
Revision 1.1.1.5 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Sun Jun 15 19:46:04 2014 UTC (11 years, 1 month ago) by misho
Branches: pcre, MAIN
CVS tags: v8_34, HEAD

pcre 8.34

1: /************************************************* 2: * Perl-Compatible Regular Expressions * 3: *************************************************/ 4: 5: /* PCRE is a library of functions to support regular expressions whose syntax 6: and semantics are as close as possible to those of the Perl 5 language. 7: 8: Written by Philip Hazel 9: Copyright (c) 1997-2013 University of Cambridge 10: 11: ----------------------------------------------------------------------------- 12: Redistribution and use in source and binary forms, with or without 13: modification, are permitted provided that the following conditions are met: 14: 15: * Redistributions of source code must retain the above copyright notice, 16: this list of conditions and the following disclaimer. 17: 18: * Redistributions in binary form must reproduce the above copyright 19: notice, this list of conditions and the following disclaimer in the 20: documentation and/or other materials provided with the distribution. 21: 22: * Neither the name of the University of Cambridge nor the names of its 23: contributors may be used to endorse or promote products derived from 24: this software without specific prior written permission. 25: 26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29: ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36: POSSIBILITY OF SUCH DAMAGE. 37: ----------------------------------------------------------------------------- 38: */ 39: 40: /* This module contains pcre_exec(), the externally visible function that does 41: pattern matching using an NFA algorithm, trying to mimic Perl as closely as 42: possible. There are also some static supporting functions. */ 43: 44: #ifdef HAVE_CONFIG_H 45: #include "config.h" 46: #endif 47: 48: #define NLBLOCK md /* Block containing newline information */ 49: #define PSSTART start_subject /* Field containing processed string start */ 50: #define PSEND end_subject /* Field containing processed string end */ 51: 52: #include "pcre_internal.h" 53: 54: /* Undefine some potentially clashing cpp symbols */ 55: 56: #undef min 57: #undef max 58: 59: /* The md->capture_last field uses the lower 16 bits for the last captured 60: substring (which can never be greater than 65535) and a bit in the top half 61: to mean "capture vector overflowed". This odd way of doing things was 62: implemented when it was realized that preserving and restoring the overflow bit 63: whenever the last capture number was saved/restored made for a neater 64: interface, and doing it this way saved on (a) another variable, which would 65: have increased the stack frame size (a big NO-NO in PCRE) and (b) another 66: separate set of save/restore instructions. The following defines are used in 67: implementing this. 
*/ 68: 69: #define CAPLMASK 0x0000ffff /* The bits used for last_capture */ 70: #define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */ 71: #define OVFLBIT 0x00010000 /* The bit that is set for overflow */ 72: 73: /* Values for setting in md->match_function_type to indicate two special types 74: of call to match(). We do it this way to save on using another stack variable, 75: as stack usage is to be discouraged. */ 76: 77: #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */ 78: #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */ 79: 80: /* Non-error returns from the match() function. Error returns are externally 81: defined PCRE_ERROR_xxx codes, which are all negative. */ 82: 83: #define MATCH_MATCH 1 84: #define MATCH_NOMATCH 0 85: 86: /* Special internal returns from the match() function. Make them sufficiently 87: negative to avoid the external error codes. */ 88: 89: #define MATCH_ACCEPT (-999) 90: #define MATCH_KETRPOS (-998) 91: #define MATCH_ONCE (-997) 92: /* The next 5 must be kept together and in sequence so that a test that checks 93: for any one of them can use a range. */ 94: #define MATCH_COMMIT (-996) 95: #define MATCH_PRUNE (-995) 96: #define MATCH_SKIP (-994) 97: #define MATCH_SKIP_ARG (-993) 98: #define MATCH_THEN (-992) 99: #define MATCH_BACKTRACK_MAX MATCH_THEN 100: #define MATCH_BACKTRACK_MIN MATCH_COMMIT 101: 102: /* Maximum number of ints of offset to save on the stack for recursive calls. 103: If the offset vector is bigger, malloc is used. This should be a multiple of 3, 104: because the offset vector is always a multiple of 3 long. 
*/ 105: 106: #define REC_STACK_SAVE_MAX 30 107: 108: /* Min and max values for the common repeats; for the maxima, 0 => infinity */ 109: 110: static const char rep_min[] = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, }; 111: static const char rep_max[] = { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, }; 112: 113: #ifdef PCRE_DEBUG 114: /************************************************* 115: * Debugging function to print chars * 116: *************************************************/ 117: 118: /* Print a sequence of chars in printable format, stopping at the end of the 119: subject if the requested. 120: 121: Arguments: 122: p points to characters 123: length number to print 124: is_subject TRUE if printing from within md->start_subject 125: md pointer to matching data block, if is_subject is TRUE 126: 127: Returns: nothing 128: */ 129: 130: static void 131: pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md) 132: { 133: pcre_uint32 c; 134: BOOL utf = md->utf; 135: if (is_subject && length > md->end_subject - p) length = md->end_subject - p; 136: while (length-- > 0) 137: if (isprint(c = RAWUCHARINCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c); 138: } 139: #endif 140: 141: 142: 143: /************************************************* 144: * Match a back-reference * 145: *************************************************/ 146: 147: /* Normally, if a back reference hasn't been set, the length that is passed is 148: negative, so the match always fails. However, in JavaScript compatibility mode, 149: the length passed is zero. Note that in caseless UTF-8 mode, the number of 150: subject bytes matched may be different to the number of reference bytes. 
151: 152: Arguments: 153: offset index into the offset vector 154: eptr pointer into the subject 155: length length of reference to be matched (number of bytes) 156: md points to match data block 157: caseless TRUE if caseless 158: 159: Returns: >= 0 the number of subject bytes matched 160: -1 no match 161: -2 partial match; always given if at end subject 162: */ 163: 164: static int 165: match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md, 166: BOOL caseless) 167: { 168: PCRE_PUCHAR eptr_start = eptr; 169: register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset]; 170: #if defined SUPPORT_UTF && defined SUPPORT_UCP 171: BOOL utf = md->utf; 172: #endif 173: 174: #ifdef PCRE_DEBUG 175: if (eptr >= md->end_subject) 176: printf("matching subject <null>"); 177: else 178: { 179: printf("matching subject "); 180: pchars(eptr, length, TRUE, md); 181: } 182: printf(" against backref "); 183: pchars(p, length, FALSE, md); 184: printf("\n"); 185: #endif 186: 187: /* Always fail if reference not set (and not JavaScript compatible - in that 188: case the length is passed as zero). */ 189: 190: if (length < 0) return -1; 191: 192: /* Separate the caseless case for speed. In UTF-8 mode we can only do this 193: properly if Unicode properties are supported. Otherwise, we can check only 194: ASCII characters. */ 195: 196: if (caseless) 197: { 198: #if defined SUPPORT_UTF && defined SUPPORT_UCP 199: if (utf) 200: { 201: /* Match characters up to the end of the reference. NOTE: the number of 202: data units matched may differ, because in UTF-8 there are some characters 203: whose upper and lower case versions code have different numbers of bytes. 204: For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 205: (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a 206: sequence of two of the latter. 
It is important, therefore, to check the 207: length along the reference, not along the subject (earlier code did this 208: wrong). */ 209: 210: PCRE_PUCHAR endptr = p + length; 211: while (p < endptr) 212: { 213: pcre_uint32 c, d; 214: const ucd_record *ur; 215: if (eptr >= md->end_subject) return -2; /* Partial match */ 216: GETCHARINC(c, eptr); 217: GETCHARINC(d, p); 218: ur = GET_UCD(d); 219: if (c != d && c != d + ur->other_case) 220: { 221: const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset; 222: for (;;) 223: { 224: if (c < *pp) return -1; 225: if (c == *pp++) break; 226: } 227: } 228: } 229: } 230: else 231: #endif 232: 233: /* The same code works when not in UTF-8 mode and in UTF-8 mode when there 234: is no UCP support. */ 235: { 236: while (length-- > 0) 237: { 238: pcre_uint32 cc, cp; 239: if (eptr >= md->end_subject) return -2; /* Partial match */ 240: cc = RAWUCHARTEST(eptr); 241: cp = RAWUCHARTEST(p); 242: if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1; 243: p++; 244: eptr++; 245: } 246: } 247: } 248: 249: /* In the caseful case, we can just compare the bytes, whether or not we 250: are in UTF-8 mode. */ 251: 252: else 253: { 254: while (length-- > 0) 255: { 256: if (eptr >= md->end_subject) return -2; /* Partial match */ 257: if (RAWUCHARINCTEST(p) != RAWUCHARINCTEST(eptr)) return -1; 258: } 259: } 260: 261: return (int)(eptr - eptr_start); 262: } 263: 264: 265: 266: /*************************************************************************** 267: **************************************************************************** 268: RECURSION IN THE match() FUNCTION 269: 270: The match() function is highly recursive, though not every recursive call 271: increases the recursive depth. Nevertheless, some regular expressions can cause 272: it to recurse to a great depth. I was writing for Unix, so I just let it call 273: itself recursively. 
This uses the stack for saving everything that has to be
saved for a recursive call. On Unix, the stack can be large, and this works
fine.

It turns out that on some non-Unix-like systems there are problems with
programs that use a lot of stack. (This despite the fact that every last chip
has oodles of memory these days, and techniques for extending the stack have
been known for decades.) So....

There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
calls by keeping local variables that need to be preserved in blocks of memory
obtained from malloc() instead of on the stack. Macros are used to
achieve this so that the actual code doesn't look very different to what it
always used to.

The original heap-recursive code used longjmp(). However, it seems that this
can be very slow on some operating systems. Following a suggestion from Stan
Switzer, the use of longjmp() has been abolished, at the cost of having to
provide a unique number for each call to RMATCH. There is no way of generating
a sequence of numbers at compile time in C. I have given them names, to make
them stand out more clearly.

Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
tests. Furthermore, not using longjmp() means that local dynamic variables
don't have indeterminate values; this has meant that the frame size can be
reduced because the result can be "passed back" by straight setting of the
variable instead of being passed in the frame.
****************************************************************************
***************************************************************************/

/* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
below must be updated in sync.
*/ 306: 307: enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10, 308: RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20, 309: RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30, 310: RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40, 311: RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50, 312: RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60, 313: RM61, RM62, RM63, RM64, RM65, RM66, RM67 }; 314: 315: /* These versions of the macros use the stack, as normal. There are debugging 316: versions and production versions. Note that the "rw" argument of RMATCH isn't 317: actually used in this definition. */ 318: 319: #ifndef NO_RECURSE 320: #define REGISTER register 321: 322: #ifdef PCRE_DEBUG 323: #define RMATCH(ra,rb,rc,rd,re,rw) \ 324: { \ 325: printf("match() called in line %d\n", __LINE__); \ 326: rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \ 327: printf("to line %d\n", __LINE__); \ 328: } 329: #define RRETURN(ra) \ 330: { \ 331: printf("match() returned %d from line %d\n", ra, __LINE__); \ 332: return ra; \ 333: } 334: #else 335: #define RMATCH(ra,rb,rc,rd,re,rw) \ 336: rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1) 337: #define RRETURN(ra) return ra 338: #endif 339: 340: #else 341: 342: 343: /* These versions of the macros manage a private stack on the heap. Note that 344: the "rd" argument of RMATCH isn't actually used in this definition. It's the md 345: argument of match(), which never changes. 
*/ 346: 347: #define REGISTER 348: 349: #define RMATCH(ra,rb,rc,rd,re,rw)\ 350: {\ 351: heapframe *newframe = frame->Xnextframe;\ 352: if (newframe == NULL)\ 353: {\ 354: newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\ 355: if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\ 356: newframe->Xnextframe = NULL;\ 357: frame->Xnextframe = newframe;\ 358: }\ 359: frame->Xwhere = rw;\ 360: newframe->Xeptr = ra;\ 361: newframe->Xecode = rb;\ 362: newframe->Xmstart = mstart;\ 363: newframe->Xoffset_top = rc;\ 364: newframe->Xeptrb = re;\ 365: newframe->Xrdepth = frame->Xrdepth + 1;\ 366: newframe->Xprevframe = frame;\ 367: frame = newframe;\ 368: DPRINTF(("restarting from line %d\n", __LINE__));\ 369: goto HEAP_RECURSE;\ 370: L_##rw:\ 371: DPRINTF(("jumped back to line %d\n", __LINE__));\ 372: } 373: 374: #define RRETURN(ra)\ 375: {\ 376: heapframe *oldframe = frame;\ 377: frame = oldframe->Xprevframe;\ 378: if (frame != NULL)\ 379: {\ 380: rrc = ra;\ 381: goto HEAP_RETURN;\ 382: }\ 383: return ra;\ 384: } 385: 386: 387: /* Structure for remembering the local variables in a private frame */ 388: 389: typedef struct heapframe { 390: struct heapframe *Xprevframe; 391: struct heapframe *Xnextframe; 392: 393: /* Function arguments that may change */ 394: 395: PCRE_PUCHAR Xeptr; 396: const pcre_uchar *Xecode; 397: PCRE_PUCHAR Xmstart; 398: int Xoffset_top; 399: eptrblock *Xeptrb; 400: unsigned int Xrdepth; 401: 402: /* Function local variables */ 403: 404: PCRE_PUCHAR Xcallpat; 405: #ifdef SUPPORT_UTF 406: PCRE_PUCHAR Xcharptr; 407: #endif 408: PCRE_PUCHAR Xdata; 409: PCRE_PUCHAR Xnext; 410: PCRE_PUCHAR Xpp; 411: PCRE_PUCHAR Xprev; 412: PCRE_PUCHAR Xsaved_eptr; 413: 414: recursion_info Xnew_recursive; 415: 416: BOOL Xcur_is_word; 417: BOOL Xcondition; 418: BOOL Xprev_is_word; 419: 420: #ifdef SUPPORT_UCP 421: int Xprop_type; 422: unsigned int Xprop_value; 423: int Xprop_fail_result; 424: int Xoclength; 425: pcre_uchar Xocchars[6]; 426: #endif 427: 428: int 
Xcodelink; 429: int Xctype; 430: unsigned int Xfc; 431: int Xfi; 432: int Xlength; 433: int Xmax; 434: int Xmin; 435: unsigned int Xnumber; 436: int Xoffset; 437: unsigned int Xop; 438: pcre_int32 Xsave_capture_last; 439: int Xsave_offset1, Xsave_offset2, Xsave_offset3; 440: int Xstacksave[REC_STACK_SAVE_MAX]; 441: 442: eptrblock Xnewptrb; 443: 444: /* Where to jump back to */ 445: 446: int Xwhere; 447: 448: } heapframe; 449: 450: #endif 451: 452: 453: /*************************************************************************** 454: ***************************************************************************/ 455: 456: 457: 458: /************************************************* 459: * Match from current position * 460: *************************************************/ 461: 462: /* This function is called recursively in many circumstances. Whenever it 463: returns a negative (error) response, the outer incarnation must also return the 464: same response. */ 465: 466: /* These macros pack up tests that are used for partial matching, and which 467: appear several times in the code. We set the "hit end" flag if the pointer is 468: at the end of the subject and also past the start of the subject (i.e. 469: something has been matched). For hard partial matching, we then return 470: immediately. The second one is used when we already know we are past the end of 471: the subject. */ 472: 473: #define CHECK_PARTIAL()\ 474: if (md->partial != 0 && eptr >= md->end_subject && \ 475: eptr > md->start_used_ptr) \ 476: { \ 477: md->hitend = TRUE; \ 478: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \ 479: } 480: 481: #define SCHECK_PARTIAL()\ 482: if (md->partial != 0 && eptr > md->start_used_ptr) \ 483: { \ 484: md->hitend = TRUE; \ 485: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \ 486: } 487: 488: 489: /* Performance note: It might be tempting to extract commonly used fields from 490: the md structure (e.g. 
utf, end_subject) into individual variables to improve 491: performance. Tests using gcc on a SPARC disproved this; in the first case, it 492: made performance worse. 493: 494: Arguments: 495: eptr pointer to current character in subject 496: ecode pointer to current position in compiled code 497: mstart pointer to the current match start position (can be modified 498: by encountering \K) 499: offset_top current top pointer 500: md pointer to "static" info for the match 501: eptrb pointer to chain of blocks containing eptr at start of 502: brackets - for testing for empty matches 503: rdepth the recursion depth 504: 505: Returns: MATCH_MATCH if matched ) these values are >= 0 506: MATCH_NOMATCH if failed to match ) 507: a negative MATCH_xxx value for PRUNE, SKIP, etc 508: a negative PCRE_ERROR_xxx value if aborted by an error condition 509: (e.g. stopped by repeated call or recursion limit) 510: */ 511: 512: static int 513: match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode, 514: PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb, 515: unsigned int rdepth) 516: { 517: /* These variables do not need to be preserved over recursion in this function, 518: so they can be ordinary variables in all cases. Mark some of them with 519: "register" because they are used a lot in loops. */ 520: 521: register int rrc; /* Returns from recursive calls */ 522: register int i; /* Used for loops not involving calls to RMATCH() */ 523: register pcre_uint32 c; /* Character values not kept over RMATCH() calls */ 524: register BOOL utf; /* Local copy of UTF flag for speed */ 525: 526: BOOL minimize, possessive; /* Quantifier options */ 527: BOOL caseless; 528: int condcode; 529: 530: /* When recursion is not being used, all "local" variables that have to be 531: preserved over calls to RMATCH() are part of a "frame". 
We set up the top-level 532: frame on the stack here; subsequent instantiations are obtained from the heap 533: whenever RMATCH() does a "recursion". See the macro definitions above. Putting 534: the top-level on the stack rather than malloc-ing them all gives a performance 535: boost in many cases where there is not much "recursion". */ 536: 537: #ifdef NO_RECURSE 538: heapframe *frame = (heapframe *)md->match_frames_base; 539: 540: /* Copy in the original argument variables */ 541: 542: frame->Xeptr = eptr; 543: frame->Xecode = ecode; 544: frame->Xmstart = mstart; 545: frame->Xoffset_top = offset_top; 546: frame->Xeptrb = eptrb; 547: frame->Xrdepth = rdepth; 548: 549: /* This is where control jumps back to to effect "recursion" */ 550: 551: HEAP_RECURSE: 552: 553: /* Macros make the argument variables come from the current frame */ 554: 555: #define eptr frame->Xeptr 556: #define ecode frame->Xecode 557: #define mstart frame->Xmstart 558: #define offset_top frame->Xoffset_top 559: #define eptrb frame->Xeptrb 560: #define rdepth frame->Xrdepth 561: 562: /* Ditto for the local variables */ 563: 564: #ifdef SUPPORT_UTF 565: #define charptr frame->Xcharptr 566: #endif 567: #define callpat frame->Xcallpat 568: #define codelink frame->Xcodelink 569: #define data frame->Xdata 570: #define next frame->Xnext 571: #define pp frame->Xpp 572: #define prev frame->Xprev 573: #define saved_eptr frame->Xsaved_eptr 574: 575: #define new_recursive frame->Xnew_recursive 576: 577: #define cur_is_word frame->Xcur_is_word 578: #define condition frame->Xcondition 579: #define prev_is_word frame->Xprev_is_word 580: 581: #ifdef SUPPORT_UCP 582: #define prop_type frame->Xprop_type 583: #define prop_value frame->Xprop_value 584: #define prop_fail_result frame->Xprop_fail_result 585: #define oclength frame->Xoclength 586: #define occhars frame->Xocchars 587: #endif 588: 589: #define ctype frame->Xctype 590: #define fc frame->Xfc 591: #define fi frame->Xfi 592: #define length frame->Xlength 
593: #define max frame->Xmax 594: #define min frame->Xmin 595: #define number frame->Xnumber 596: #define offset frame->Xoffset 597: #define op frame->Xop 598: #define save_capture_last frame->Xsave_capture_last 599: #define save_offset1 frame->Xsave_offset1 600: #define save_offset2 frame->Xsave_offset2 601: #define save_offset3 frame->Xsave_offset3 602: #define stacksave frame->Xstacksave 603: 604: #define newptrb frame->Xnewptrb 605: 606: /* When recursion is being used, local variables are allocated on the stack and 607: get preserved during recursion in the normal way. In this environment, fi and 608: i, and fc and c, can be the same variables. */ 609: 610: #else /* NO_RECURSE not defined */ 611: #define fi i 612: #define fc c 613: 614: /* Many of the following variables are used only in small blocks of the code. 615: My normal style of coding would have declared them within each of those blocks. 616: However, in order to accommodate the version of this code that uses an external 617: "stack" implemented on the heap, it is easier to declare them all here, so the 618: declarations can be cut out in a block. The only declarations within blocks 619: below are for variables that do not have to be preserved over a recursive call 620: to RMATCH(). 
*/ 621: 622: #ifdef SUPPORT_UTF 623: const pcre_uchar *charptr; 624: #endif 625: const pcre_uchar *callpat; 626: const pcre_uchar *data; 627: const pcre_uchar *next; 628: PCRE_PUCHAR pp; 629: const pcre_uchar *prev; 630: PCRE_PUCHAR saved_eptr; 631: 632: recursion_info new_recursive; 633: 634: BOOL cur_is_word; 635: BOOL condition; 636: BOOL prev_is_word; 637: 638: #ifdef SUPPORT_UCP 639: int prop_type; 640: unsigned int prop_value; 641: int prop_fail_result; 642: int oclength; 643: pcre_uchar occhars[6]; 644: #endif 645: 646: int codelink; 647: int ctype; 648: int length; 649: int max; 650: int min; 651: unsigned int number; 652: int offset; 653: unsigned int op; 654: pcre_int32 save_capture_last; 655: int save_offset1, save_offset2, save_offset3; 656: int stacksave[REC_STACK_SAVE_MAX]; 657: 658: eptrblock newptrb; 659: 660: /* There is a special fudge for calling match() in a way that causes it to 661: measure the size of its basic stack frame when the stack is being used for 662: recursion. The second argument (ecode) being NULL triggers this behaviour. It 663: cannot normally ever be NULL. The return is the negated value of the frame 664: size. */ 665: 666: if (ecode == NULL) 667: { 668: if (rdepth == 0) 669: return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1); 670: else 671: { 672: int len = (char *)&rdepth - (char *)eptr; 673: return (len > 0)? -len : len; 674: } 675: } 676: #endif /* NO_RECURSE */ 677: 678: /* To save space on the stack and in the heap frame, I have doubled up on some 679: of the local variables that are used only in localised parts of the code, but 680: still need to be preserved over recursive calls of match(). These macros define 681: the alternative names that are used. 
*/ 682: 683: #define allow_zero cur_is_word 684: #define cbegroup condition 685: #define code_offset codelink 686: #define condassert condition 687: #define matched_once prev_is_word 688: #define foc number 689: #define save_mark data 690: 691: /* These statements are here to stop the compiler complaining about unitialized 692: variables. */ 693: 694: #ifdef SUPPORT_UCP 695: prop_value = 0; 696: prop_fail_result = 0; 697: #endif 698: 699: 700: /* This label is used for tail recursion, which is used in a few cases even 701: when NO_RECURSE is not defined, in order to reduce the amount of stack that is 702: used. Thanks to Ian Taylor for noticing this possibility and sending the 703: original patch. */ 704: 705: TAIL_RECURSE: 706: 707: /* OK, now we can get on with the real code of the function. Recursive calls 708: are specified by the macro RMATCH and RRETURN is used to return. When 709: NO_RECURSE is *not* defined, these just turn into a recursive call to match() 710: and a "return", respectively (possibly with some debugging if PCRE_DEBUG is 711: defined). However, RMATCH isn't like a function call because it's quite a 712: complicated macro. It has to be used in one particular way. This shouldn't, 713: however, impact performance when true recursion is being used. */ 714: 715: #ifdef SUPPORT_UTF 716: utf = md->utf; /* Local copy of the flag */ 717: #else 718: utf = FALSE; 719: #endif 720: 721: /* First check that we haven't called match() too many times, or that we 722: haven't exceeded the recursive call limit. */ 723: 724: if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT); 725: if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT); 726: 727: /* At the start of a group with an unlimited repeat that may match an empty 728: string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is 729: done this way to save having to use another function argument, which would take 730: up space on the stack. 
See also MATCH_CONDASSERT below. 731: 732: When MATCH_CBEGROUP is set, add the current subject pointer to the chain of 733: such remembered pointers, to be checked when we hit the closing ket, in order 734: to break infinite loops that match no characters. When match() is called in 735: other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must 736: NOT be used with tail recursion, because the memory block that is used is on 737: the stack, so a new one may be required for each match(). */ 738: 739: if (md->match_function_type == MATCH_CBEGROUP) 740: { 741: newptrb.epb_saved_eptr = eptr; 742: newptrb.epb_prev = eptrb; 743: eptrb = &newptrb; 744: md->match_function_type = 0; 745: } 746: 747: /* Now start processing the opcodes. */ 748: 749: for (;;) 750: { 751: minimize = possessive = FALSE; 752: op = *ecode; 753: 754: switch(op) 755: { 756: case OP_MARK: 757: md->nomatch_mark = ecode + 2; 758: md->mark = NULL; /* In case previously set by assertion */ 759: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md, 760: eptrb, RM55); 761: if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) && 762: md->mark == NULL) md->mark = ecode + 2; 763: 764: /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an 765: argument, and we must check whether that argument matches this MARK's 766: argument. It is passed back in md->start_match_ptr (an overloading of that 767: variable). If it does match, we reset that variable to the current subject 768: position and return MATCH_SKIP. Otherwise, pass back the return code 769: unaltered. 
*/ 770: 771: else if (rrc == MATCH_SKIP_ARG && 772: STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0) 773: { 774: md->start_match_ptr = eptr; 775: RRETURN(MATCH_SKIP); 776: } 777: RRETURN(rrc); 778: 779: case OP_FAIL: 780: RRETURN(MATCH_NOMATCH); 781: 782: case OP_COMMIT: 783: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 784: eptrb, RM52); 785: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 786: RRETURN(MATCH_COMMIT); 787: 788: case OP_PRUNE: 789: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 790: eptrb, RM51); 791: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 792: RRETURN(MATCH_PRUNE); 793: 794: case OP_PRUNE_ARG: 795: md->nomatch_mark = ecode + 2; 796: md->mark = NULL; /* In case previously set by assertion */ 797: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md, 798: eptrb, RM56); 799: if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) && 800: md->mark == NULL) md->mark = ecode + 2; 801: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 802: RRETURN(MATCH_PRUNE); 803: 804: case OP_SKIP: 805: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 806: eptrb, RM53); 807: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 808: md->start_match_ptr = eptr; /* Pass back current position */ 809: RRETURN(MATCH_SKIP); 810: 811: /* Note that, for Perl compatibility, SKIP with an argument does NOT set 812: nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was 813: not a matching mark, we have to re-run the match, ignoring the SKIP_ARG 814: that failed and any that precede it (either they also failed, or were not 815: triggered). To do this, we maintain a count of executed SKIP_ARGs. If a 816: SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg 817: set to the count of the one that failed. 
*/ 818: 819: case OP_SKIP_ARG: 820: md->skip_arg_count++; 821: if (md->skip_arg_count <= md->ignore_skip_arg) 822: { 823: ecode += PRIV(OP_lengths)[*ecode] + ecode[1]; 824: break; 825: } 826: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md, 827: eptrb, RM57); 828: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 829: 830: /* Pass back the current skip name by overloading md->start_match_ptr and 831: returning the special MATCH_SKIP_ARG return code. This will either be 832: caught by a matching MARK, or get to the top, where it causes a rematch 833: with md->ignore_skip_arg set to the value of md->skip_arg_count. */ 834: 835: md->start_match_ptr = ecode + 2; 836: RRETURN(MATCH_SKIP_ARG); 837: 838: /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that 839: the branch in which it occurs can be determined. Overload the start of 840: match pointer to do this. */ 841: 842: case OP_THEN: 843: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 844: eptrb, RM54); 845: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 846: md->start_match_ptr = ecode; 847: RRETURN(MATCH_THEN); 848: 849: case OP_THEN_ARG: 850: md->nomatch_mark = ecode + 2; 851: md->mark = NULL; /* In case previously set by assertion */ 852: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, 853: md, eptrb, RM58); 854: if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) && 855: md->mark == NULL) md->mark = ecode + 2; 856: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 857: md->start_match_ptr = ecode; 858: RRETURN(MATCH_THEN); 859: 860: /* Handle an atomic group that does not contain any capturing parentheses. 861: This can be handled like an assertion. Prior to 8.13, all atomic groups 862: were handled this way. In 8.13, the code was changed as below for ONCE, so 863: that backups pass through the group and thereby reset captured values. 
864: However, this uses a lot more stack, so in 8.20, atomic groups that do not 865: contain any captures generate OP_ONCE_NC, which can be handled in the old, 866: less stack intensive way. 867: 868: Check the alternative branches in turn - the matching won't pass the KET 869: for this kind of subpattern. If any one branch matches, we carry on as at 870: the end of a normal bracket, leaving the subject pointer, but resetting 871: the start-of-match value in case it was changed by \K. */ 872: 873: case OP_ONCE_NC: 874: prev = ecode; 875: saved_eptr = eptr; 876: save_mark = md->mark; 877: do 878: { 879: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64); 880: if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */ 881: { 882: mstart = md->start_match_ptr; 883: break; 884: } 885: if (rrc == MATCH_THEN) 886: { 887: next = ecode + GET(ecode,1); 888: if (md->start_match_ptr < next && 889: (*ecode == OP_ALT || *next == OP_ALT)) 890: rrc = MATCH_NOMATCH; 891: } 892: 893: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 894: ecode += GET(ecode,1); 895: md->mark = save_mark; 896: } 897: while (*ecode == OP_ALT); 898: 899: /* If hit the end of the group (which could be repeated), fail */ 900: 901: if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH); 902: 903: /* Continue as from after the group, updating the offsets high water 904: mark, since extracts may have been taken. */ 905: 906: do ecode += GET(ecode, 1); while (*ecode == OP_ALT); 907: 908: offset_top = md->end_offset_top; 909: eptr = md->end_match_ptr; 910: 911: /* For a non-repeating ket, just continue at this level. This also 912: happens for a repeating ket if no characters were matched in the group. 913: This is the forcible breaking of infinite loops as implemented in Perl 914: 5.005. 
*/ 915: 916: if (*ecode == OP_KET || eptr == saved_eptr) 917: { 918: ecode += 1+LINK_SIZE; 919: break; 920: } 921: 922: /* The repeating kets try the rest of the pattern or restart from the 923: preceding bracket, in the appropriate order. The second "call" of match() 924: uses tail recursion, to avoid using another stack frame. */ 925: 926: if (*ecode == OP_KETRMIN) 927: { 928: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65); 929: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 930: ecode = prev; 931: goto TAIL_RECURSE; 932: } 933: else /* OP_KETRMAX */ 934: { 935: RMATCH(eptr, prev, offset_top, md, eptrb, RM66); 936: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 937: ecode += 1 + LINK_SIZE; 938: goto TAIL_RECURSE; 939: } 940: /* Control never gets here */ 941: 942: /* Handle a capturing bracket, other than those that are possessive with an 943: unlimited repeat. If there is space in the offset vector, save the current 944: subject position in the working slot at the top of the vector. We mustn't 945: change the current values of the data slot, because they may be set from a 946: previous iteration of this group, and be referred to by a reference inside 947: the group. A failure to match might occur after the group has succeeded, 948: if something later on doesn't match. For this reason, we need to restore 949: the working value and also the values of the final offsets, in case they 950: were set by a previous iteration of the same bracket. 951: 952: If there isn't enough space in the offset vector, treat this as if it were 953: a non-capturing bracket. Don't worry about setting the flag for the error 954: case here; that is handled in the code for KET. 
*/ 955: 956: case OP_CBRA: 957: case OP_SCBRA: 958: number = GET2(ecode, 1+LINK_SIZE); 959: offset = number << 1; 960: 961: #ifdef PCRE_DEBUG 962: printf("start bracket %d\n", number); 963: printf("subject="); 964: pchars(eptr, 16, TRUE, md); 965: printf("\n"); 966: #endif 967: 968: if (offset < md->offset_max) 969: { 970: save_offset1 = md->offset_vector[offset]; 971: save_offset2 = md->offset_vector[offset+1]; 972: save_offset3 = md->offset_vector[md->offset_end - number]; 973: save_capture_last = md->capture_last; 974: save_mark = md->mark; 975: 976: DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); 977: md->offset_vector[md->offset_end - number] = 978: (int)(eptr - md->start_subject); 979: 980: for (;;) 981: { 982: if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP; 983: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 984: eptrb, RM1); 985: if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */ 986: 987: /* If we backed up to a THEN, check whether it is within the current 988: branch by comparing the address of the THEN that is passed back with 989: the end of the branch. If it is within the current branch, and the 990: branch is one of two or more alternatives (it either starts or ends 991: with OP_ALT), we have reached the limit of THEN's action, so convert 992: the return code to NOMATCH, which will cause normal backtracking to 993: happen from now on. Otherwise, THEN is passed back to an outer 994: alternative. This implements Perl's treatment of parenthesized groups, 995: where a group not containing | does not affect the current alternative, 996: that is, (X) is NOT the same as (X|(*F)). */ 997: 998: if (rrc == MATCH_THEN) 999: { 1000: next = ecode + GET(ecode,1); 1001: if (md->start_match_ptr < next && 1002: (*ecode == OP_ALT || *next == OP_ALT)) 1003: rrc = MATCH_NOMATCH; 1004: } 1005: 1006: /* Anything other than NOMATCH is passed back. 
*/ 1007: 1008: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1009: md->capture_last = save_capture_last; 1010: ecode += GET(ecode, 1); 1011: md->mark = save_mark; 1012: if (*ecode != OP_ALT) break; 1013: } 1014: 1015: DPRINTF(("bracket %d failed\n", number)); 1016: md->offset_vector[offset] = save_offset1; 1017: md->offset_vector[offset+1] = save_offset2; 1018: md->offset_vector[md->offset_end - number] = save_offset3; 1019: 1020: /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */ 1021: 1022: RRETURN(rrc); 1023: } 1024: 1025: /* FALL THROUGH ... Insufficient room for saving captured contents. Treat 1026: as a non-capturing bracket. */ 1027: 1028: /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1029: /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1030: 1031: DPRINTF(("insufficient capture room: treat as non-capturing\n")); 1032: 1033: /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1034: /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1035: 1036: /* Non-capturing or atomic group, except for possessive with unlimited 1037: repeat and ONCE group with no captures. Loop for all the alternatives. 1038: 1039: When we get to the final alternative within the brackets, we used to return 1040: the result of a recursive call to match() whatever happened so it was 1041: possible to reduce stack usage by turning this into a tail recursion, 1042: except in the case of a possibly empty group. However, now that there is 1043: the possibility of (*THEN) occurring in the final alternative, this 1044: optimization is no longer always possible. 1045: 1046: We can optimize if we know there are no (*THEN)s in the pattern; at present 1047: this is the best that can be done. 1048: 1049: MATCH_ONCE is returned when the end of an atomic group is successfully 1050: reached, but subsequent matching fails. It passes back up the tree (causing 1051: captured values to be reset) until the original atomic group level is 1052: reached. This is tested by comparing md->once_target with the start of the 1053: group.
At this point, the return is converted into MATCH_NOMATCH so that 1054: previous backup points can be taken. */ 1055: 1056: case OP_ONCE: 1057: case OP_BRA: 1058: case OP_SBRA: 1059: DPRINTF(("start non-capturing bracket\n")); 1060: 1061: for (;;) 1062: { 1063: if (op >= OP_SBRA || op == OP_ONCE) 1064: md->match_function_type = MATCH_CBEGROUP; 1065: 1066: /* If this is not a possibly empty group, and there are no (*THEN)s in 1067: the pattern, and this is the final alternative, optimize as described 1068: above. */ 1069: 1070: else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT) 1071: { 1072: ecode += PRIV(OP_lengths)[*ecode]; 1073: goto TAIL_RECURSE; 1074: } 1075: 1076: /* In all other cases, we have to make another call to match(). */ 1077: 1078: save_mark = md->mark; 1079: save_capture_last = md->capture_last; 1080: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, 1081: RM2); 1082: 1083: /* See comment in the code for capturing groups above about handling 1084: THEN. */ 1085: 1086: if (rrc == MATCH_THEN) 1087: { 1088: next = ecode + GET(ecode,1); 1089: if (md->start_match_ptr < next && 1090: (*ecode == OP_ALT || *next == OP_ALT)) 1091: rrc = MATCH_NOMATCH; 1092: } 1093: 1094: if (rrc != MATCH_NOMATCH) 1095: { 1096: if (rrc == MATCH_ONCE) 1097: { 1098: const pcre_uchar *scode = ecode; 1099: if (*scode != OP_ONCE) /* If not at start, find it */ 1100: { 1101: while (*scode == OP_ALT) scode += GET(scode, 1); 1102: scode -= GET(scode, 1); 1103: } 1104: if (md->once_target == scode) rrc = MATCH_NOMATCH; 1105: } 1106: RRETURN(rrc); 1107: } 1108: ecode += GET(ecode, 1); 1109: md->mark = save_mark; 1110: if (*ecode != OP_ALT) break; 1111: md->capture_last = save_capture_last; 1112: } 1113: 1114: RRETURN(MATCH_NOMATCH); 1115: 1116: /* Handle possessive capturing brackets with an unlimited repeat. We come 1117: here from BRAZERO with allow_zero set TRUE. The offset_vector values are 1118: handled similarly to the normal case above. 
However, the matching is 1119: different. The end of these brackets will always be OP_KETRPOS, which 1120: returns MATCH_KETRPOS without going further in the pattern. By this means 1121: we can handle the group by iteration rather than recursion, thereby 1122: reducing the amount of stack needed. */ 1123: 1124: case OP_CBRAPOS: 1125: case OP_SCBRAPOS: 1126: allow_zero = FALSE; 1127: 1128: POSSESSIVE_CAPTURE: 1129: number = GET2(ecode, 1+LINK_SIZE); 1130: offset = number << 1; 1131: 1132: #ifdef PCRE_DEBUG 1133: printf("start possessive bracket %d\n", number); 1134: printf("subject="); 1135: pchars(eptr, 16, TRUE, md); 1136: printf("\n"); 1137: #endif 1138: 1139: if (offset < md->offset_max) 1140: { 1141: matched_once = FALSE; 1142: code_offset = (int)(ecode - md->start_code); 1143: 1144: save_offset1 = md->offset_vector[offset]; 1145: save_offset2 = md->offset_vector[offset+1]; 1146: save_offset3 = md->offset_vector[md->offset_end - number]; 1147: save_capture_last = md->capture_last; 1148: 1149: DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); 1150: 1151: /* Each time round the loop, save the current subject position for use 1152: when the group matches. For MATCH_MATCH, the group has matched, so we 1153: restart it with a new subject starting position, remembering that we had 1154: at least one match. For MATCH_NOMATCH, carry on with the alternatives, as 1155: usual. If we haven't matched any alternatives in any iteration, check to 1156: see if a previous iteration matched. If so, the group has matched; 1157: continue from afterwards. Otherwise it has failed; restore the previous 1158: capture values before returning NOMATCH. 
*/ 1159: 1160: for (;;) 1161: { 1162: md->offset_vector[md->offset_end - number] = 1163: (int)(eptr - md->start_subject); 1164: if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP; 1165: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 1166: eptrb, RM63); 1167: if (rrc == MATCH_KETRPOS) 1168: { 1169: offset_top = md->end_offset_top; 1170: eptr = md->end_match_ptr; 1171: ecode = md->start_code + code_offset; 1172: save_capture_last = md->capture_last; 1173: matched_once = TRUE; 1174: mstart = md->start_match_ptr; /* In case \K changed it */ 1175: continue; 1176: } 1177: 1178: /* See comment in the code for capturing groups above about handling 1179: THEN. */ 1180: 1181: if (rrc == MATCH_THEN) 1182: { 1183: next = ecode + GET(ecode,1); 1184: if (md->start_match_ptr < next && 1185: (*ecode == OP_ALT || *next == OP_ALT)) 1186: rrc = MATCH_NOMATCH; 1187: } 1188: 1189: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1190: md->capture_last = save_capture_last; 1191: ecode += GET(ecode, 1); 1192: if (*ecode != OP_ALT) break; 1193: } 1194: 1195: if (!matched_once) 1196: { 1197: md->offset_vector[offset] = save_offset1; 1198: md->offset_vector[offset+1] = save_offset2; 1199: md->offset_vector[md->offset_end - number] = save_offset3; 1200: } 1201: 1202: if (allow_zero || matched_once) 1203: { 1204: ecode += 1 + LINK_SIZE; 1205: break; 1206: } 1207: 1208: RRETURN(MATCH_NOMATCH); 1209: } 1210: 1211: /* FALL THROUGH ... Insufficient room for saving captured contents. Treat 1212: as a non-capturing bracket. */ 1213: 1214: /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1215: /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1216: 1217: DPRINTF(("insufficient capture room: treat as non-capturing\n")); 1218: 1219: /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1220: /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1221: 1222: /* Non-capturing possessive bracket with unlimited repeat. We come here 1223: from BRAZERO with allow_zero = TRUE. The code is similar to the above, 1224: without the capturing complication. 
It is written out separately for speed 1225: and cleanliness. */ 1226: 1227: case OP_BRAPOS: 1228: case OP_SBRAPOS: 1229: allow_zero = FALSE; 1230: 1231: POSSESSIVE_NON_CAPTURE: 1232: matched_once = FALSE; 1233: code_offset = (int)(ecode - md->start_code); 1234: save_capture_last = md->capture_last; 1235: 1236: for (;;) 1237: { 1238: if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP; 1239: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 1240: eptrb, RM48); 1241: if (rrc == MATCH_KETRPOS) 1242: { 1243: offset_top = md->end_offset_top; 1244: eptr = md->end_match_ptr; 1245: ecode = md->start_code + code_offset; 1246: matched_once = TRUE; 1247: mstart = md->start_match_ptr; /* In case \K reset it */ 1248: continue; 1249: } 1250: 1251: /* See comment in the code for capturing groups above about handling 1252: THEN. */ 1253: 1254: if (rrc == MATCH_THEN) 1255: { 1256: next = ecode + GET(ecode,1); 1257: if (md->start_match_ptr < next && 1258: (*ecode == OP_ALT || *next == OP_ALT)) 1259: rrc = MATCH_NOMATCH; 1260: } 1261: 1262: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1263: ecode += GET(ecode, 1); 1264: if (*ecode != OP_ALT) break; 1265: md->capture_last = save_capture_last; 1266: } 1267: 1268: if (matched_once || allow_zero) 1269: { 1270: ecode += 1 + LINK_SIZE; 1271: break; 1272: } 1273: RRETURN(MATCH_NOMATCH); 1274: 1275: /* Control never reaches here. */ 1276: 1277: /* Conditional group: compilation checked that there are no more than two 1278: branches. If the condition is false, skipping the first branch takes us 1279: past the end of the item if there is only one branch, but that's exactly 1280: what we want. */ 1281: 1282: case OP_COND: 1283: case OP_SCOND: 1284: 1285: /* The variable codelink will be added to ecode when the condition is 1286: false, to get to the second branch. Setting it to the offset to the ALT 1287: or KET, then incrementing ecode achieves this effect. We now have ecode 1288: pointing to the condition or callout. 
*/ 1289: 1290: codelink = GET(ecode, 1); /* Offset to the second branch */ 1291: ecode += 1 + LINK_SIZE; /* From this opcode */ 1292: 1293: /* Because of the way auto-callout works during compile, a callout item is 1294: inserted between OP_COND and an assertion condition. */ 1295: 1296: if (*ecode == OP_CALLOUT) 1297: { 1298: if (PUBL(callout) != NULL) 1299: { 1300: PUBL(callout_block) cb; 1301: cb.version = 2; /* Version 1 of the callout block */ 1302: cb.callout_number = ecode[1]; 1303: cb.offset_vector = md->offset_vector; 1304: #if defined COMPILE_PCRE8 1305: cb.subject = (PCRE_SPTR)md->start_subject; 1306: #elif defined COMPILE_PCRE16 1307: cb.subject = (PCRE_SPTR16)md->start_subject; 1308: #elif defined COMPILE_PCRE32 1309: cb.subject = (PCRE_SPTR32)md->start_subject; 1310: #endif 1311: cb.subject_length = (int)(md->end_subject - md->start_subject); 1312: cb.start_match = (int)(mstart - md->start_subject); 1313: cb.current_position = (int)(eptr - md->start_subject); 1314: cb.pattern_position = GET(ecode, 2); 1315: cb.next_item_length = GET(ecode, 2 + LINK_SIZE); 1316: cb.capture_top = offset_top/2; 1317: cb.capture_last = md->capture_last & CAPLMASK; 1318: /* Internal change requires this for API compatibility. */ 1319: if (cb.capture_last == 0) cb.capture_last = -1; 1320: cb.callout_data = md->callout_data; 1321: cb.mark = md->nomatch_mark; 1322: if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH); 1323: if (rrc < 0) RRETURN(rrc); 1324: } 1325: 1326: /* Advance ecode past the callout, so it now points to the condition. We 1327: must adjust codelink so that the value of ecode+codelink is unchanged. 
*/ 1328: 1329: ecode += PRIV(OP_lengths)[OP_CALLOUT]; 1330: codelink -= PRIV(OP_lengths)[OP_CALLOUT]; 1331: } 1332: 1333: /* Test the various possible conditions */ 1334: 1335: condition = FALSE; 1336: switch(condcode = *ecode) 1337: { 1338: case OP_RREF: /* Numbered group recursion test */ 1339: if (md->recursive != NULL) /* Not recursing => FALSE */ 1340: { 1341: unsigned int recno = GET2(ecode, 1); /* Recursion group number*/ 1342: condition = (recno == RREF_ANY || recno == md->recursive->group_num); 1343: } 1344: break; 1345: 1346: case OP_DNRREF: /* Duplicate named group recursion test */ 1347: if (md->recursive != NULL) 1348: { 1349: int count = GET2(ecode, 1 + IMM2_SIZE); 1350: pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size; 1351: while (count-- > 0) 1352: { 1353: unsigned int recno = GET2(slot, 0); 1354: condition = recno == md->recursive->group_num; 1355: if (condition) break; 1356: slot += md->name_entry_size; 1357: } 1358: } 1359: break; 1360: 1361: case OP_CREF: /* Numbered group used test */ 1362: offset = GET2(ecode, 1) << 1; /* Doubled ref number */ 1363: condition = offset < offset_top && md->offset_vector[offset] >= 0; 1364: break; 1365: 1366: case OP_DNCREF: /* Duplicate named group used test */ 1367: { 1368: int count = GET2(ecode, 1 + IMM2_SIZE); 1369: pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size; 1370: while (count-- > 0) 1371: { 1372: offset = GET2(slot, 0) << 1; 1373: condition = offset < offset_top && md->offset_vector[offset] >= 0; 1374: if (condition) break; 1375: slot += md->name_entry_size; 1376: } 1377: } 1378: break; 1379: 1380: case OP_DEF: /* DEFINE - always false */ 1381: break; 1382: 1383: /* The condition is an assertion. Call match() to evaluate it - setting 1384: md->match_function_type to MATCH_CONDASSERT causes it to stop at the end 1385: of an assertion. 
*/ 1386: 1387: default: 1388: md->match_function_type = MATCH_CONDASSERT; 1389: RMATCH(eptr, ecode, offset_top, md, NULL, RM3); 1390: if (rrc == MATCH_MATCH) 1391: { 1392: if (md->end_offset_top > offset_top) 1393: offset_top = md->end_offset_top; /* Captures may have happened */ 1394: condition = TRUE; 1395: 1396: /* Advance ecode past the assertion to the start of the first branch, 1397: but adjust it so that the general choosing code below works. */ 1398: 1399: ecode += GET(ecode, 1); 1400: while (*ecode == OP_ALT) ecode += GET(ecode, 1); 1401: ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode]; 1402: } 1403: 1404: /* PCRE doesn't allow the effect of (*THEN) to escape beyond an 1405: assertion; it is therefore treated as NOMATCH. Any other return is an 1406: error. */ 1407: 1408: else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) 1409: { 1410: RRETURN(rrc); /* Need braces because of following else */ 1411: } 1412: break; 1413: } 1414: 1415: /* Choose branch according to the condition */ 1416: 1417: ecode += condition? PRIV(OP_lengths)[condcode] : codelink; 1418: 1419: /* We are now at the branch that is to be obeyed. As there is only one, we 1420: can use tail recursion to avoid using another stack frame, except when 1421: there is unlimited repeat of a possibly empty group. In the latter case, a 1422: recursive call to match() is always required, unless the second alternative 1423: doesn't exist, in which case we can just plough on. Note that, for 1424: compatibility with Perl, the | in a conditional group is NOT treated as 1425: creating two alternatives. If a THEN is encountered in the branch, it 1426: propagates out to the enclosing alternative (unless nested in a deeper set 1427: of alternatives, of course). 
*/ 1428: 1429: if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT) 1430: { 1431: if (op != OP_SCOND) 1432: { 1433: goto TAIL_RECURSE; 1434: } 1435: 1436: md->match_function_type = MATCH_CBEGROUP; 1437: RMATCH(eptr, ecode, offset_top, md, eptrb, RM49); 1438: RRETURN(rrc); 1439: } 1440: 1441: /* Condition false & no alternative; continue after the group. */ 1442: 1443: else 1444: { 1445: } 1446: break; 1447: 1448: 1449: /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, 1450: to close any currently open capturing brackets. */ 1451: 1452: case OP_CLOSE: 1453: number = GET2(ecode, 1); /* Must be less than 65536 */ 1454: offset = number << 1; 1455: 1456: #ifdef PCRE_DEBUG 1457: printf("end bracket %d at *ACCEPT", number); 1458: printf("\n"); 1459: #endif 1460: 1461: md->capture_last = (md->capture_last & OVFLMASK) | number; 1462: if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else 1463: { 1464: md->offset_vector[offset] = 1465: md->offset_vector[md->offset_end - number]; 1466: md->offset_vector[offset+1] = (int)(eptr - md->start_subject); 1467: if (offset_top <= offset) offset_top = offset + 2; 1468: } 1469: ecode += 1 + IMM2_SIZE; 1470: break; 1471: 1472: 1473: /* End of the pattern, either real or forced. */ 1474: 1475: case OP_END: 1476: case OP_ACCEPT: 1477: case OP_ASSERT_ACCEPT: 1478: 1479: /* If we have matched an empty string, fail if not in an assertion and not 1480: in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART 1481: is set and we have matched at the start of the subject. In both cases, 1482: backtracking will then try other alternatives, if any. */ 1483: 1484: if (eptr == mstart && op != OP_ASSERT_ACCEPT && 1485: md->recursive == NULL && 1486: (md->notempty || 1487: (md->notempty_atstart && 1488: mstart == md->start_subject + md->start_offset))) 1489: RRETURN(MATCH_NOMATCH); 1490: 1491: /* Otherwise, we have a match. 
*/ 1492: 1493: md->end_match_ptr = eptr; /* Record where we ended */ 1494: md->end_offset_top = offset_top; /* and how many extracts were taken */ 1495: md->start_match_ptr = mstart; /* and the start (\K can modify) */ 1496: 1497: /* For some reason, the macros don't work properly if an expression is 1498: given as the argument to RRETURN when the heap is in use. */ 1499: 1500: rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT; 1501: RRETURN(rrc); 1502: 1503: /* Assertion brackets. Check the alternative branches in turn - the 1504: matching won't pass the KET for an assertion. If any one branch matches, 1505: the assertion is true. Lookbehind assertions have an OP_REVERSE item at the 1506: start of each branch to move the current point backwards, so the code at 1507: this level is identical to the lookahead case. When the assertion is part 1508: of a condition, we want to return immediately afterwards. The caller of 1509: this incarnation of the match() function will have set MATCH_CONDASSERT in 1510: md->match_function type, and one of these opcodes will be the first opcode 1511: that is processed. We use a local variable that is preserved over calls to 1512: match() to remember this case. */ 1513: 1514: case OP_ASSERT: 1515: case OP_ASSERTBACK: 1516: save_mark = md->mark; 1517: if (md->match_function_type == MATCH_CONDASSERT) 1518: { 1519: condassert = TRUE; 1520: md->match_function_type = 0; 1521: } 1522: else condassert = FALSE; 1523: 1524: /* Loop for each branch */ 1525: 1526: do 1527: { 1528: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4); 1529: 1530: /* A match means that the assertion is true; break out of the loop 1531: that matches its alternatives. */ 1532: 1533: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) 1534: { 1535: mstart = md->start_match_ptr; /* In case \K reset it */ 1536: break; 1537: } 1538: 1539: /* If not matched, restore the previous mark setting. 
*/ 1540: 1541: md->mark = save_mark; 1542: 1543: /* See comment in the code for capturing groups above about handling 1544: THEN. */ 1545: 1546: if (rrc == MATCH_THEN) 1547: { 1548: next = ecode + GET(ecode,1); 1549: if (md->start_match_ptr < next && 1550: (*ecode == OP_ALT || *next == OP_ALT)) 1551: rrc = MATCH_NOMATCH; 1552: } 1553: 1554: /* Anything other than NOMATCH causes the entire assertion to fail, 1555: passing back the return code. This includes COMMIT, SKIP, PRUNE and an 1556: uncaptured THEN, which means they take their normal effect. This 1557: consistent approach does not always have exactly the same effect as in 1558: Perl. */ 1559: 1560: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1561: ecode += GET(ecode, 1); 1562: } 1563: while (*ecode == OP_ALT); /* Continue for next alternative */ 1564: 1565: /* If we have tried all the alternative branches, the assertion has 1566: failed. If not, we broke out after a match. */ 1567: 1568: if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH); 1569: 1570: /* If checking an assertion for a condition, return MATCH_MATCH. */ 1571: 1572: if (condassert) RRETURN(MATCH_MATCH); 1573: 1574: /* Continue from after a successful assertion, updating the offsets high 1575: water mark, since extracts may have been taken during the assertion. */ 1576: 1577: do ecode += GET(ecode,1); while (*ecode == OP_ALT); 1578: ecode += 1 + LINK_SIZE; 1579: offset_top = md->end_offset_top; 1580: continue; 1581: 1582: /* Negative assertion: all branches must fail to match for the assertion to 1583: succeed. */ 1584: 1585: case OP_ASSERT_NOT: 1586: case OP_ASSERTBACK_NOT: 1587: save_mark = md->mark; 1588: if (md->match_function_type == MATCH_CONDASSERT) 1589: { 1590: condassert = TRUE; 1591: md->match_function_type = 0; 1592: } 1593: else condassert = FALSE; 1594: 1595: /* Loop for each alternative branch. 
*/ 1596: 1597: do 1598: { 1599: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5); 1600: md->mark = save_mark; /* Always restore the mark setting */ 1601: 1602: switch(rrc) 1603: { 1604: case MATCH_MATCH: /* A successful match means */ 1605: case MATCH_ACCEPT: /* the assertion has failed. */ 1606: RRETURN(MATCH_NOMATCH); 1607: 1608: case MATCH_NOMATCH: /* Carry on with next branch */ 1609: break; 1610: 1611: /* See comment in the code for capturing groups above about handling 1612: THEN. */ 1613: 1614: case MATCH_THEN: 1615: next = ecode + GET(ecode,1); 1616: if (md->start_match_ptr < next && 1617: (*ecode == OP_ALT || *next == OP_ALT)) 1618: { 1619: rrc = MATCH_NOMATCH; 1620: break; 1621: } 1622: /* Otherwise fall through. */ 1623: 1624: /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole 1625: assertion to fail to match, without considering any more alternatives. 1626: Failing to match means the assertion is true. This is a consistent 1627: approach, but does not always have the same effect as in Perl. */ 1628: 1629: case MATCH_COMMIT: 1630: case MATCH_SKIP: 1631: case MATCH_SKIP_ARG: 1632: case MATCH_PRUNE: 1633: do ecode += GET(ecode,1); while (*ecode == OP_ALT); 1634: goto NEG_ASSERT_TRUE; /* Break out of alternation loop */ 1635: 1636: /* Anything else is an error */ 1637: 1638: default: 1639: RRETURN(rrc); 1640: } 1641: 1642: /* Continue with next branch */ 1643: 1644: ecode += GET(ecode,1); 1645: } 1646: while (*ecode == OP_ALT); 1647: 1648: /* All branches in the assertion failed to match. */ 1649: 1650: NEG_ASSERT_TRUE: 1651: if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */ 1652: ecode += 1 + LINK_SIZE; /* Continue with current branch */ 1653: continue; 1654: 1655: /* Move the subject pointer back. This occurs only at the start of 1656: each branch of a lookbehind assertion. If we are too close to the start to 1657: move back, this match function fails. 
When working with UTF-8 we move 1658: back a number of characters, not bytes. */ 1659: 1660: case OP_REVERSE: 1661: #ifdef SUPPORT_UTF 1662: if (utf) 1663: { 1664: i = GET(ecode, 1); 1665: while (i-- > 0) 1666: { 1667: eptr--; 1668: if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); 1669: BACKCHAR(eptr); 1670: } 1671: } 1672: else 1673: #endif 1674: 1675: /* No UTF-8 support, or not in UTF-8 mode: count is byte count */ 1676: 1677: { 1678: eptr -= GET(ecode, 1); 1679: if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); 1680: } 1681: 1682: /* Save the earliest consulted character, then skip to next op code */ 1683: 1684: if (eptr < md->start_used_ptr) md->start_used_ptr = eptr; 1685: ecode += 1 + LINK_SIZE; 1686: break; 1687: 1688: /* The callout item calls an external function, if one is provided, passing 1689: details of the match so far. This is mainly for debugging, though the 1690: function is able to force a failure. */ 1691: 1692: case OP_CALLOUT: 1693: if (PUBL(callout) != NULL) 1694: { 1695: PUBL(callout_block) cb; 1696: cb.version = 2; /* Version 1 of the callout block */ 1697: cb.callout_number = ecode[1]; 1698: cb.offset_vector = md->offset_vector; 1699: #if defined COMPILE_PCRE8 1700: cb.subject = (PCRE_SPTR)md->start_subject; 1701: #elif defined COMPILE_PCRE16 1702: cb.subject = (PCRE_SPTR16)md->start_subject; 1703: #elif defined COMPILE_PCRE32 1704: cb.subject = (PCRE_SPTR32)md->start_subject; 1705: #endif 1706: cb.subject_length = (int)(md->end_subject - md->start_subject); 1707: cb.start_match = (int)(mstart - md->start_subject); 1708: cb.current_position = (int)(eptr - md->start_subject); 1709: cb.pattern_position = GET(ecode, 2); 1710: cb.next_item_length = GET(ecode, 2 + LINK_SIZE); 1711: cb.capture_top = offset_top/2; 1712: cb.capture_last = md->capture_last & CAPLMASK; 1713: /* Internal change requires this for API compatibility. 
*/ 1714: if (cb.capture_last == 0) cb.capture_last = -1; 1715: cb.callout_data = md->callout_data; 1716: cb.mark = md->nomatch_mark; 1717: if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH); 1718: if (rrc < 0) RRETURN(rrc); 1719: } 1720: ecode += 2 + 2*LINK_SIZE; 1721: break; 1722: 1723: /* Recursion either matches the current regex, or some subexpression. The 1724: offset data is the offset to the starting bracket from the start of the 1725: whole pattern. (This is so that it works from duplicated subpatterns.) 1726: 1727: The state of the capturing groups is preserved over recursion, and 1728: re-instated afterwards. We don't know how many are started and not yet 1729: finished (offset_top records the completed total) so we just have to save 1730: all the potential data. There may be up to 65535 such values, which is too 1731: large to put on the stack, but using malloc for small numbers seems 1732: expensive. As a compromise, the stack is used when there are no more than 1733: REC_STACK_SAVE_MAX values to store; otherwise malloc is used. 1734: 1735: There are also other values that have to be saved. We use a chained 1736: sequence of blocks that actually live on the stack. Thanks to Robin Houston 1737: for the original version of this logic. It has, however, been hacked around 1738: a lot, so he is not to blame for the current way it works. */ 1739: 1740: case OP_RECURSE: 1741: { 1742: recursion_info *ri; 1743: unsigned int recno; 1744: 1745: callpat = md->start_code + GET(ecode, 1); 1746: recno = (callpat == md->start_code)? 0 : 1747: GET2(callpat, 1 + LINK_SIZE); 1748: 1749: /* Check for repeating a recursion without advancing the subject pointer. 1750: This should catch convoluted mutual recursions. (Some simple cases are 1751: caught at compile time.) 
*/ 1752: 1753: for (ri = md->recursive; ri != NULL; ri = ri->prevrec) 1754: if (recno == ri->group_num && eptr == ri->subject_position) 1755: RRETURN(PCRE_ERROR_RECURSELOOP); 1756: 1757: /* Add to "recursing stack" */ 1758: 1759: new_recursive.group_num = recno; 1760: new_recursive.saved_capture_last = md->capture_last; 1761: new_recursive.subject_position = eptr; 1762: new_recursive.prevrec = md->recursive; 1763: md->recursive = &new_recursive; 1764: 1765: /* Where to continue from afterwards */ 1766: 1767: ecode += 1 + LINK_SIZE; 1768: 1769: /* Now save the offset data */ 1770: 1771: new_recursive.saved_max = md->offset_end; 1772: if (new_recursive.saved_max <= REC_STACK_SAVE_MAX) 1773: new_recursive.offset_save = stacksave; 1774: else 1775: { 1776: new_recursive.offset_save = 1777: (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int)); 1778: if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY); 1779: } 1780: memcpy(new_recursive.offset_save, md->offset_vector, 1781: new_recursive.saved_max * sizeof(int)); 1782: 1783: /* OK, now we can do the recursion. After processing each alternative, 1784: restore the offset data and the last captured value. If there were nested 1785: recursions, md->recursive might be changed, so reset it before looping. 
1786: */ 1787: 1788: DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); 1789: cbegroup = (*callpat >= OP_SBRA); 1790: do 1791: { 1792: if (cbegroup) md->match_function_type = MATCH_CBEGROUP; 1793: RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top, 1794: md, eptrb, RM6); 1795: memcpy(md->offset_vector, new_recursive.offset_save, 1796: new_recursive.saved_max * sizeof(int)); 1797: md->capture_last = new_recursive.saved_capture_last; 1798: md->recursive = new_recursive.prevrec; 1799: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) 1800: { 1801: DPRINTF(("Recursion matched\n")); 1802: if (new_recursive.offset_save != stacksave) 1803: (PUBL(free))(new_recursive.offset_save); 1804: 1805: /* Set where we got to in the subject, and reset the start in case 1806: it was changed by \K. This *is* propagated back out of a recursion, 1807: for Perl compatibility. */ 1808: 1809: eptr = md->end_match_ptr; 1810: mstart = md->start_match_ptr; 1811: goto RECURSION_MATCHED; /* Exit loop; end processing */ 1812: } 1813: 1814: /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a 1815: recursion; they cause a NOMATCH for the entire recursion. These codes 1816: are defined in a range that can be tested for. */ 1817: 1818: if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX) 1819: RRETURN(MATCH_NOMATCH); 1820: 1821: /* Any return code other than NOMATCH is an error. 
*/ 1822: 1823: if (rrc != MATCH_NOMATCH) 1824: { 1825: DPRINTF(("Recursion gave error %d\n", rrc)); 1826: if (new_recursive.offset_save != stacksave) 1827: (PUBL(free))(new_recursive.offset_save); 1828: RRETURN(rrc); 1829: } 1830: 1831: md->recursive = &new_recursive; 1832: callpat += GET(callpat, 1); 1833: } 1834: while (*callpat == OP_ALT); 1835: 1836: DPRINTF(("Recursion didn't match\n")); 1837: md->recursive = new_recursive.prevrec; 1838: if (new_recursive.offset_save != stacksave) 1839: (PUBL(free))(new_recursive.offset_save); 1840: RRETURN(MATCH_NOMATCH); 1841: } 1842: 1843: RECURSION_MATCHED: 1844: break; 1845: 1846: /* An alternation is the end of a branch; scan along to find the end of the 1847: bracketed group and go to there. */ 1848: 1849: case OP_ALT: 1850: do ecode += GET(ecode,1); while (*ecode == OP_ALT); 1851: break; 1852: 1853: /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group, 1854: indicating that it may occur zero times. It may repeat infinitely, or not 1855: at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets 1856: with fixed upper repeat limits are compiled as a number of copies, with the 1857: optional ones preceded by BRAZERO or BRAMINZERO. */ 1858: 1859: case OP_BRAZERO: 1860: next = ecode + 1; 1861: RMATCH(eptr, next, offset_top, md, eptrb, RM10); 1862: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1863: do next += GET(next, 1); while (*next == OP_ALT); 1864: ecode = next + 1 + LINK_SIZE; 1865: break; 1866: 1867: case OP_BRAMINZERO: 1868: next = ecode + 1; 1869: do next += GET(next, 1); while (*next == OP_ALT); 1870: RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11); 1871: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1872: ecode++; 1873: break; 1874: 1875: case OP_SKIPZERO: 1876: next = ecode+1; 1877: do next += GET(next,1); while (*next == OP_ALT); 1878: ecode = next + 1 + LINK_SIZE; 1879: break; 1880: 1881: /* BRAPOSZERO occurs before a possessive bracket group. 
Don't do anything 1882: here; just jump to the group, with allow_zero set TRUE. */ 1883: 1884: case OP_BRAPOSZERO: 1885: op = *(++ecode); 1886: allow_zero = TRUE; 1887: if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE; 1888: goto POSSESSIVE_NON_CAPTURE; 1889: 1890: /* End of a group, repeated or non-repeating. */ 1891: 1892: case OP_KET: 1893: case OP_KETRMIN: 1894: case OP_KETRMAX: 1895: case OP_KETRPOS: 1896: prev = ecode - GET(ecode, 1); 1897: 1898: /* If this was a group that remembered the subject start, in order to break 1899: infinite repeats of empty string matches, retrieve the subject start from 1900: the chain. Otherwise, set it NULL. */ 1901: 1902: if (*prev >= OP_SBRA || *prev == OP_ONCE) 1903: { 1904: saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */ 1905: eptrb = eptrb->epb_prev; /* Backup to previous group */ 1906: } 1907: else saved_eptr = NULL; 1908: 1909: /* If we are at the end of an assertion group or a non-capturing atomic 1910: group, stop matching and return MATCH_MATCH, but record the current high 1911: water mark for use by positive assertions. We also need to record the match 1912: start in case it was changed by \K. */ 1913: 1914: if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) || 1915: *prev == OP_ONCE_NC) 1916: { 1917: md->end_match_ptr = eptr; /* For ONCE_NC */ 1918: md->end_offset_top = offset_top; 1919: md->start_match_ptr = mstart; 1920: RRETURN(MATCH_MATCH); /* Sets md->mark */ 1921: } 1922: 1923: /* For capturing groups we have to check the group number back at the start 1924: and if necessary complete handling an extraction by setting the offsets and 1925: bumping the high water mark. Whole-pattern recursion is coded as a recurse 1926: into group 0, so it won't be picked up here. Instead, we catch it when the 1927: OP_END is reached. Other recursion is handled here. We just have to record 1928: the current subject position and start match pointer and give a MATCH 1929: return. 
*/ 1930: 1931: if (*prev == OP_CBRA || *prev == OP_SCBRA || 1932: *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS) 1933: { 1934: number = GET2(prev, 1+LINK_SIZE); 1935: offset = number << 1; 1936: 1937: #ifdef PCRE_DEBUG 1938: printf("end bracket %d", number); 1939: printf("\n"); 1940: #endif 1941: 1942: /* Handle a recursively called group. */ 1943: 1944: if (md->recursive != NULL && md->recursive->group_num == number) 1945: { 1946: md->end_match_ptr = eptr; 1947: md->start_match_ptr = mstart; 1948: RRETURN(MATCH_MATCH); 1949: } 1950: 1951: /* Deal with capturing */ 1952: 1953: md->capture_last = (md->capture_last & OVFLMASK) | number; 1954: if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else 1955: { 1956: /* If offset is greater than offset_top, it means that we are 1957: "skipping" a capturing group, and that group's offsets must be marked 1958: unset. In earlier versions of PCRE, all the offsets were unset at the 1959: start of matching, but this doesn't work because atomic groups and 1960: assertions can cause a value to be set that should later be unset. 1961: Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as 1962: part of the atomic group, but this is not on the final matching path, 1963: so must be unset when 2 is set. (If there is no group 2, there is no 1964: problem, because offset_top will then be 2, indicating no capture.) */ 1965: 1966: if (offset > offset_top) 1967: { 1968: register int *iptr = md->offset_vector + offset_top; 1969: register int *iend = md->offset_vector + offset; 1970: while (iptr < iend) *iptr++ = -1; 1971: } 1972: 1973: /* Now make the extraction */ 1974: 1975: md->offset_vector[offset] = 1976: md->offset_vector[md->offset_end - number]; 1977: md->offset_vector[offset+1] = (int)(eptr - md->start_subject); 1978: if (offset_top <= offset) offset_top = offset + 2; 1979: } 1980: } 1981: 1982: /* For an ordinary non-repeating ket, just continue at this level. 
This 1983: also happens for a repeating ket if no characters were matched in the 1984: group. This is the forcible breaking of infinite loops as implemented in 1985: Perl 5.005. For a non-repeating atomic group that includes captures, 1986: establish a backup point by processing the rest of the pattern at a lower 1987: level. If this results in a NOMATCH return, pass MATCH_ONCE back to the 1988: original OP_ONCE level, thereby bypassing intermediate backup points, but 1989: resetting any captures that happened along the way. */ 1990: 1991: if (*ecode == OP_KET || eptr == saved_eptr) 1992: { 1993: if (*prev == OP_ONCE) 1994: { 1995: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12); 1996: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1997: md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */ 1998: RRETURN(MATCH_ONCE); 1999: } 2000: ecode += 1 + LINK_SIZE; /* Carry on at this level */ 2001: break; 2002: } 2003: 2004: /* OP_KETRPOS is a possessive repeating ket. Remember the current position, 2005: and return the MATCH_KETRPOS. This makes it possible to do the repeats one 2006: at a time from the outer level, thus saving stack. */ 2007: 2008: if (*ecode == OP_KETRPOS) 2009: { 2010: md->start_match_ptr = mstart; /* In case \K reset it */ 2011: md->end_match_ptr = eptr; 2012: md->end_offset_top = offset_top; 2013: RRETURN(MATCH_KETRPOS); 2014: } 2015: 2016: /* The normal repeating kets try the rest of the pattern or restart from 2017: the preceding bracket, in the appropriate order. In the second case, we can 2018: use tail recursion to avoid using another stack frame, unless we have an 2019: atomic group or an unlimited repeat of a group that can match an empty 2020: string. 
*/ 2021: 2022: if (*ecode == OP_KETRMIN) 2023: { 2024: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7); 2025: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2026: if (*prev == OP_ONCE) 2027: { 2028: RMATCH(eptr, prev, offset_top, md, eptrb, RM8); 2029: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2030: md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */ 2031: RRETURN(MATCH_ONCE); 2032: } 2033: if (*prev >= OP_SBRA) /* Could match an empty string */ 2034: { 2035: RMATCH(eptr, prev, offset_top, md, eptrb, RM50); 2036: RRETURN(rrc); 2037: } 2038: ecode = prev; 2039: goto TAIL_RECURSE; 2040: } 2041: else /* OP_KETRMAX */ 2042: { 2043: RMATCH(eptr, prev, offset_top, md, eptrb, RM13); 2044: if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH; 2045: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2046: if (*prev == OP_ONCE) 2047: { 2048: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9); 2049: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2050: md->once_target = prev; 2051: RRETURN(MATCH_ONCE); 2052: } 2053: ecode += 1 + LINK_SIZE; 2054: goto TAIL_RECURSE; 2055: } 2056: /* Control never gets here */ 2057: 2058: /* Not multiline mode: start of subject assertion, unless notbol. */ 2059: 2060: case OP_CIRC: 2061: if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH); 2062: 2063: /* Start of subject assertion */ 2064: 2065: case OP_SOD: 2066: if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH); 2067: ecode++; 2068: break; 2069: 2070: /* Multiline mode: start of subject unless notbol, or after any newline. 
*/ 2071: 2072: case OP_CIRCM: 2073: if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH); 2074: if (eptr != md->start_subject && 2075: (eptr == md->end_subject || !WAS_NEWLINE(eptr))) 2076: RRETURN(MATCH_NOMATCH); 2077: ecode++; 2078: break; 2079: 2080: /* Start of match assertion */ 2081: 2082: case OP_SOM: 2083: if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH); 2084: ecode++; 2085: break; 2086: 2087: /* Reset the start of match point */ 2088: 2089: case OP_SET_SOM: 2090: mstart = eptr; 2091: ecode++; 2092: break; 2093: 2094: /* Multiline mode: assert before any newline, or before end of subject 2095: unless noteol is set. */ 2096: 2097: case OP_DOLLM: 2098: if (eptr < md->end_subject) 2099: { 2100: if (!IS_NEWLINE(eptr)) 2101: { 2102: if (md->partial != 0 && 2103: eptr + 1 >= md->end_subject && 2104: NLBLOCK->nltype == NLTYPE_FIXED && 2105: NLBLOCK->nllen == 2 && 2106: RAWUCHARTEST(eptr) == NLBLOCK->nl[0]) 2107: { 2108: md->hitend = TRUE; 2109: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 2110: } 2111: RRETURN(MATCH_NOMATCH); 2112: } 2113: } 2114: else 2115: { 2116: if (md->noteol) RRETURN(MATCH_NOMATCH); 2117: SCHECK_PARTIAL(); 2118: } 2119: ecode++; 2120: break; 2121: 2122: /* Not multiline mode: assert before a terminating newline or before end of 2123: subject unless noteol is set. */ 2124: 2125: case OP_DOLL: 2126: if (md->noteol) RRETURN(MATCH_NOMATCH); 2127: if (!md->endonly) goto ASSERT_NL_OR_EOS; 2128: 2129: /* ... 
else fall through for endonly */ 2130: 2131: /* End of subject assertion (\z) */ 2132: 2133: case OP_EOD: 2134: if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH); 2135: SCHECK_PARTIAL(); 2136: ecode++; 2137: break; 2138: 2139: /* End of subject or ending \n assertion (\Z) */ 2140: 2141: case OP_EODN: 2142: ASSERT_NL_OR_EOS: 2143: if (eptr < md->end_subject && 2144: (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) 2145: { 2146: if (md->partial != 0 && 2147: eptr + 1 >= md->end_subject && 2148: NLBLOCK->nltype == NLTYPE_FIXED && 2149: NLBLOCK->nllen == 2 && 2150: RAWUCHARTEST(eptr) == NLBLOCK->nl[0]) 2151: { 2152: md->hitend = TRUE; 2153: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 2154: } 2155: RRETURN(MATCH_NOMATCH); 2156: } 2157: 2158: /* Either at end of string or \n before end. */ 2159: 2160: SCHECK_PARTIAL(); 2161: ecode++; 2162: break; 2163: 2164: /* Word boundary assertions */ 2165: 2166: case OP_NOT_WORD_BOUNDARY: 2167: case OP_WORD_BOUNDARY: 2168: { 2169: 2170: /* Find out if the previous and current characters are "word" characters. 2171: It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to 2172: be "non-word" characters. Remember the earliest consulted character for 2173: partial matching. 
*/ 2174: 2175: #ifdef SUPPORT_UTF 2176: if (utf) 2177: { 2178: /* Get status of previous character */ 2179: 2180: if (eptr == md->start_subject) prev_is_word = FALSE; else 2181: { 2182: PCRE_PUCHAR lastptr = eptr - 1; 2183: BACKCHAR(lastptr); 2184: if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr; 2185: GETCHAR(c, lastptr); 2186: #ifdef SUPPORT_UCP 2187: if (md->use_ucp) 2188: { 2189: if (c == '_') prev_is_word = TRUE; else 2190: { 2191: int cat = UCD_CATEGORY(c); 2192: prev_is_word = (cat == ucp_L || cat == ucp_N); 2193: } 2194: } 2195: else 2196: #endif 2197: prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; 2198: } 2199: 2200: /* Get status of next character */ 2201: 2202: if (eptr >= md->end_subject) 2203: { 2204: SCHECK_PARTIAL(); 2205: cur_is_word = FALSE; 2206: } 2207: else 2208: { 2209: GETCHAR(c, eptr); 2210: #ifdef SUPPORT_UCP 2211: if (md->use_ucp) 2212: { 2213: if (c == '_') cur_is_word = TRUE; else 2214: { 2215: int cat = UCD_CATEGORY(c); 2216: cur_is_word = (cat == ucp_L || cat == ucp_N); 2217: } 2218: } 2219: else 2220: #endif 2221: cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; 2222: } 2223: } 2224: else 2225: #endif 2226: 2227: /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for 2228: consistency with the behaviour of \w we do use it in this case. 
*/ 2229: 2230: { 2231: /* Get status of previous character */ 2232: 2233: if (eptr == md->start_subject) prev_is_word = FALSE; else 2234: { 2235: if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1; 2236: #ifdef SUPPORT_UCP 2237: if (md->use_ucp) 2238: { 2239: c = eptr[-1]; 2240: if (c == '_') prev_is_word = TRUE; else 2241: { 2242: int cat = UCD_CATEGORY(c); 2243: prev_is_word = (cat == ucp_L || cat == ucp_N); 2244: } 2245: } 2246: else 2247: #endif 2248: prev_is_word = MAX_255(eptr[-1]) 2249: && ((md->ctypes[eptr[-1]] & ctype_word) != 0); 2250: } 2251: 2252: /* Get status of next character */ 2253: 2254: if (eptr >= md->end_subject) 2255: { 2256: SCHECK_PARTIAL(); 2257: cur_is_word = FALSE; 2258: } 2259: else 2260: #ifdef SUPPORT_UCP 2261: if (md->use_ucp) 2262: { 2263: c = *eptr; 2264: if (c == '_') cur_is_word = TRUE; else 2265: { 2266: int cat = UCD_CATEGORY(c); 2267: cur_is_word = (cat == ucp_L || cat == ucp_N); 2268: } 2269: } 2270: else 2271: #endif 2272: cur_is_word = MAX_255(*eptr) 2273: && ((md->ctypes[*eptr] & ctype_word) != 0); 2274: } 2275: 2276: /* Now see if the situation is what we want */ 2277: 2278: if ((*ecode++ == OP_WORD_BOUNDARY)? 2279: cur_is_word == prev_is_word : cur_is_word != prev_is_word) 2280: RRETURN(MATCH_NOMATCH); 2281: } 2282: break; 2283: 2284: /* Match any single character type except newline; have to take care with 2285: CRLF newlines and partial matching. */ 2286: 2287: case OP_ANY: 2288: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); 2289: if (md->partial != 0 && 2290: eptr + 1 >= md->end_subject && 2291: NLBLOCK->nltype == NLTYPE_FIXED && 2292: NLBLOCK->nllen == 2 && 2293: RAWUCHARTEST(eptr) == NLBLOCK->nl[0]) 2294: { 2295: md->hitend = TRUE; 2296: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 2297: } 2298: 2299: /* Fall through */ 2300: 2301: /* Match any single character whatsoever. 
*/ 2302: 2303: case OP_ALLANY: 2304: if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */ 2305: { /* not be updated before SCHECK_PARTIAL. */ 2306: SCHECK_PARTIAL(); 2307: RRETURN(MATCH_NOMATCH); 2308: } 2309: eptr++; 2310: #ifdef SUPPORT_UTF 2311: if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 2312: #endif 2313: ecode++; 2314: break; 2315: 2316: /* Match a single byte, even in UTF-8 mode. This opcode really does match 2317: any byte, even newline, independent of the setting of PCRE_DOTALL. */ 2318: 2319: case OP_ANYBYTE: 2320: if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */ 2321: { /* not be updated before SCHECK_PARTIAL. */ 2322: SCHECK_PARTIAL(); 2323: RRETURN(MATCH_NOMATCH); 2324: } 2325: eptr++; 2326: ecode++; 2327: break; 2328: 2329: case OP_NOT_DIGIT: 2330: if (eptr >= md->end_subject) 2331: { 2332: SCHECK_PARTIAL(); 2333: RRETURN(MATCH_NOMATCH); 2334: } 2335: GETCHARINCTEST(c, eptr); 2336: if ( 2337: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) 2338: c < 256 && 2339: #endif 2340: (md->ctypes[c] & ctype_digit) != 0 2341: ) 2342: RRETURN(MATCH_NOMATCH); 2343: ecode++; 2344: break; 2345: 2346: case OP_DIGIT: 2347: if (eptr >= md->end_subject) 2348: { 2349: SCHECK_PARTIAL(); 2350: RRETURN(MATCH_NOMATCH); 2351: } 2352: GETCHARINCTEST(c, eptr); 2353: if ( 2354: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) 2355: c > 255 || 2356: #endif 2357: (md->ctypes[c] & ctype_digit) == 0 2358: ) 2359: RRETURN(MATCH_NOMATCH); 2360: ecode++; 2361: break; 2362: 2363: case OP_NOT_WHITESPACE: 2364: if (eptr >= md->end_subject) 2365: { 2366: SCHECK_PARTIAL(); 2367: RRETURN(MATCH_NOMATCH); 2368: } 2369: GETCHARINCTEST(c, eptr); 2370: if ( 2371: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) 2372: c < 256 && 2373: #endif 2374: (md->ctypes[c] & ctype_space) != 0 2375: ) 2376: RRETURN(MATCH_NOMATCH); 2377: ecode++; 2378: break; 2379: 2380: case OP_WHITESPACE: 2381: if (eptr >= md->end_subject) 2382: { 2383: 
SCHECK_PARTIAL(); 2384: RRETURN(MATCH_NOMATCH); 2385: } 2386: GETCHARINCTEST(c, eptr); 2387: if ( 2388: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) 2389: c > 255 || 2390: #endif 2391: (md->ctypes[c] & ctype_space) == 0 2392: ) 2393: RRETURN(MATCH_NOMATCH); 2394: ecode++; 2395: break; 2396: 2397: case OP_NOT_WORDCHAR: 2398: if (eptr >= md->end_subject) 2399: { 2400: SCHECK_PARTIAL(); 2401: RRETURN(MATCH_NOMATCH); 2402: } 2403: GETCHARINCTEST(c, eptr); 2404: if ( 2405: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) 2406: c < 256 && 2407: #endif 2408: (md->ctypes[c] & ctype_word) != 0 2409: ) 2410: RRETURN(MATCH_NOMATCH); 2411: ecode++; 2412: break; 2413: 2414: case OP_WORDCHAR: 2415: if (eptr >= md->end_subject) 2416: { 2417: SCHECK_PARTIAL(); 2418: RRETURN(MATCH_NOMATCH); 2419: } 2420: GETCHARINCTEST(c, eptr); 2421: if ( 2422: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) 2423: c > 255 || 2424: #endif 2425: (md->ctypes[c] & ctype_word) == 0 2426: ) 2427: RRETURN(MATCH_NOMATCH); 2428: ecode++; 2429: break; 2430: 2431: case OP_ANYNL: 2432: if (eptr >= md->end_subject) 2433: { 2434: SCHECK_PARTIAL(); 2435: RRETURN(MATCH_NOMATCH); 2436: } 2437: GETCHARINCTEST(c, eptr); 2438: switch(c) 2439: { 2440: default: RRETURN(MATCH_NOMATCH); 2441: 2442: case CHAR_CR: 2443: if (eptr >= md->end_subject) 2444: { 2445: SCHECK_PARTIAL(); 2446: } 2447: else if (RAWUCHARTEST(eptr) == CHAR_LF) eptr++; 2448: break; 2449: 2450: case CHAR_LF: 2451: break; 2452: 2453: case CHAR_VT: 2454: case CHAR_FF: 2455: case CHAR_NEL: 2456: #ifndef EBCDIC 2457: case 0x2028: 2458: case 0x2029: 2459: #endif /* Not EBCDIC */ 2460: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 2461: break; 2462: } 2463: ecode++; 2464: break; 2465: 2466: case OP_NOT_HSPACE: 2467: if (eptr >= md->end_subject) 2468: { 2469: SCHECK_PARTIAL(); 2470: RRETURN(MATCH_NOMATCH); 2471: } 2472: GETCHARINCTEST(c, eptr); 2473: switch(c) 2474: { 2475: HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */ 
2476: default: break; 2477: } 2478: ecode++; 2479: break; 2480: 2481: case OP_HSPACE: 2482: if (eptr >= md->end_subject) 2483: { 2484: SCHECK_PARTIAL(); 2485: RRETURN(MATCH_NOMATCH); 2486: } 2487: GETCHARINCTEST(c, eptr); 2488: switch(c) 2489: { 2490: HSPACE_CASES: break; /* Byte and multibyte cases */ 2491: default: RRETURN(MATCH_NOMATCH); 2492: } 2493: ecode++; 2494: break; 2495: 2496: case OP_NOT_VSPACE: 2497: if (eptr >= md->end_subject) 2498: { 2499: SCHECK_PARTIAL(); 2500: RRETURN(MATCH_NOMATCH); 2501: } 2502: GETCHARINCTEST(c, eptr); 2503: switch(c) 2504: { 2505: VSPACE_CASES: RRETURN(MATCH_NOMATCH); 2506: default: break; 2507: } 2508: ecode++; 2509: break; 2510: 2511: case OP_VSPACE: 2512: if (eptr >= md->end_subject) 2513: { 2514: SCHECK_PARTIAL(); 2515: RRETURN(MATCH_NOMATCH); 2516: } 2517: GETCHARINCTEST(c, eptr); 2518: switch(c) 2519: { 2520: VSPACE_CASES: break; 2521: default: RRETURN(MATCH_NOMATCH); 2522: } 2523: ecode++; 2524: break; 2525: 2526: #ifdef SUPPORT_UCP 2527: /* Check the next character by Unicode property. We will get here only 2528: if the support is in the binary; otherwise a compile-time error occurs. 
*/ 2529: 2530: case OP_PROP: 2531: case OP_NOTPROP: 2532: if (eptr >= md->end_subject) 2533: { 2534: SCHECK_PARTIAL(); 2535: RRETURN(MATCH_NOMATCH); 2536: } 2537: GETCHARINCTEST(c, eptr); 2538: { 2539: const pcre_uint32 *cp; 2540: const ucd_record *prop = GET_UCD(c); 2541: 2542: switch(ecode[1]) 2543: { 2544: case PT_ANY: 2545: if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH); 2546: break; 2547: 2548: case PT_LAMP: 2549: if ((prop->chartype == ucp_Lu || 2550: prop->chartype == ucp_Ll || 2551: prop->chartype == ucp_Lt) == (op == OP_NOTPROP)) 2552: RRETURN(MATCH_NOMATCH); 2553: break; 2554: 2555: case PT_GC: 2556: if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP)) 2557: RRETURN(MATCH_NOMATCH); 2558: break; 2559: 2560: case PT_PC: 2561: if ((ecode[2] != prop->chartype) == (op == OP_PROP)) 2562: RRETURN(MATCH_NOMATCH); 2563: break; 2564: 2565: case PT_SC: 2566: if ((ecode[2] != prop->script) == (op == OP_PROP)) 2567: RRETURN(MATCH_NOMATCH); 2568: break; 2569: 2570: /* These are specials */ 2571: 2572: case PT_ALNUM: 2573: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || 2574: PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP)) 2575: RRETURN(MATCH_NOMATCH); 2576: break; 2577: 2578: /* Perl space used to exclude VT, but from Perl 5.18 it is included, 2579: which means that Perl space and POSIX space are now identical. PCRE 2580: was changed at release 8.34. 
*/ 2581: 2582: case PT_SPACE: /* Perl space */ 2583: case PT_PXSPACE: /* POSIX space */ 2584: switch(c) 2585: { 2586: HSPACE_CASES: 2587: VSPACE_CASES: 2588: if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH); 2589: break; 2590: 2591: default: 2592: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == 2593: (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH); 2594: break; 2595: } 2596: break; 2597: 2598: case PT_WORD: 2599: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || 2600: PRIV(ucp_gentype)[prop->chartype] == ucp_N || 2601: c == CHAR_UNDERSCORE) == (op == OP_NOTPROP)) 2602: RRETURN(MATCH_NOMATCH); 2603: break; 2604: 2605: case PT_CLIST: 2606: cp = PRIV(ucd_caseless_sets) + ecode[2]; 2607: for (;;) 2608: { 2609: if (c < *cp) 2610: { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; } 2611: if (c == *cp++) 2612: { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } } 2613: } 2614: break; 2615: 2616: case PT_UCNC: 2617: if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 2618: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 2619: c >= 0xe000) == (op == OP_NOTPROP)) 2620: RRETURN(MATCH_NOMATCH); 2621: break; 2622: 2623: /* This should never occur */ 2624: 2625: default: 2626: RRETURN(PCRE_ERROR_INTERNAL); 2627: } 2628: 2629: ecode += 3; 2630: } 2631: break; 2632: 2633: /* Match an extended Unicode sequence. We will get here only if the support 2634: is in the binary; otherwise a compile-time error occurs. 
*/ 2635: 2636: case OP_EXTUNI: 2637: if (eptr >= md->end_subject) 2638: { 2639: SCHECK_PARTIAL(); 2640: RRETURN(MATCH_NOMATCH); 2641: } 2642: else 2643: { 2644: int lgb, rgb; 2645: GETCHARINCTEST(c, eptr); 2646: lgb = UCD_GRAPHBREAK(c); 2647: while (eptr < md->end_subject) 2648: { 2649: int len = 1; 2650: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } 2651: rgb = UCD_GRAPHBREAK(c); 2652: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 2653: lgb = rgb; 2654: eptr += len; 2655: } 2656: } 2657: CHECK_PARTIAL(); 2658: ecode++; 2659: break; 2660: #endif /* SUPPORT_UCP */ 2661: 2662: 2663: /* Match a back reference, possibly repeatedly. Look past the end of the 2664: item to see if there is repeat information following. The code is similar 2665: to that for character classes, but repeated for efficiency. Then obey 2666: similar code to character type repeats - written out again for speed. 2667: However, if the referenced string is the empty string, always treat 2668: it as matched, any number of times (otherwise there could be infinite 2669: loops). If the reference is unset, there are two possibilities: 2670: 2671: (a) In the default, Perl-compatible state, set the length negative; 2672: this ensures that every attempt at a match fails. We can't just fail 2673: here, because of the possibility of quantifiers with zero minima. 2674: 2675: (b) If the JavaScript compatibility flag is set, set the length to zero 2676: so that the back reference matches an empty string. 2677: 2678: Otherwise, set the length to the length of what was matched by the 2679: referenced subpattern. 2680: 2681: The OP_REF and OP_REFI opcodes are used for a reference to a numbered group 2682: or to a non-duplicated named group. For a duplicated named group, OP_DNREF 2683: and OP_DNREFI are used. In this case we must scan the list of groups to 2684: which the name refers, and use the first one that is set. 
*/ 2685: 2686: case OP_DNREF: 2687: case OP_DNREFI: 2688: caseless = op == OP_DNREFI; 2689: { 2690: int count = GET2(ecode, 1+IMM2_SIZE); 2691: pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size; 2692: ecode += 1 + 2*IMM2_SIZE; 2693: 2694: while (count-- > 0) 2695: { 2696: offset = GET2(slot, 0) << 1; 2697: if (offset < offset_top && md->offset_vector[offset] >= 0) break; 2698: slot += md->name_entry_size; 2699: } 2700: if (count < 0) 2701: length = (md->jscript_compat)? 0 : -1; 2702: else 2703: length = md->offset_vector[offset+1] - md->offset_vector[offset]; 2704: } 2705: goto REF_REPEAT; 2706: 2707: case OP_REF: 2708: case OP_REFI: 2709: caseless = op == OP_REFI; 2710: offset = GET2(ecode, 1) << 1; /* Doubled ref number */ 2711: ecode += 1 + IMM2_SIZE; 2712: if (offset >= offset_top || md->offset_vector[offset] < 0) 2713: length = (md->jscript_compat)? 0 : -1; 2714: else 2715: length = md->offset_vector[offset+1] - md->offset_vector[offset]; 2716: 2717: /* Set up for repetition, or handle the non-repeated case */ 2718: 2719: REF_REPEAT: 2720: switch (*ecode) 2721: { 2722: case OP_CRSTAR: 2723: case OP_CRMINSTAR: 2724: case OP_CRPLUS: 2725: case OP_CRMINPLUS: 2726: case OP_CRQUERY: 2727: case OP_CRMINQUERY: 2728: c = *ecode++ - OP_CRSTAR; 2729: minimize = (c & 1) != 0; 2730: min = rep_min[c]; /* Pick up values from tables; */ 2731: max = rep_max[c]; /* zero for max => infinity */ 2732: if (max == 0) max = INT_MAX; 2733: break; 2734: 2735: case OP_CRRANGE: 2736: case OP_CRMINRANGE: 2737: minimize = (*ecode == OP_CRMINRANGE); 2738: min = GET2(ecode, 1); 2739: max = GET2(ecode, 1 + IMM2_SIZE); 2740: if (max == 0) max = INT_MAX; 2741: ecode += 1 + 2 * IMM2_SIZE; 2742: break; 2743: 2744: default: /* No repeat follows */ 2745: if ((length = match_ref(offset, eptr, length, md, caseless)) < 0) 2746: { 2747: if (length == -2) eptr = md->end_subject; /* Partial match */ 2748: CHECK_PARTIAL(); 2749: RRETURN(MATCH_NOMATCH); 2750: } 2751: eptr += length; 
2752: continue; /* With the main loop */ 2753: } 2754: 2755: /* Handle repeated back references. If the length of the reference is 2756: zero, just continue with the main loop. If the length is negative, it 2757: means the reference is unset in non-JavaScript-compatible mode. If the minimum is 2758: zero, we can continue at the same level without recursion. For any other 2759: minimum, carrying on will result in NOMATCH. */ 2760: 2761: if (length == 0) continue; 2762: if (length < 0 && min == 0) continue; 2763: 2764: /* First, ensure the minimum number of matches are present. We get back 2765: the length of the reference string explicitly rather than passing the 2766: address of eptr, so that eptr can be a register variable. */ 2767: 2768: for (i = 1; i <= min; i++) 2769: { 2770: int slength; 2771: if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0) 2772: { 2773: if (slength == -2) eptr = md->end_subject; /* Partial match */ 2774: CHECK_PARTIAL(); 2775: RRETURN(MATCH_NOMATCH); 2776: } 2777: eptr += slength; 2778: } 2779: 2780: /* If min = max, continue at the same level without recursion. 2781: They are not both allowed to be zero. 
*/ 2782: 2783: if (min == max) continue; 2784: 2785: /* If minimizing, keep trying and advancing the pointer */ 2786: 2787: if (minimize) 2788: { 2789: for (fi = min;; fi++) 2790: { 2791: int slength; 2792: RMATCH(eptr, ecode, offset_top, md, eptrb, RM14); 2793: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2794: if (fi >= max) RRETURN(MATCH_NOMATCH); 2795: if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0) 2796: { 2797: if (slength == -2) eptr = md->end_subject; /* Partial match */ 2798: CHECK_PARTIAL(); 2799: RRETURN(MATCH_NOMATCH); 2800: } 2801: eptr += slength; 2802: } 2803: /* Control never gets here */ 2804: } 2805: 2806: /* If maximizing, find the longest string and work backwards */ 2807: 2808: else 2809: { 2810: pp = eptr; 2811: for (i = min; i < max; i++) 2812: { 2813: int slength; 2814: if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0) 2815: { 2816: /* Can't use CHECK_PARTIAL because we don't want to update eptr in 2817: the soft partial matching case. */ 2818: 2819: if (slength == -2 && md->partial != 0 && 2820: md->end_subject > md->start_used_ptr) 2821: { 2822: md->hitend = TRUE; 2823: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 2824: } 2825: break; 2826: } 2827: eptr += slength; 2828: } 2829: 2830: while (eptr >= pp) 2831: { 2832: RMATCH(eptr, ecode, offset_top, md, eptrb, RM15); 2833: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2834: eptr -= length; 2835: } 2836: RRETURN(MATCH_NOMATCH); 2837: } 2838: /* Control never gets here */ 2839: 2840: /* Match a bit-mapped character class, possibly repeatedly. This op code is 2841: used when all the characters in the class have values in the range 0-255, 2842: and either the matching is caseful, or the characters are in the range 2843: 0-127 when UTF-8 processing is enabled. The only difference between 2844: OP_CLASS and OP_NCLASS occurs when a data character outside the range is 2845: encountered. 
2846: 2847: First, look past the end of the item to see if there is repeat information 2848: following. Then obey similar code to character type repeats - written out 2849: again for speed. */ 2850: 2851: case OP_NCLASS: 2852: case OP_CLASS: 2853: { 2854: /* The data variable is saved across frames, so the byte map needs to 2855: be stored there. */ 2856: #define BYTE_MAP ((pcre_uint8 *)data) 2857: data = ecode + 1; /* Save for matching */ 2858: ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */ 2859: 2860: switch (*ecode) 2861: { 2862: case OP_CRSTAR: 2863: case OP_CRMINSTAR: 2864: case OP_CRPLUS: 2865: case OP_CRMINPLUS: 2866: case OP_CRQUERY: 2867: case OP_CRMINQUERY: 2868: case OP_CRPOSSTAR: 2869: case OP_CRPOSPLUS: 2870: case OP_CRPOSQUERY: 2871: c = *ecode++ - OP_CRSTAR; 2872: if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0; 2873: else possessive = TRUE; 2874: min = rep_min[c]; /* Pick up values from tables; */ 2875: max = rep_max[c]; /* zero for max => infinity */ 2876: if (max == 0) max = INT_MAX; 2877: break; 2878: 2879: case OP_CRRANGE: 2880: case OP_CRMINRANGE: 2881: case OP_CRPOSRANGE: 2882: minimize = (*ecode == OP_CRMINRANGE); 2883: possessive = (*ecode == OP_CRPOSRANGE); 2884: min = GET2(ecode, 1); 2885: max = GET2(ecode, 1 + IMM2_SIZE); 2886: if (max == 0) max = INT_MAX; 2887: ecode += 1 + 2 * IMM2_SIZE; 2888: break; 2889: 2890: default: /* No repeat follows */ 2891: min = max = 1; 2892: break; 2893: } 2894: 2895: /* First, ensure the minimum number of matches are present. 
*/ 2896: 2897: #ifdef SUPPORT_UTF 2898: if (utf) 2899: { 2900: for (i = 1; i <= min; i++) 2901: { 2902: if (eptr >= md->end_subject) 2903: { 2904: SCHECK_PARTIAL(); 2905: RRETURN(MATCH_NOMATCH); 2906: } 2907: GETCHARINC(c, eptr); 2908: if (c > 255) 2909: { 2910: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); 2911: } 2912: else 2913: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 2914: } 2915: } 2916: else 2917: #endif 2918: /* Not UTF mode */ 2919: { 2920: for (i = 1; i <= min; i++) 2921: { 2922: if (eptr >= md->end_subject) 2923: { 2924: SCHECK_PARTIAL(); 2925: RRETURN(MATCH_NOMATCH); 2926: } 2927: c = *eptr++; 2928: #ifndef COMPILE_PCRE8 2929: if (c > 255) 2930: { 2931: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); 2932: } 2933: else 2934: #endif 2935: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 2936: } 2937: } 2938: 2939: /* If max == min we can continue with the main loop without the 2940: need to recurse. */ 2941: 2942: if (min == max) continue; 2943: 2944: /* If minimizing, keep testing the rest of the expression and advancing 2945: the pointer while it matches the class. 
*/ 2946: 2947: if (minimize) 2948: { 2949: #ifdef SUPPORT_UTF 2950: if (utf) 2951: { 2952: for (fi = min;; fi++) 2953: { 2954: RMATCH(eptr, ecode, offset_top, md, eptrb, RM16); 2955: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2956: if (fi >= max) RRETURN(MATCH_NOMATCH); 2957: if (eptr >= md->end_subject) 2958: { 2959: SCHECK_PARTIAL(); 2960: RRETURN(MATCH_NOMATCH); 2961: } 2962: GETCHARINC(c, eptr); 2963: if (c > 255) 2964: { 2965: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); 2966: } 2967: else 2968: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 2969: } 2970: } 2971: else 2972: #endif 2973: /* Not UTF mode */ 2974: { 2975: for (fi = min;; fi++) 2976: { 2977: RMATCH(eptr, ecode, offset_top, md, eptrb, RM17); 2978: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2979: if (fi >= max) RRETURN(MATCH_NOMATCH); 2980: if (eptr >= md->end_subject) 2981: { 2982: SCHECK_PARTIAL(); 2983: RRETURN(MATCH_NOMATCH); 2984: } 2985: c = *eptr++; 2986: #ifndef COMPILE_PCRE8 2987: if (c > 255) 2988: { 2989: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); 2990: } 2991: else 2992: #endif 2993: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 2994: } 2995: } 2996: /* Control never gets here */ 2997: } 2998: 2999: /* If maximizing, find the longest possible run, then work backwards. 
*/ 3000: 3001: else 3002: { 3003: pp = eptr; 3004: 3005: #ifdef SUPPORT_UTF 3006: if (utf) 3007: { 3008: for (i = min; i < max; i++) 3009: { 3010: int len = 1; 3011: if (eptr >= md->end_subject) 3012: { 3013: SCHECK_PARTIAL(); 3014: break; 3015: } 3016: GETCHARLEN(c, eptr, len); 3017: if (c > 255) 3018: { 3019: if (op == OP_CLASS) break; 3020: } 3021: else 3022: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break; 3023: eptr += len; 3024: } 3025: 3026: if (possessive) continue; /* No backtracking */ 3027: 3028: for (;;) 3029: { 3030: RMATCH(eptr, ecode, offset_top, md, eptrb, RM18); 3031: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3032: if (eptr-- == pp) break; /* Stop if tried at original pos */ 3033: BACKCHAR(eptr); 3034: } 3035: } 3036: else 3037: #endif 3038: /* Not UTF mode */ 3039: { 3040: for (i = min; i < max; i++) 3041: { 3042: if (eptr >= md->end_subject) 3043: { 3044: SCHECK_PARTIAL(); 3045: break; 3046: } 3047: c = *eptr; 3048: #ifndef COMPILE_PCRE8 3049: if (c > 255) 3050: { 3051: if (op == OP_CLASS) break; 3052: } 3053: else 3054: #endif 3055: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break; 3056: eptr++; 3057: } 3058: 3059: if (possessive) continue; /* No backtracking */ 3060: 3061: while (eptr >= pp) 3062: { 3063: RMATCH(eptr, ecode, offset_top, md, eptrb, RM19); 3064: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3065: eptr--; 3066: } 3067: } 3068: 3069: RRETURN(MATCH_NOMATCH); 3070: } 3071: #undef BYTE_MAP 3072: } 3073: /* Control never gets here */ 3074: 3075: 3076: /* Match an extended character class. In the 8-bit library, this opcode is 3077: encountered only when UTF-8 mode is supported. In the 16-bit and 3078: 32-bit libraries, codepoints greater than 255 may be encountered even when 3079: UTF is not supported. 
*/ 3080: 3081: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 3082: case OP_XCLASS: 3083: { 3084: data = ecode + 1 + LINK_SIZE; /* Save for matching */ 3085: ecode += GET(ecode, 1); /* Advance past the item */ 3086: 3087: switch (*ecode) 3088: { 3089: case OP_CRSTAR: 3090: case OP_CRMINSTAR: 3091: case OP_CRPLUS: 3092: case OP_CRMINPLUS: 3093: case OP_CRQUERY: 3094: case OP_CRMINQUERY: 3095: case OP_CRPOSSTAR: 3096: case OP_CRPOSPLUS: 3097: case OP_CRPOSQUERY: 3098: c = *ecode++ - OP_CRSTAR; 3099: if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0; 3100: else possessive = TRUE; 3101: min = rep_min[c]; /* Pick up values from tables; */ 3102: max = rep_max[c]; /* zero for max => infinity */ 3103: if (max == 0) max = INT_MAX; 3104: break; 3105: 3106: case OP_CRRANGE: 3107: case OP_CRMINRANGE: 3108: case OP_CRPOSRANGE: 3109: minimize = (*ecode == OP_CRMINRANGE); 3110: possessive = (*ecode == OP_CRPOSRANGE); 3111: min = GET2(ecode, 1); 3112: max = GET2(ecode, 1 + IMM2_SIZE); 3113: if (max == 0) max = INT_MAX; 3114: ecode += 1 + 2 * IMM2_SIZE; 3115: break; 3116: 3117: default: /* No repeat follows */ 3118: min = max = 1; 3119: break; 3120: } 3121: 3122: /* First, ensure the minimum number of matches are present. */ 3123: 3124: for (i = 1; i <= min; i++) 3125: { 3126: if (eptr >= md->end_subject) 3127: { 3128: SCHECK_PARTIAL(); 3129: RRETURN(MATCH_NOMATCH); 3130: } 3131: GETCHARINCTEST(c, eptr); 3132: if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH); 3133: } 3134: 3135: /* If max == min we can continue with the main loop without the 3136: need to recurse. */ 3137: 3138: if (min == max) continue; 3139: 3140: /* If minimizing, keep testing the rest of the expression and advancing 3141: the pointer while it matches the class. 
*/ 3142: 3143: if (minimize) 3144: { 3145: for (fi = min;; fi++) 3146: { 3147: RMATCH(eptr, ecode, offset_top, md, eptrb, RM20); 3148: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3149: if (fi >= max) RRETURN(MATCH_NOMATCH); 3150: if (eptr >= md->end_subject) 3151: { 3152: SCHECK_PARTIAL(); 3153: RRETURN(MATCH_NOMATCH); 3154: } 3155: GETCHARINCTEST(c, eptr); 3156: if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH); 3157: } 3158: /* Control never gets here */ 3159: } 3160: 3161: /* If maximizing, find the longest possible run, then work backwards. */ 3162: 3163: else 3164: { 3165: pp = eptr; 3166: for (i = min; i < max; i++) 3167: { 3168: int len = 1; 3169: if (eptr >= md->end_subject) 3170: { 3171: SCHECK_PARTIAL(); 3172: break; 3173: } 3174: #ifdef SUPPORT_UTF 3175: GETCHARLENTEST(c, eptr, len); 3176: #else 3177: c = *eptr; 3178: #endif 3179: if (!PRIV(xclass)(c, data, utf)) break; 3180: eptr += len; 3181: } 3182: 3183: if (possessive) continue; /* No backtracking */ 3184: 3185: for(;;) 3186: { 3187: RMATCH(eptr, ecode, offset_top, md, eptrb, RM21); 3188: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3189: if (eptr-- == pp) break; /* Stop if tried at original pos */ 3190: #ifdef SUPPORT_UTF 3191: if (utf) BACKCHAR(eptr); 3192: #endif 3193: } 3194: RRETURN(MATCH_NOMATCH); 3195: } 3196: 3197: /* Control never gets here */ 3198: } 3199: #endif /* End of XCLASS */ 3200: 3201: /* Match a single character, casefully */ 3202: 3203: case OP_CHAR: 3204: #ifdef SUPPORT_UTF 3205: if (utf) 3206: { 3207: length = 1; 3208: ecode++; 3209: GETCHARLEN(fc, ecode, length); 3210: if (length > md->end_subject - eptr) 3211: { 3212: CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ 3213: RRETURN(MATCH_NOMATCH); 3214: } 3215: while (length-- > 0) if (*ecode++ != RAWUCHARINC(eptr)) RRETURN(MATCH_NOMATCH); 3216: } 3217: else 3218: #endif 3219: /* Not UTF mode */ 3220: { 3221: if (md->end_subject - eptr < 1) 3222: { 3223: SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */ 3224: 
RRETURN(MATCH_NOMATCH); 3225: } 3226: if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH); 3227: ecode += 2; 3228: } 3229: break; 3230: 3231: /* Match a single character, caselessly. If we are at the end of the 3232: subject, give up immediately. */ 3233: 3234: case OP_CHARI: 3235: if (eptr >= md->end_subject) 3236: { 3237: SCHECK_PARTIAL(); 3238: RRETURN(MATCH_NOMATCH); 3239: } 3240: 3241: #ifdef SUPPORT_UTF 3242: if (utf) 3243: { 3244: length = 1; 3245: ecode++; 3246: GETCHARLEN(fc, ecode, length); 3247: 3248: /* If the pattern character's value is < 128, we have only one byte, and 3249: we know that its other case must also be one byte long, so we can use the 3250: fast lookup table. We know that there is at least one byte left in the 3251: subject. */ 3252: 3253: if (fc < 128) 3254: { 3255: pcre_uint32 cc = RAWUCHAR(eptr); 3256: if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH); 3257: ecode++; 3258: eptr++; 3259: } 3260: 3261: /* Otherwise we must pick up the subject character. Note that we cannot 3262: use the value of "length" to check for sufficient bytes left, because the 3263: other case of the character may have more or fewer bytes. */ 3264: 3265: else 3266: { 3267: pcre_uint32 dc; 3268: GETCHARINC(dc, eptr); 3269: ecode += length; 3270: 3271: /* If we have Unicode property support, we can use it to test the other 3272: case of the character, if there is one. */ 3273: 3274: if (fc != dc) 3275: { 3276: #ifdef SUPPORT_UCP 3277: if (dc != UCD_OTHERCASE(fc)) 3278: #endif 3279: RRETURN(MATCH_NOMATCH); 3280: } 3281: } 3282: } 3283: else 3284: #endif /* SUPPORT_UTF */ 3285: 3286: /* Not UTF mode */ 3287: { 3288: if (TABLE_GET(ecode[1], md->lcc, ecode[1]) 3289: != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH); 3290: eptr++; 3291: ecode += 2; 3292: } 3293: break; 3294: 3295: /* Match a single character repeatedly. 
*/ 3296: 3297: case OP_EXACT: 3298: case OP_EXACTI: 3299: min = max = GET2(ecode, 1); 3300: ecode += 1 + IMM2_SIZE; 3301: goto REPEATCHAR; 3302: 3303: case OP_POSUPTO: 3304: case OP_POSUPTOI: 3305: possessive = TRUE; 3306: /* Fall through */ 3307: 3308: case OP_UPTO: 3309: case OP_UPTOI: 3310: case OP_MINUPTO: 3311: case OP_MINUPTOI: 3312: min = 0; 3313: max = GET2(ecode, 1); 3314: minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI; 3315: ecode += 1 + IMM2_SIZE; 3316: goto REPEATCHAR; 3317: 3318: case OP_POSSTAR: 3319: case OP_POSSTARI: 3320: possessive = TRUE; 3321: min = 0; 3322: max = INT_MAX; 3323: ecode++; 3324: goto REPEATCHAR; 3325: 3326: case OP_POSPLUS: 3327: case OP_POSPLUSI: 3328: possessive = TRUE; 3329: min = 1; 3330: max = INT_MAX; 3331: ecode++; 3332: goto REPEATCHAR; 3333: 3334: case OP_POSQUERY: 3335: case OP_POSQUERYI: 3336: possessive = TRUE; 3337: min = 0; 3338: max = 1; 3339: ecode++; 3340: goto REPEATCHAR; 3341: 3342: case OP_STAR: 3343: case OP_STARI: 3344: case OP_MINSTAR: 3345: case OP_MINSTARI: 3346: case OP_PLUS: 3347: case OP_PLUSI: 3348: case OP_MINPLUS: 3349: case OP_MINPLUSI: 3350: case OP_QUERY: 3351: case OP_QUERYI: 3352: case OP_MINQUERY: 3353: case OP_MINQUERYI: 3354: c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI); 3355: minimize = (c & 1) != 0; 3356: min = rep_min[c]; /* Pick up values from tables; */ 3357: max = rep_max[c]; /* zero for max => infinity */ 3358: if (max == 0) max = INT_MAX; 3359: 3360: /* Common code for all repeated single-character matches. We first check 3361: for the minimum number of characters. If the minimum equals the maximum, we 3362: are done. Otherwise, if minimizing, check the rest of the pattern for a 3363: match; if there isn't one, advance up to the maximum, one character at a 3364: time. 3365: 3366: If maximizing, advance up to the maximum number of matching characters, 3367: until eptr is past the end of the maximum run. If possessive, we are 3368: then done (no backing up). 
Otherwise, match at this position; anything 3369: other than no match is immediately returned. For nomatch, back up one 3370: character, unless we are matching \R and the last thing matched was 3371: \r\n, in which case, back up two bytes. When we reach the first optional 3372: character position, we can save stack by doing a tail recurse. 3373: 3374: The various UTF/non-UTF and caseful/caseless cases are handled separately, 3375: for speed. */ 3376: 3377: REPEATCHAR: 3378: #ifdef SUPPORT_UTF 3379: if (utf) 3380: { 3381: length = 1; 3382: charptr = ecode; 3383: GETCHARLEN(fc, ecode, length); 3384: ecode += length; 3385: 3386: /* Handle multibyte character matching specially here. There is 3387: support for caseless matching if UCP support is present. */ 3388: 3389: if (length > 1) 3390: { 3391: #ifdef SUPPORT_UCP 3392: pcre_uint32 othercase; 3393: if (op >= OP_STARI && /* Caseless */ 3394: (othercase = UCD_OTHERCASE(fc)) != fc) 3395: oclength = PRIV(ord2utf)(othercase, occhars); 3396: else oclength = 0; 3397: #endif /* SUPPORT_UCP */ 3398: 3399: for (i = 1; i <= min; i++) 3400: { 3401: if (eptr <= md->end_subject - length && 3402: memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length; 3403: #ifdef SUPPORT_UCP 3404: else if (oclength > 0 && 3405: eptr <= md->end_subject - oclength && 3406: memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength; 3407: #endif /* SUPPORT_UCP */ 3408: else 3409: { 3410: CHECK_PARTIAL(); 3411: RRETURN(MATCH_NOMATCH); 3412: } 3413: } 3414: 3415: if (min == max) continue; 3416: 3417: if (minimize) 3418: { 3419: for (fi = min;; fi++) 3420: { 3421: RMATCH(eptr, ecode, offset_top, md, eptrb, RM22); 3422: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3423: if (fi >= max) RRETURN(MATCH_NOMATCH); 3424: if (eptr <= md->end_subject - length && 3425: memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length; 3426: #ifdef SUPPORT_UCP 3427: else if (oclength > 0 && 3428: eptr <= md->end_subject - oclength && 3429: memcmp(eptr, 
occhars, IN_UCHARS(oclength)) == 0) eptr += oclength; 3430: #endif /* SUPPORT_UCP */ 3431: else 3432: { 3433: CHECK_PARTIAL(); 3434: RRETURN(MATCH_NOMATCH); 3435: } 3436: } 3437: /* Control never gets here */ 3438: } 3439: 3440: else /* Maximize */ 3441: { 3442: pp = eptr; 3443: for (i = min; i < max; i++) 3444: { 3445: if (eptr <= md->end_subject - length && 3446: memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length; 3447: #ifdef SUPPORT_UCP 3448: else if (oclength > 0 && 3449: eptr <= md->end_subject - oclength && 3450: memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength; 3451: #endif /* SUPPORT_UCP */ 3452: else 3453: { 3454: CHECK_PARTIAL(); 3455: break; 3456: } 3457: } 3458: 3459: if (possessive) continue; /* No backtracking */ 3460: for(;;) 3461: { 3462: if (eptr == pp) goto TAIL_RECURSE; 3463: RMATCH(eptr, ecode, offset_top, md, eptrb, RM23); 3464: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3465: #ifdef SUPPORT_UCP 3466: eptr--; 3467: BACKCHAR(eptr); 3468: #else /* without SUPPORT_UCP */ 3469: eptr -= length; 3470: #endif /* SUPPORT_UCP */ 3471: } 3472: } 3473: /* Control never gets here */ 3474: } 3475: 3476: /* If the length of a UTF-8 character is 1, we fall through here, and 3477: obey the code as for non-UTF-8 characters below, though in this case the 3478: value of fc will always be < 128. */ 3479: } 3480: else 3481: #endif /* SUPPORT_UTF */ 3482: /* When not in UTF-8 mode, load a single-byte character. */ 3483: fc = *ecode++; 3484: 3485: /* The value of fc at this point is always one character, though we may 3486: or may not be in UTF mode. The code is duplicated for the caseless and 3487: caseful cases, for speed, since matching characters is likely to be quite 3488: common. First, ensure the minimum number of matches are present. If min = 3489: max, continue at the same level without recursing. 
Otherwise, if 3490: minimizing, keep trying the rest of the expression and advancing one 3491: matching character if failing, up to the maximum. Alternatively, if 3492: maximizing, find the maximum number of characters and work backwards. */ 3493: 3494: DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max, 3495: max, (char *)eptr)); 3496: 3497: if (op >= OP_STARI) /* Caseless */ 3498: { 3499: #ifdef COMPILE_PCRE8 3500: /* fc must be < 128 if UTF is enabled. */ 3501: foc = md->fcc[fc]; 3502: #else 3503: #ifdef SUPPORT_UTF 3504: #ifdef SUPPORT_UCP 3505: if (utf && fc > 127) 3506: foc = UCD_OTHERCASE(fc); 3507: #else 3508: if (utf && fc > 127) 3509: foc = fc; 3510: #endif /* SUPPORT_UCP */ 3511: else 3512: #endif /* SUPPORT_UTF */ 3513: foc = TABLE_GET(fc, md->fcc, fc); 3514: #endif /* COMPILE_PCRE8 */ 3515: 3516: for (i = 1; i <= min; i++) 3517: { 3518: pcre_uint32 cc; /* Faster than pcre_uchar */ 3519: if (eptr >= md->end_subject) 3520: { 3521: SCHECK_PARTIAL(); 3522: RRETURN(MATCH_NOMATCH); 3523: } 3524: cc = RAWUCHARTEST(eptr); 3525: if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH); 3526: eptr++; 3527: } 3528: if (min == max) continue; 3529: if (minimize) 3530: { 3531: for (fi = min;; fi++) 3532: { 3533: pcre_uint32 cc; /* Faster than pcre_uchar */ 3534: RMATCH(eptr, ecode, offset_top, md, eptrb, RM24); 3535: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3536: if (fi >= max) RRETURN(MATCH_NOMATCH); 3537: if (eptr >= md->end_subject) 3538: { 3539: SCHECK_PARTIAL(); 3540: RRETURN(MATCH_NOMATCH); 3541: } 3542: cc = RAWUCHARTEST(eptr); 3543: if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH); 3544: eptr++; 3545: } 3546: /* Control never gets here */ 3547: } 3548: else /* Maximize */ 3549: { 3550: pp = eptr; 3551: for (i = min; i < max; i++) 3552: { 3553: pcre_uint32 cc; /* Faster than pcre_uchar */ 3554: if (eptr >= md->end_subject) 3555: { 3556: SCHECK_PARTIAL(); 3557: break; 3558: } 3559: cc = RAWUCHARTEST(eptr); 3560: if (fc != cc && foc != cc) break; 3561: 
eptr++; 3562: } 3563: if (possessive) continue; /* No backtracking */ 3564: for (;;) 3565: { 3566: if (eptr == pp) goto TAIL_RECURSE; 3567: RMATCH(eptr, ecode, offset_top, md, eptrb, RM25); 3568: eptr--; 3569: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3570: } 3571: /* Control never gets here */ 3572: } 3573: } 3574: 3575: /* Caseful comparisons (includes all multi-byte characters) */ 3576: 3577: else 3578: { 3579: for (i = 1; i <= min; i++) 3580: { 3581: if (eptr >= md->end_subject) 3582: { 3583: SCHECK_PARTIAL(); 3584: RRETURN(MATCH_NOMATCH); 3585: } 3586: if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH); 3587: } 3588: 3589: if (min == max) continue; 3590: 3591: if (minimize) 3592: { 3593: for (fi = min;; fi++) 3594: { 3595: RMATCH(eptr, ecode, offset_top, md, eptrb, RM26); 3596: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3597: if (fi >= max) RRETURN(MATCH_NOMATCH); 3598: if (eptr >= md->end_subject) 3599: { 3600: SCHECK_PARTIAL(); 3601: RRETURN(MATCH_NOMATCH); 3602: } 3603: if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH); 3604: } 3605: /* Control never gets here */ 3606: } 3607: else /* Maximize */ 3608: { 3609: pp = eptr; 3610: for (i = min; i < max; i++) 3611: { 3612: if (eptr >= md->end_subject) 3613: { 3614: SCHECK_PARTIAL(); 3615: break; 3616: } 3617: if (fc != RAWUCHARTEST(eptr)) break; 3618: eptr++; 3619: } 3620: if (possessive) continue; /* No backtracking */ 3621: for (;;) 3622: { 3623: if (eptr == pp) goto TAIL_RECURSE; 3624: RMATCH(eptr, ecode, offset_top, md, eptrb, RM27); 3625: eptr--; 3626: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3627: } 3628: /* Control never gets here */ 3629: } 3630: } 3631: /* Control never gets here */ 3632: 3633: /* Match a negated single one-byte character. The character we are 3634: checking can be multibyte. 
*/ 3635: 3636: case OP_NOT: 3637: case OP_NOTI: 3638: if (eptr >= md->end_subject) 3639: { 3640: SCHECK_PARTIAL(); 3641: RRETURN(MATCH_NOMATCH); 3642: } 3643: #ifdef SUPPORT_UTF 3644: if (utf) 3645: { 3646: register pcre_uint32 ch, och; 3647: 3648: ecode++; 3649: GETCHARINC(ch, ecode); 3650: GETCHARINC(c, eptr); 3651: 3652: if (op == OP_NOT) 3653: { 3654: if (ch == c) RRETURN(MATCH_NOMATCH); 3655: } 3656: else 3657: { 3658: #ifdef SUPPORT_UCP 3659: if (ch > 127) 3660: och = UCD_OTHERCASE(ch); 3661: #else 3662: if (ch > 127) 3663: och = ch; 3664: #endif /* SUPPORT_UCP */ 3665: else 3666: och = TABLE_GET(ch, md->fcc, ch); 3667: if (ch == c || och == c) RRETURN(MATCH_NOMATCH); 3668: } 3669: } 3670: else 3671: #endif 3672: { 3673: register pcre_uint32 ch = ecode[1]; 3674: c = *eptr++; 3675: if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c)) 3676: RRETURN(MATCH_NOMATCH); 3677: ecode += 2; 3678: } 3679: break; 3680: 3681: /* Match a negated single one-byte character repeatedly. This is almost a 3682: repeat of the code for a repeated single character, but I haven't found a 3683: nice way of commoning these up that doesn't require a test of the 3684: positive/negative option for each character match. Maybe that wouldn't add 3685: very much to the time taken, but character matching *is* what this is all 3686: about... 
*/ 3687: 3688: case OP_NOTEXACT: 3689: case OP_NOTEXACTI: 3690: min = max = GET2(ecode, 1); 3691: ecode += 1 + IMM2_SIZE; 3692: goto REPEATNOTCHAR; 3693: 3694: case OP_NOTUPTO: 3695: case OP_NOTUPTOI: 3696: case OP_NOTMINUPTO: 3697: case OP_NOTMINUPTOI: 3698: min = 0; 3699: max = GET2(ecode, 1); 3700: minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI; 3701: ecode += 1 + IMM2_SIZE; 3702: goto REPEATNOTCHAR; 3703: 3704: case OP_NOTPOSSTAR: 3705: case OP_NOTPOSSTARI: 3706: possessive = TRUE; 3707: min = 0; 3708: max = INT_MAX; 3709: ecode++; 3710: goto REPEATNOTCHAR; 3711: 3712: case OP_NOTPOSPLUS: 3713: case OP_NOTPOSPLUSI: 3714: possessive = TRUE; 3715: min = 1; 3716: max = INT_MAX; 3717: ecode++; 3718: goto REPEATNOTCHAR; 3719: 3720: case OP_NOTPOSQUERY: 3721: case OP_NOTPOSQUERYI: 3722: possessive = TRUE; 3723: min = 0; 3724: max = 1; 3725: ecode++; 3726: goto REPEATNOTCHAR; 3727: 3728: case OP_NOTPOSUPTO: 3729: case OP_NOTPOSUPTOI: 3730: possessive = TRUE; 3731: min = 0; 3732: max = GET2(ecode, 1); 3733: ecode += 1 + IMM2_SIZE; 3734: goto REPEATNOTCHAR; 3735: 3736: case OP_NOTSTAR: 3737: case OP_NOTSTARI: 3738: case OP_NOTMINSTAR: 3739: case OP_NOTMINSTARI: 3740: case OP_NOTPLUS: 3741: case OP_NOTPLUSI: 3742: case OP_NOTMINPLUS: 3743: case OP_NOTMINPLUSI: 3744: case OP_NOTQUERY: 3745: case OP_NOTQUERYI: 3746: case OP_NOTMINQUERY: 3747: case OP_NOTMINQUERYI: 3748: c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR); 3749: minimize = (c & 1) != 0; 3750: min = rep_min[c]; /* Pick up values from tables; */ 3751: max = rep_max[c]; /* zero for max => infinity */ 3752: if (max == 0) max = INT_MAX; 3753: 3754: /* Common code for all repeated single-byte matches. */ 3755: 3756: REPEATNOTCHAR: 3757: GETCHARINCTEST(fc, ecode); 3758: 3759: /* The code is duplicated for the caseless and caseful cases, for speed, 3760: since matching characters is likely to be quite common. First, ensure the 3761: minimum number of matches are present. 
If min = max, continue at the same 3762: level without recursing. Otherwise, if minimizing, keep trying the rest of 3763: the expression and advancing one matching character if failing, up to the 3764: maximum. Alternatively, if maximizing, find the maximum number of 3765: characters and work backwards. */ 3766: 3767: DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max, 3768: max, (char *)eptr)); 3769: 3770: if (op >= OP_NOTSTARI) /* Caseless */ 3771: { 3772: #ifdef SUPPORT_UTF 3773: #ifdef SUPPORT_UCP 3774: if (utf && fc > 127) 3775: foc = UCD_OTHERCASE(fc); 3776: #else 3777: if (utf && fc > 127) 3778: foc = fc; 3779: #endif /* SUPPORT_UCP */ 3780: else 3781: #endif /* SUPPORT_UTF */ 3782: foc = TABLE_GET(fc, md->fcc, fc); 3783: 3784: #ifdef SUPPORT_UTF 3785: if (utf) 3786: { 3787: register pcre_uint32 d; 3788: for (i = 1; i <= min; i++) 3789: { 3790: if (eptr >= md->end_subject) 3791: { 3792: SCHECK_PARTIAL(); 3793: RRETURN(MATCH_NOMATCH); 3794: } 3795: GETCHARINC(d, eptr); 3796: if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH); 3797: } 3798: } 3799: else 3800: #endif /* SUPPORT_UTF */ 3801: /* Not UTF mode */ 3802: { 3803: for (i = 1; i <= min; i++) 3804: { 3805: if (eptr >= md->end_subject) 3806: { 3807: SCHECK_PARTIAL(); 3808: RRETURN(MATCH_NOMATCH); 3809: } 3810: if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH); 3811: eptr++; 3812: } 3813: } 3814: 3815: if (min == max) continue; 3816: 3817: if (minimize) 3818: { 3819: #ifdef SUPPORT_UTF 3820: if (utf) 3821: { 3822: register pcre_uint32 d; 3823: for (fi = min;; fi++) 3824: { 3825: RMATCH(eptr, ecode, offset_top, md, eptrb, RM28); 3826: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3827: if (fi >= max) RRETURN(MATCH_NOMATCH); 3828: if (eptr >= md->end_subject) 3829: { 3830: SCHECK_PARTIAL(); 3831: RRETURN(MATCH_NOMATCH); 3832: } 3833: GETCHARINC(d, eptr); 3834: if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH); 3835: } 3836: } 3837: else 3838: #endif 
/*SUPPORT_UTF */ 3839: /* Not UTF mode */ 3840: { 3841: for (fi = min;; fi++) 3842: { 3843: RMATCH(eptr, ecode, offset_top, md, eptrb, RM29); 3844: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3845: if (fi >= max) RRETURN(MATCH_NOMATCH); 3846: if (eptr >= md->end_subject) 3847: { 3848: SCHECK_PARTIAL(); 3849: RRETURN(MATCH_NOMATCH); 3850: } 3851: if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH); 3852: eptr++; 3853: } 3854: } 3855: /* Control never gets here */ 3856: } 3857: 3858: /* Maximize case */ 3859: 3860: else 3861: { 3862: pp = eptr; 3863: 3864: #ifdef SUPPORT_UTF 3865: if (utf) 3866: { 3867: register pcre_uint32 d; 3868: for (i = min; i < max; i++) 3869: { 3870: int len = 1; 3871: if (eptr >= md->end_subject) 3872: { 3873: SCHECK_PARTIAL(); 3874: break; 3875: } 3876: GETCHARLEN(d, eptr, len); 3877: if (fc == d || (unsigned int)foc == d) break; 3878: eptr += len; 3879: } 3880: if (possessive) continue; /* No backtracking */ 3881: for(;;) 3882: { 3883: if (eptr == pp) goto TAIL_RECURSE; 3884: RMATCH(eptr, ecode, offset_top, md, eptrb, RM30); 3885: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3886: eptr--; 3887: BACKCHAR(eptr); 3888: } 3889: } 3890: else 3891: #endif /* SUPPORT_UTF */ 3892: /* Not UTF mode */ 3893: { 3894: for (i = min; i < max; i++) 3895: { 3896: if (eptr >= md->end_subject) 3897: { 3898: SCHECK_PARTIAL(); 3899: break; 3900: } 3901: if (fc == *eptr || foc == *eptr) break; 3902: eptr++; 3903: } 3904: if (possessive) continue; /* No backtracking */ 3905: for (;;) 3906: { 3907: if (eptr == pp) goto TAIL_RECURSE; 3908: RMATCH(eptr, ecode, offset_top, md, eptrb, RM31); 3909: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3910: eptr--; 3911: } 3912: } 3913: /* Control never gets here */ 3914: } 3915: } 3916: 3917: /* Caseful comparisons */ 3918: 3919: else 3920: { 3921: #ifdef SUPPORT_UTF 3922: if (utf) 3923: { 3924: register pcre_uint32 d; 3925: for (i = 1; i <= min; i++) 3926: { 3927: if (eptr >= md->end_subject) 3928: { 3929: SCHECK_PARTIAL(); 3930: 
RRETURN(MATCH_NOMATCH); 3931: } 3932: GETCHARINC(d, eptr); 3933: if (fc == d) RRETURN(MATCH_NOMATCH); 3934: } 3935: } 3936: else 3937: #endif 3938: /* Not UTF mode */ 3939: { 3940: for (i = 1; i <= min; i++) 3941: { 3942: if (eptr >= md->end_subject) 3943: { 3944: SCHECK_PARTIAL(); 3945: RRETURN(MATCH_NOMATCH); 3946: } 3947: if (fc == *eptr++) RRETURN(MATCH_NOMATCH); 3948: } 3949: } 3950: 3951: if (min == max) continue; 3952: 3953: if (minimize) 3954: { 3955: #ifdef SUPPORT_UTF 3956: if (utf) 3957: { 3958: register pcre_uint32 d; 3959: for (fi = min;; fi++) 3960: { 3961: RMATCH(eptr, ecode, offset_top, md, eptrb, RM32); 3962: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3963: if (fi >= max) RRETURN(MATCH_NOMATCH); 3964: if (eptr >= md->end_subject) 3965: { 3966: SCHECK_PARTIAL(); 3967: RRETURN(MATCH_NOMATCH); 3968: } 3969: GETCHARINC(d, eptr); 3970: if (fc == d) RRETURN(MATCH_NOMATCH); 3971: } 3972: } 3973: else 3974: #endif 3975: /* Not UTF mode */ 3976: { 3977: for (fi = min;; fi++) 3978: { 3979: RMATCH(eptr, ecode, offset_top, md, eptrb, RM33); 3980: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3981: if (fi >= max) RRETURN(MATCH_NOMATCH); 3982: if (eptr >= md->end_subject) 3983: { 3984: SCHECK_PARTIAL(); 3985: RRETURN(MATCH_NOMATCH); 3986: } 3987: if (fc == *eptr++) RRETURN(MATCH_NOMATCH); 3988: } 3989: } 3990: /* Control never gets here */ 3991: } 3992: 3993: /* Maximize case */ 3994: 3995: else 3996: { 3997: pp = eptr; 3998: 3999: #ifdef SUPPORT_UTF 4000: if (utf) 4001: { 4002: register pcre_uint32 d; 4003: for (i = min; i < max; i++) 4004: { 4005: int len = 1; 4006: if (eptr >= md->end_subject) 4007: { 4008: SCHECK_PARTIAL(); 4009: break; 4010: } 4011: GETCHARLEN(d, eptr, len); 4012: if (fc == d) break; 4013: eptr += len; 4014: } 4015: if (possessive) continue; /* No backtracking */ 4016: for(;;) 4017: { 4018: if (eptr == pp) goto TAIL_RECURSE; 4019: RMATCH(eptr, ecode, offset_top, md, eptrb, RM34); 4020: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4021: eptr--; 4022: 
BACKCHAR(eptr); 4023: } 4024: } 4025: else 4026: #endif 4027: /* Not UTF mode */ 4028: { 4029: for (i = min; i < max; i++) 4030: { 4031: if (eptr >= md->end_subject) 4032: { 4033: SCHECK_PARTIAL(); 4034: break; 4035: } 4036: if (fc == *eptr) break; 4037: eptr++; 4038: } 4039: if (possessive) continue; /* No backtracking */ 4040: for (;;) 4041: { 4042: if (eptr == pp) goto TAIL_RECURSE; 4043: RMATCH(eptr, ecode, offset_top, md, eptrb, RM35); 4044: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4045: eptr--; 4046: } 4047: } 4048: /* Control never gets here */ 4049: } 4050: } 4051: /* Control never gets here */ 4052: 4053: /* Match a single character type repeatedly; several different opcodes 4054: share code. This is very similar to the code for single characters, but we 4055: repeat it in the interests of efficiency. */ 4056: 4057: case OP_TYPEEXACT: 4058: min = max = GET2(ecode, 1); 4059: minimize = TRUE; 4060: ecode += 1 + IMM2_SIZE; 4061: goto REPEATTYPE; 4062: 4063: case OP_TYPEUPTO: 4064: case OP_TYPEMINUPTO: 4065: min = 0; 4066: max = GET2(ecode, 1); 4067: minimize = *ecode == OP_TYPEMINUPTO; 4068: ecode += 1 + IMM2_SIZE; 4069: goto REPEATTYPE; 4070: 4071: case OP_TYPEPOSSTAR: 4072: possessive = TRUE; 4073: min = 0; 4074: max = INT_MAX; 4075: ecode++; 4076: goto REPEATTYPE; 4077: 4078: case OP_TYPEPOSPLUS: 4079: possessive = TRUE; 4080: min = 1; 4081: max = INT_MAX; 4082: ecode++; 4083: goto REPEATTYPE; 4084: 4085: case OP_TYPEPOSQUERY: 4086: possessive = TRUE; 4087: min = 0; 4088: max = 1; 4089: ecode++; 4090: goto REPEATTYPE; 4091: 4092: case OP_TYPEPOSUPTO: 4093: possessive = TRUE; 4094: min = 0; 4095: max = GET2(ecode, 1); 4096: ecode += 1 + IMM2_SIZE; 4097: goto REPEATTYPE; 4098: 4099: case OP_TYPESTAR: 4100: case OP_TYPEMINSTAR: 4101: case OP_TYPEPLUS: 4102: case OP_TYPEMINPLUS: 4103: case OP_TYPEQUERY: 4104: case OP_TYPEMINQUERY: 4105: c = *ecode++ - OP_TYPESTAR; 4106: minimize = (c & 1) != 0; 4107: min = rep_min[c]; /* Pick up values from tables; */ 4108: max 
= rep_max[c]; /* zero for max => infinity */ 4109: if (max == 0) max = INT_MAX; 4110: 4111: /* Common code for all repeated single character type matches. Note that 4112: in UTF-8 mode, '.' matches a character of any length, but for the other 4113: character types, the valid characters are all one-byte long. */ 4114: 4115: REPEATTYPE: 4116: ctype = *ecode++; /* Code for the character type */ 4117: 4118: #ifdef SUPPORT_UCP 4119: if (ctype == OP_PROP || ctype == OP_NOTPROP) 4120: { 4121: prop_fail_result = ctype == OP_NOTPROP; 4122: prop_type = *ecode++; 4123: prop_value = *ecode++; 4124: } 4125: else prop_type = -1; 4126: #endif 4127: 4128: /* First, ensure the minimum number of matches are present. Use inline 4129: code for maximizing the speed, and do the type test once at the start 4130: (i.e. keep it out of the loop). Separate the UTF-8 code completely as that 4131: is tidier. Also separate the UCP code, which can be the same for both UTF-8 4132: and single-bytes. */ 4133: 4134: if (min > 0) 4135: { 4136: #ifdef SUPPORT_UCP 4137: if (prop_type >= 0) 4138: { 4139: switch(prop_type) 4140: { 4141: case PT_ANY: 4142: if (prop_fail_result) RRETURN(MATCH_NOMATCH); 4143: for (i = 1; i <= min; i++) 4144: { 4145: if (eptr >= md->end_subject) 4146: { 4147: SCHECK_PARTIAL(); 4148: RRETURN(MATCH_NOMATCH); 4149: } 4150: GETCHARINCTEST(c, eptr); 4151: } 4152: break; 4153: 4154: case PT_LAMP: 4155: for (i = 1; i <= min; i++) 4156: { 4157: int chartype; 4158: if (eptr >= md->end_subject) 4159: { 4160: SCHECK_PARTIAL(); 4161: RRETURN(MATCH_NOMATCH); 4162: } 4163: GETCHARINCTEST(c, eptr); 4164: chartype = UCD_CHARTYPE(c); 4165: if ((chartype == ucp_Lu || 4166: chartype == ucp_Ll || 4167: chartype == ucp_Lt) == prop_fail_result) 4168: RRETURN(MATCH_NOMATCH); 4169: } 4170: break; 4171: 4172: case PT_GC: 4173: for (i = 1; i <= min; i++) 4174: { 4175: if (eptr >= md->end_subject) 4176: { 4177: SCHECK_PARTIAL(); 4178: RRETURN(MATCH_NOMATCH); 4179: } 4180: GETCHARINCTEST(c, eptr); 
4181: if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) 4182: RRETURN(MATCH_NOMATCH); 4183: } 4184: break; 4185: 4186: case PT_PC: 4187: for (i = 1; i <= min; i++) 4188: { 4189: if (eptr >= md->end_subject) 4190: { 4191: SCHECK_PARTIAL(); 4192: RRETURN(MATCH_NOMATCH); 4193: } 4194: GETCHARINCTEST(c, eptr); 4195: if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) 4196: RRETURN(MATCH_NOMATCH); 4197: } 4198: break; 4199: 4200: case PT_SC: 4201: for (i = 1; i <= min; i++) 4202: { 4203: if (eptr >= md->end_subject) 4204: { 4205: SCHECK_PARTIAL(); 4206: RRETURN(MATCH_NOMATCH); 4207: } 4208: GETCHARINCTEST(c, eptr); 4209: if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) 4210: RRETURN(MATCH_NOMATCH); 4211: } 4212: break; 4213: 4214: case PT_ALNUM: 4215: for (i = 1; i <= min; i++) 4216: { 4217: int category; 4218: if (eptr >= md->end_subject) 4219: { 4220: SCHECK_PARTIAL(); 4221: RRETURN(MATCH_NOMATCH); 4222: } 4223: GETCHARINCTEST(c, eptr); 4224: category = UCD_CATEGORY(c); 4225: if ((category == ucp_L || category == ucp_N) == prop_fail_result) 4226: RRETURN(MATCH_NOMATCH); 4227: } 4228: break; 4229: 4230: /* Perl space used to exclude VT, but from Perl 5.18 it is included, 4231: which means that Perl space and POSIX space are now identical. PCRE 4232: was changed at release 8.34. 
*/ 4233: 4234: case PT_SPACE: /* Perl space */ 4235: case PT_PXSPACE: /* POSIX space */ 4236: for (i = 1; i <= min; i++) 4237: { 4238: if (eptr >= md->end_subject) 4239: { 4240: SCHECK_PARTIAL(); 4241: RRETURN(MATCH_NOMATCH); 4242: } 4243: GETCHARINCTEST(c, eptr); 4244: switch(c) 4245: { 4246: HSPACE_CASES: 4247: VSPACE_CASES: 4248: if (prop_fail_result) RRETURN(MATCH_NOMATCH); 4249: break; 4250: 4251: default: 4252: if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result) 4253: RRETURN(MATCH_NOMATCH); 4254: break; 4255: } 4256: } 4257: break; 4258: 4259: case PT_WORD: 4260: for (i = 1; i <= min; i++) 4261: { 4262: int category; 4263: if (eptr >= md->end_subject) 4264: { 4265: SCHECK_PARTIAL(); 4266: RRETURN(MATCH_NOMATCH); 4267: } 4268: GETCHARINCTEST(c, eptr); 4269: category = UCD_CATEGORY(c); 4270: if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE) 4271: == prop_fail_result) 4272: RRETURN(MATCH_NOMATCH); 4273: } 4274: break; 4275: 4276: case PT_CLIST: 4277: for (i = 1; i <= min; i++) 4278: { 4279: const pcre_uint32 *cp; 4280: if (eptr >= md->end_subject) 4281: { 4282: SCHECK_PARTIAL(); 4283: RRETURN(MATCH_NOMATCH); 4284: } 4285: GETCHARINCTEST(c, eptr); 4286: cp = PRIV(ucd_caseless_sets) + prop_value; 4287: for (;;) 4288: { 4289: if (c < *cp) 4290: { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } } 4291: if (c == *cp++) 4292: { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; } 4293: } 4294: } 4295: break; 4296: 4297: case PT_UCNC: 4298: for (i = 1; i <= min; i++) 4299: { 4300: if (eptr >= md->end_subject) 4301: { 4302: SCHECK_PARTIAL(); 4303: RRETURN(MATCH_NOMATCH); 4304: } 4305: GETCHARINCTEST(c, eptr); 4306: if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 4307: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 4308: c >= 0xe000) == prop_fail_result) 4309: RRETURN(MATCH_NOMATCH); 4310: } 4311: break; 4312: 4313: /* This should not occur */ 4314: 4315: default: 4316: RRETURN(PCRE_ERROR_INTERNAL); 4317: } 
4318: } 4319: 4320: /* Match extended Unicode sequences. We will get here only if the 4321: support is in the binary; otherwise a compile-time error occurs. */ 4322: 4323: else if (ctype == OP_EXTUNI) 4324: { 4325: for (i = 1; i <= min; i++) 4326: { 4327: if (eptr >= md->end_subject) 4328: { 4329: SCHECK_PARTIAL(); 4330: RRETURN(MATCH_NOMATCH); 4331: } 4332: else 4333: { 4334: int lgb, rgb; 4335: GETCHARINCTEST(c, eptr); 4336: lgb = UCD_GRAPHBREAK(c); 4337: while (eptr < md->end_subject) 4338: { 4339: int len = 1; 4340: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } 4341: rgb = UCD_GRAPHBREAK(c); 4342: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 4343: lgb = rgb; 4344: eptr += len; 4345: } 4346: } 4347: CHECK_PARTIAL(); 4348: } 4349: } 4350: 4351: else 4352: #endif /* SUPPORT_UCP */ 4353: 4354: /* Handle all other cases when the coding is UTF-8 */ 4355: 4356: #ifdef SUPPORT_UTF 4357: if (utf) switch(ctype) 4358: { 4359: case OP_ANY: 4360: for (i = 1; i <= min; i++) 4361: { 4362: if (eptr >= md->end_subject) 4363: { 4364: SCHECK_PARTIAL(); 4365: RRETURN(MATCH_NOMATCH); 4366: } 4367: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); 4368: if (md->partial != 0 && 4369: eptr + 1 >= md->end_subject && 4370: NLBLOCK->nltype == NLTYPE_FIXED && 4371: NLBLOCK->nllen == 2 && 4372: RAWUCHAR(eptr) == NLBLOCK->nl[0]) 4373: { 4374: md->hitend = TRUE; 4375: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 4376: } 4377: eptr++; 4378: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 4379: } 4380: break; 4381: 4382: case OP_ALLANY: 4383: for (i = 1; i <= min; i++) 4384: { 4385: if (eptr >= md->end_subject) 4386: { 4387: SCHECK_PARTIAL(); 4388: RRETURN(MATCH_NOMATCH); 4389: } 4390: eptr++; 4391: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 4392: } 4393: break; 4394: 4395: case OP_ANYBYTE: 4396: if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH); 4397: eptr += min; 4398: break; 4399: 4400: case OP_ANYNL: 4401: for (i = 1; i <= min; i++) 4402: { 4403: if 
(eptr >= md->end_subject) 4404: { 4405: SCHECK_PARTIAL(); 4406: RRETURN(MATCH_NOMATCH); 4407: } 4408: GETCHARINC(c, eptr); 4409: switch(c) 4410: { 4411: default: RRETURN(MATCH_NOMATCH); 4412: 4413: case CHAR_CR: 4414: if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++; 4415: break; 4416: 4417: case CHAR_LF: 4418: break; 4419: 4420: case CHAR_VT: 4421: case CHAR_FF: 4422: case CHAR_NEL: 4423: #ifndef EBCDIC 4424: case 0x2028: 4425: case 0x2029: 4426: #endif /* Not EBCDIC */ 4427: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 4428: break; 4429: } 4430: } 4431: break; 4432: 4433: case OP_NOT_HSPACE: 4434: for (i = 1; i <= min; i++) 4435: { 4436: if (eptr >= md->end_subject) 4437: { 4438: SCHECK_PARTIAL(); 4439: RRETURN(MATCH_NOMATCH); 4440: } 4441: GETCHARINC(c, eptr); 4442: switch(c) 4443: { 4444: HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */ 4445: default: break; 4446: } 4447: } 4448: break; 4449: 4450: case OP_HSPACE: 4451: for (i = 1; i <= min; i++) 4452: { 4453: if (eptr >= md->end_subject) 4454: { 4455: SCHECK_PARTIAL(); 4456: RRETURN(MATCH_NOMATCH); 4457: } 4458: GETCHARINC(c, eptr); 4459: switch(c) 4460: { 4461: HSPACE_CASES: break; /* Byte and multibyte cases */ 4462: default: RRETURN(MATCH_NOMATCH); 4463: } 4464: } 4465: break; 4466: 4467: case OP_NOT_VSPACE: 4468: for (i = 1; i <= min; i++) 4469: { 4470: if (eptr >= md->end_subject) 4471: { 4472: SCHECK_PARTIAL(); 4473: RRETURN(MATCH_NOMATCH); 4474: } 4475: GETCHARINC(c, eptr); 4476: switch(c) 4477: { 4478: VSPACE_CASES: RRETURN(MATCH_NOMATCH); 4479: default: break; 4480: } 4481: } 4482: break; 4483: 4484: case OP_VSPACE: 4485: for (i = 1; i <= min; i++) 4486: { 4487: if (eptr >= md->end_subject) 4488: { 4489: SCHECK_PARTIAL(); 4490: RRETURN(MATCH_NOMATCH); 4491: } 4492: GETCHARINC(c, eptr); 4493: switch(c) 4494: { 4495: VSPACE_CASES: break; 4496: default: RRETURN(MATCH_NOMATCH); 4497: } 4498: } 4499: break; 4500: 4501: case OP_NOT_DIGIT: 4502: for (i = 1; i <= min; i++) 
4503: { 4504: if (eptr >= md->end_subject) 4505: { 4506: SCHECK_PARTIAL(); 4507: RRETURN(MATCH_NOMATCH); 4508: } 4509: GETCHARINC(c, eptr); 4510: if (c < 128 && (md->ctypes[c] & ctype_digit) != 0) 4511: RRETURN(MATCH_NOMATCH); 4512: } 4513: break; 4514: 4515: case OP_DIGIT: 4516: for (i = 1; i <= min; i++) 4517: { 4518: pcre_uint32 cc; 4519: if (eptr >= md->end_subject) 4520: { 4521: SCHECK_PARTIAL(); 4522: RRETURN(MATCH_NOMATCH); 4523: } 4524: cc = RAWUCHAR(eptr); 4525: if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0) 4526: RRETURN(MATCH_NOMATCH); 4527: eptr++; 4528: /* No need to skip more bytes - we know it's a 1-byte character */ 4529: } 4530: break; 4531: 4532: case OP_NOT_WHITESPACE: 4533: for (i = 1; i <= min; i++) 4534: { 4535: pcre_uint32 cc; 4536: if (eptr >= md->end_subject) 4537: { 4538: SCHECK_PARTIAL(); 4539: RRETURN(MATCH_NOMATCH); 4540: } 4541: cc = RAWUCHAR(eptr); 4542: if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0) 4543: RRETURN(MATCH_NOMATCH); 4544: eptr++; 4545: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 4546: } 4547: break; 4548: 4549: case OP_WHITESPACE: 4550: for (i = 1; i <= min; i++) 4551: { 4552: pcre_uint32 cc; 4553: if (eptr >= md->end_subject) 4554: { 4555: SCHECK_PARTIAL(); 4556: RRETURN(MATCH_NOMATCH); 4557: } 4558: cc = RAWUCHAR(eptr); 4559: if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0) 4560: RRETURN(MATCH_NOMATCH); 4561: eptr++; 4562: /* No need to skip more bytes - we know it's a 1-byte character */ 4563: } 4564: break; 4565: 4566: case OP_NOT_WORDCHAR: 4567: for (i = 1; i <= min; i++) 4568: { 4569: pcre_uint32 cc; 4570: if (eptr >= md->end_subject) 4571: { 4572: SCHECK_PARTIAL(); 4573: RRETURN(MATCH_NOMATCH); 4574: } 4575: cc = RAWUCHAR(eptr); 4576: if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0) 4577: RRETURN(MATCH_NOMATCH); 4578: eptr++; 4579: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 4580: } 4581: break; 4582: 4583: case OP_WORDCHAR: 4584: for (i = 1; i <= min; i++) 4585: { 4586: 
pcre_uint32 cc; 4587: if (eptr >= md->end_subject) 4588: { 4589: SCHECK_PARTIAL(); 4590: RRETURN(MATCH_NOMATCH); 4591: } 4592: cc = RAWUCHAR(eptr); 4593: if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0) 4594: RRETURN(MATCH_NOMATCH); 4595: eptr++; 4596: /* No need to skip more bytes - we know it's a 1-byte character */ 4597: } 4598: break; 4599: 4600: default: 4601: RRETURN(PCRE_ERROR_INTERNAL); 4602: } /* End switch(ctype) */ 4603: 4604: else 4605: #endif /* SUPPORT_UTF */ 4606: 4607: /* Code for the non-UTF-8 case for minimum matching of operators other 4608: than OP_PROP and OP_NOTPROP. */ 4609: 4610: switch(ctype) 4611: { 4612: case OP_ANY: 4613: for (i = 1; i <= min; i++) 4614: { 4615: if (eptr >= md->end_subject) 4616: { 4617: SCHECK_PARTIAL(); 4618: RRETURN(MATCH_NOMATCH); 4619: } 4620: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); 4621: if (md->partial != 0 && 4622: eptr + 1 >= md->end_subject && 4623: NLBLOCK->nltype == NLTYPE_FIXED && 4624: NLBLOCK->nllen == 2 && 4625: *eptr == NLBLOCK->nl[0]) 4626: { 4627: md->hitend = TRUE; 4628: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 4629: } 4630: eptr++; 4631: } 4632: break; 4633: 4634: case OP_ALLANY: 4635: if (eptr > md->end_subject - min) 4636: { 4637: SCHECK_PARTIAL(); 4638: RRETURN(MATCH_NOMATCH); 4639: } 4640: eptr += min; 4641: break; 4642: 4643: case OP_ANYBYTE: 4644: if (eptr > md->end_subject - min) 4645: { 4646: SCHECK_PARTIAL(); 4647: RRETURN(MATCH_NOMATCH); 4648: } 4649: eptr += min; 4650: break; 4651: 4652: case OP_ANYNL: 4653: for (i = 1; i <= min; i++) 4654: { 4655: if (eptr >= md->end_subject) 4656: { 4657: SCHECK_PARTIAL(); 4658: RRETURN(MATCH_NOMATCH); 4659: } 4660: switch(*eptr++) 4661: { 4662: default: RRETURN(MATCH_NOMATCH); 4663: 4664: case CHAR_CR: 4665: if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++; 4666: break; 4667: 4668: case CHAR_LF: 4669: break; 4670: 4671: case CHAR_VT: 4672: case CHAR_FF: 4673: case CHAR_NEL: 4674: #if defined COMPILE_PCRE16 || defined 
COMPILE_PCRE32 4675: case 0x2028: 4676: case 0x2029: 4677: #endif 4678: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 4679: break; 4680: } 4681: } 4682: break; 4683: 4684: case OP_NOT_HSPACE: 4685: for (i = 1; i <= min; i++) 4686: { 4687: if (eptr >= md->end_subject) 4688: { 4689: SCHECK_PARTIAL(); 4690: RRETURN(MATCH_NOMATCH); 4691: } 4692: switch(*eptr++) 4693: { 4694: default: break; 4695: HSPACE_BYTE_CASES: 4696: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 4697: HSPACE_MULTIBYTE_CASES: 4698: #endif 4699: RRETURN(MATCH_NOMATCH); 4700: } 4701: } 4702: break; 4703: 4704: case OP_HSPACE: 4705: for (i = 1; i <= min; i++) 4706: { 4707: if (eptr >= md->end_subject) 4708: { 4709: SCHECK_PARTIAL(); 4710: RRETURN(MATCH_NOMATCH); 4711: } 4712: switch(*eptr++) 4713: { 4714: default: RRETURN(MATCH_NOMATCH); 4715: HSPACE_BYTE_CASES: 4716: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 4717: HSPACE_MULTIBYTE_CASES: 4718: #endif 4719: break; 4720: } 4721: } 4722: break; 4723: 4724: case OP_NOT_VSPACE: 4725: for (i = 1; i <= min; i++) 4726: { 4727: if (eptr >= md->end_subject) 4728: { 4729: SCHECK_PARTIAL(); 4730: RRETURN(MATCH_NOMATCH); 4731: } 4732: switch(*eptr++) 4733: { 4734: VSPACE_BYTE_CASES: 4735: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 4736: VSPACE_MULTIBYTE_CASES: 4737: #endif 4738: RRETURN(MATCH_NOMATCH); 4739: default: break; 4740: } 4741: } 4742: break; 4743: 4744: case OP_VSPACE: 4745: for (i = 1; i <= min; i++) 4746: { 4747: if (eptr >= md->end_subject) 4748: { 4749: SCHECK_PARTIAL(); 4750: RRETURN(MATCH_NOMATCH); 4751: } 4752: switch(*eptr++) 4753: { 4754: default: RRETURN(MATCH_NOMATCH); 4755: VSPACE_BYTE_CASES: 4756: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 4757: VSPACE_MULTIBYTE_CASES: 4758: #endif 4759: break; 4760: } 4761: } 4762: break; 4763: 4764: case OP_NOT_DIGIT: 4765: for (i = 1; i <= min; i++) 4766: { 4767: if (eptr >= md->end_subject) 4768: { 4769: SCHECK_PARTIAL(); 4770: RRETURN(MATCH_NOMATCH); 4771: } 4772: 
if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) 4773: RRETURN(MATCH_NOMATCH); 4774: eptr++; 4775: } 4776: break; 4777: 4778: case OP_DIGIT: 4779: for (i = 1; i <= min; i++) 4780: { 4781: if (eptr >= md->end_subject) 4782: { 4783: SCHECK_PARTIAL(); 4784: RRETURN(MATCH_NOMATCH); 4785: } 4786: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) 4787: RRETURN(MATCH_NOMATCH); 4788: eptr++; 4789: } 4790: break; 4791: 4792: case OP_NOT_WHITESPACE: 4793: for (i = 1; i <= min; i++) 4794: { 4795: if (eptr >= md->end_subject) 4796: { 4797: SCHECK_PARTIAL(); 4798: RRETURN(MATCH_NOMATCH); 4799: } 4800: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) 4801: RRETURN(MATCH_NOMATCH); 4802: eptr++; 4803: } 4804: break; 4805: 4806: case OP_WHITESPACE: 4807: for (i = 1; i <= min; i++) 4808: { 4809: if (eptr >= md->end_subject) 4810: { 4811: SCHECK_PARTIAL(); 4812: RRETURN(MATCH_NOMATCH); 4813: } 4814: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) 4815: RRETURN(MATCH_NOMATCH); 4816: eptr++; 4817: } 4818: break; 4819: 4820: case OP_NOT_WORDCHAR: 4821: for (i = 1; i <= min; i++) 4822: { 4823: if (eptr >= md->end_subject) 4824: { 4825: SCHECK_PARTIAL(); 4826: RRETURN(MATCH_NOMATCH); 4827: } 4828: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) 4829: RRETURN(MATCH_NOMATCH); 4830: eptr++; 4831: } 4832: break; 4833: 4834: case OP_WORDCHAR: 4835: for (i = 1; i <= min; i++) 4836: { 4837: if (eptr >= md->end_subject) 4838: { 4839: SCHECK_PARTIAL(); 4840: RRETURN(MATCH_NOMATCH); 4841: } 4842: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) 4843: RRETURN(MATCH_NOMATCH); 4844: eptr++; 4845: } 4846: break; 4847: 4848: default: 4849: RRETURN(PCRE_ERROR_INTERNAL); 4850: } 4851: } 4852: 4853: /* If min = max, continue at the same level without recursing */ 4854: 4855: if (min == max) continue; 4856: 4857: /* If minimizing, we have to test the rest of the pattern before each 4858: subsequent match. 
Again, separate the UTF-8 case for speed, and also 4859: separate the UCP cases. */ 4860: 4861: if (minimize) 4862: { 4863: #ifdef SUPPORT_UCP 4864: if (prop_type >= 0) 4865: { 4866: switch(prop_type) 4867: { 4868: case PT_ANY: 4869: for (fi = min;; fi++) 4870: { 4871: RMATCH(eptr, ecode, offset_top, md, eptrb, RM36); 4872: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4873: if (fi >= max) RRETURN(MATCH_NOMATCH); 4874: if (eptr >= md->end_subject) 4875: { 4876: SCHECK_PARTIAL(); 4877: RRETURN(MATCH_NOMATCH); 4878: } 4879: GETCHARINCTEST(c, eptr); 4880: if (prop_fail_result) RRETURN(MATCH_NOMATCH); 4881: } 4882: /* Control never gets here */ 4883: 4884: case PT_LAMP: 4885: for (fi = min;; fi++) 4886: { 4887: int chartype; 4888: RMATCH(eptr, ecode, offset_top, md, eptrb, RM37); 4889: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4890: if (fi >= max) RRETURN(MATCH_NOMATCH); 4891: if (eptr >= md->end_subject) 4892: { 4893: SCHECK_PARTIAL(); 4894: RRETURN(MATCH_NOMATCH); 4895: } 4896: GETCHARINCTEST(c, eptr); 4897: chartype = UCD_CHARTYPE(c); 4898: if ((chartype == ucp_Lu || 4899: chartype == ucp_Ll || 4900: chartype == ucp_Lt) == prop_fail_result) 4901: RRETURN(MATCH_NOMATCH); 4902: } 4903: /* Control never gets here */ 4904: 4905: case PT_GC: 4906: for (fi = min;; fi++) 4907: { 4908: RMATCH(eptr, ecode, offset_top, md, eptrb, RM38); 4909: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4910: if (fi >= max) RRETURN(MATCH_NOMATCH); 4911: if (eptr >= md->end_subject) 4912: { 4913: SCHECK_PARTIAL(); 4914: RRETURN(MATCH_NOMATCH); 4915: } 4916: GETCHARINCTEST(c, eptr); 4917: if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) 4918: RRETURN(MATCH_NOMATCH); 4919: } 4920: /* Control never gets here */ 4921: 4922: case PT_PC: 4923: for (fi = min;; fi++) 4924: { 4925: RMATCH(eptr, ecode, offset_top, md, eptrb, RM39); 4926: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4927: if (fi >= max) RRETURN(MATCH_NOMATCH); 4928: if (eptr >= md->end_subject) 4929: { 4930: SCHECK_PARTIAL(); 4931: 
RRETURN(MATCH_NOMATCH); 4932: } 4933: GETCHARINCTEST(c, eptr); 4934: if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) 4935: RRETURN(MATCH_NOMATCH); 4936: } 4937: /* Control never gets here */ 4938: 4939: case PT_SC: 4940: for (fi = min;; fi++) 4941: { 4942: RMATCH(eptr, ecode, offset_top, md, eptrb, RM40); 4943: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4944: if (fi >= max) RRETURN(MATCH_NOMATCH); 4945: if (eptr >= md->end_subject) 4946: { 4947: SCHECK_PARTIAL(); 4948: RRETURN(MATCH_NOMATCH); 4949: } 4950: GETCHARINCTEST(c, eptr); 4951: if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) 4952: RRETURN(MATCH_NOMATCH); 4953: } 4954: /* Control never gets here */ 4955: 4956: case PT_ALNUM: 4957: for (fi = min;; fi++) 4958: { 4959: int category; 4960: RMATCH(eptr, ecode, offset_top, md, eptrb, RM59); 4961: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4962: if (fi >= max) RRETURN(MATCH_NOMATCH); 4963: if (eptr >= md->end_subject) 4964: { 4965: SCHECK_PARTIAL(); 4966: RRETURN(MATCH_NOMATCH); 4967: } 4968: GETCHARINCTEST(c, eptr); 4969: category = UCD_CATEGORY(c); 4970: if ((category == ucp_L || category == ucp_N) == prop_fail_result) 4971: RRETURN(MATCH_NOMATCH); 4972: } 4973: /* Control never gets here */ 4974: 4975: /* Perl space used to exclude VT, but from Perl 5.18 it is included, 4976: which means that Perl space and POSIX space are now identical. PCRE 4977: was changed at release 8.34. 
*/ 4978: 4979: case PT_SPACE: /* Perl space */ 4980: case PT_PXSPACE: /* POSIX space */ 4981: for (fi = min;; fi++) 4982: { 4983: RMATCH(eptr, ecode, offset_top, md, eptrb, RM61); 4984: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4985: if (fi >= max) RRETURN(MATCH_NOMATCH); 4986: if (eptr >= md->end_subject) 4987: { 4988: SCHECK_PARTIAL(); 4989: RRETURN(MATCH_NOMATCH); 4990: } 4991: GETCHARINCTEST(c, eptr); 4992: switch(c) 4993: { 4994: HSPACE_CASES: 4995: VSPACE_CASES: 4996: if (prop_fail_result) RRETURN(MATCH_NOMATCH); 4997: break; 4998: 4999: default: 5000: if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result) 5001: RRETURN(MATCH_NOMATCH); 5002: break; 5003: } 5004: } 5005: /* Control never gets here */ 5006: 5007: case PT_WORD: 5008: for (fi = min;; fi++) 5009: { 5010: int category; 5011: RMATCH(eptr, ecode, offset_top, md, eptrb, RM62); 5012: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5013: if (fi >= max) RRETURN(MATCH_NOMATCH); 5014: if (eptr >= md->end_subject) 5015: { 5016: SCHECK_PARTIAL(); 5017: RRETURN(MATCH_NOMATCH); 5018: } 5019: GETCHARINCTEST(c, eptr); 5020: category = UCD_CATEGORY(c); 5021: if ((category == ucp_L || 5022: category == ucp_N || 5023: c == CHAR_UNDERSCORE) 5024: == prop_fail_result) 5025: RRETURN(MATCH_NOMATCH); 5026: } 5027: /* Control never gets here */ 5028: 5029: case PT_CLIST: 5030: for (fi = min;; fi++) 5031: { 5032: const pcre_uint32 *cp; 5033: RMATCH(eptr, ecode, offset_top, md, eptrb, RM67); 5034: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5035: if (fi >= max) RRETURN(MATCH_NOMATCH); 5036: if (eptr >= md->end_subject) 5037: { 5038: SCHECK_PARTIAL(); 5039: RRETURN(MATCH_NOMATCH); 5040: } 5041: GETCHARINCTEST(c, eptr); 5042: cp = PRIV(ucd_caseless_sets) + prop_value; 5043: for (;;) 5044: { 5045: if (c < *cp) 5046: { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } } 5047: if (c == *cp++) 5048: { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; } 5049: } 5050: } 5051: /* Control never gets here */ 5052: 5053: case 
PT_UCNC: 5054: for (fi = min;; fi++) 5055: { 5056: RMATCH(eptr, ecode, offset_top, md, eptrb, RM60); 5057: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5058: if (fi >= max) RRETURN(MATCH_NOMATCH); 5059: if (eptr >= md->end_subject) 5060: { 5061: SCHECK_PARTIAL(); 5062: RRETURN(MATCH_NOMATCH); 5063: } 5064: GETCHARINCTEST(c, eptr); 5065: if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 5066: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 5067: c >= 0xe000) == prop_fail_result) 5068: RRETURN(MATCH_NOMATCH); 5069: } 5070: /* Control never gets here */ 5071: 5072: /* This should never occur */ 5073: default: 5074: RRETURN(PCRE_ERROR_INTERNAL); 5075: } 5076: } 5077: 5078: /* Match extended Unicode sequences. We will get here only if the 5079: support is in the binary; otherwise a compile-time error occurs. */ 5080: 5081: else if (ctype == OP_EXTUNI) 5082: { 5083: for (fi = min;; fi++) 5084: { 5085: RMATCH(eptr, ecode, offset_top, md, eptrb, RM41); 5086: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5087: if (fi >= max) RRETURN(MATCH_NOMATCH); 5088: if (eptr >= md->end_subject) 5089: { 5090: SCHECK_PARTIAL(); 5091: RRETURN(MATCH_NOMATCH); 5092: } 5093: else 5094: { 5095: int lgb, rgb; 5096: GETCHARINCTEST(c, eptr); 5097: lgb = UCD_GRAPHBREAK(c); 5098: while (eptr < md->end_subject) 5099: { 5100: int len = 1; 5101: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } 5102: rgb = UCD_GRAPHBREAK(c); 5103: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 5104: lgb = rgb; 5105: eptr += len; 5106: } 5107: } 5108: CHECK_PARTIAL(); 5109: } 5110: } 5111: else 5112: #endif /* SUPPORT_UCP */ 5113: 5114: #ifdef SUPPORT_UTF 5115: if (utf) 5116: { 5117: for (fi = min;; fi++) 5118: { 5119: RMATCH(eptr, ecode, offset_top, md, eptrb, RM42); 5120: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5121: if (fi >= max) RRETURN(MATCH_NOMATCH); 5122: if (eptr >= md->end_subject) 5123: { 5124: SCHECK_PARTIAL(); 5125: RRETURN(MATCH_NOMATCH); 5126: } 5127: if (ctype == OP_ANY && 
IS_NEWLINE(eptr)) 5128: RRETURN(MATCH_NOMATCH); 5129: GETCHARINC(c, eptr); 5130: switch(ctype) 5131: { 5132: case OP_ANY: /* This is the non-NL case */ 5133: if (md->partial != 0 && /* Take care with CRLF partial */ 5134: eptr >= md->end_subject && 5135: NLBLOCK->nltype == NLTYPE_FIXED && 5136: NLBLOCK->nllen == 2 && 5137: c == NLBLOCK->nl[0]) 5138: { 5139: md->hitend = TRUE; 5140: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 5141: } 5142: break; 5143: 5144: case OP_ALLANY: 5145: case OP_ANYBYTE: 5146: break; 5147: 5148: case OP_ANYNL: 5149: switch(c) 5150: { 5151: default: RRETURN(MATCH_NOMATCH); 5152: case CHAR_CR: 5153: if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++; 5154: break; 5155: 5156: case CHAR_LF: 5157: break; 5158: 5159: case CHAR_VT: 5160: case CHAR_FF: 5161: case CHAR_NEL: 5162: #ifndef EBCDIC 5163: case 0x2028: 5164: case 0x2029: 5165: #endif /* Not EBCDIC */ 5166: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 5167: break; 5168: } 5169: break; 5170: 5171: case OP_NOT_HSPACE: 5172: switch(c) 5173: { 5174: HSPACE_CASES: RRETURN(MATCH_NOMATCH); 5175: default: break; 5176: } 5177: break; 5178: 5179: case OP_HSPACE: 5180: switch(c) 5181: { 5182: HSPACE_CASES: break; 5183: default: RRETURN(MATCH_NOMATCH); 5184: } 5185: break; 5186: 5187: case OP_NOT_VSPACE: 5188: switch(c) 5189: { 5190: VSPACE_CASES: RRETURN(MATCH_NOMATCH); 5191: default: break; 5192: } 5193: break; 5194: 5195: case OP_VSPACE: 5196: switch(c) 5197: { 5198: VSPACE_CASES: break; 5199: default: RRETURN(MATCH_NOMATCH); 5200: } 5201: break; 5202: 5203: case OP_NOT_DIGIT: 5204: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) 5205: RRETURN(MATCH_NOMATCH); 5206: break; 5207: 5208: case OP_DIGIT: 5209: if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0) 5210: RRETURN(MATCH_NOMATCH); 5211: break; 5212: 5213: case OP_NOT_WHITESPACE: 5214: if (c < 256 && (md->ctypes[c] & ctype_space) != 0) 5215: RRETURN(MATCH_NOMATCH); 5216: break; 5217: 5218: case OP_WHITESPACE: 5219: if (c >= 
256 || (md->ctypes[c] & ctype_space) == 0) 5220: RRETURN(MATCH_NOMATCH); 5221: break; 5222: 5223: case OP_NOT_WORDCHAR: 5224: if (c < 256 && (md->ctypes[c] & ctype_word) != 0) 5225: RRETURN(MATCH_NOMATCH); 5226: break; 5227: 5228: case OP_WORDCHAR: 5229: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) 5230: RRETURN(MATCH_NOMATCH); 5231: break; 5232: 5233: default: 5234: RRETURN(PCRE_ERROR_INTERNAL); 5235: } 5236: } 5237: } 5238: else 5239: #endif 5240: /* Not UTF mode */ 5241: { 5242: for (fi = min;; fi++) 5243: { 5244: RMATCH(eptr, ecode, offset_top, md, eptrb, RM43); 5245: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5246: if (fi >= max) RRETURN(MATCH_NOMATCH); 5247: if (eptr >= md->end_subject) 5248: { 5249: SCHECK_PARTIAL(); 5250: RRETURN(MATCH_NOMATCH); 5251: } 5252: if (ctype == OP_ANY && IS_NEWLINE(eptr)) 5253: RRETURN(MATCH_NOMATCH); 5254: c = *eptr++; 5255: switch(ctype) 5256: { 5257: case OP_ANY: /* This is the non-NL case */ 5258: if (md->partial != 0 && /* Take care with CRLF partial */ 5259: eptr >= md->end_subject && 5260: NLBLOCK->nltype == NLTYPE_FIXED && 5261: NLBLOCK->nllen == 2 && 5262: c == NLBLOCK->nl[0]) 5263: { 5264: md->hitend = TRUE; 5265: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 5266: } 5267: break; 5268: 5269: case OP_ALLANY: 5270: case OP_ANYBYTE: 5271: break; 5272: 5273: case OP_ANYNL: 5274: switch(c) 5275: { 5276: default: RRETURN(MATCH_NOMATCH); 5277: case CHAR_CR: 5278: if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++; 5279: break; 5280: 5281: case CHAR_LF: 5282: break; 5283: 5284: case CHAR_VT: 5285: case CHAR_FF: 5286: case CHAR_NEL: 5287: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 5288: case 0x2028: 5289: case 0x2029: 5290: #endif 5291: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 5292: break; 5293: } 5294: break; 5295: 5296: case OP_NOT_HSPACE: 5297: switch(c) 5298: { 5299: default: break; 5300: HSPACE_BYTE_CASES: 5301: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 5302: HSPACE_MULTIBYTE_CASES: 5303: 
#endif 5304: RRETURN(MATCH_NOMATCH); 5305: } 5306: break; 5307: 5308: case OP_HSPACE: 5309: switch(c) 5310: { 5311: default: RRETURN(MATCH_NOMATCH); 5312: HSPACE_BYTE_CASES: 5313: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 5314: HSPACE_MULTIBYTE_CASES: 5315: #endif 5316: break; 5317: } 5318: break; 5319: 5320: case OP_NOT_VSPACE: 5321: switch(c) 5322: { 5323: default: break; 5324: VSPACE_BYTE_CASES: 5325: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 5326: VSPACE_MULTIBYTE_CASES: 5327: #endif 5328: RRETURN(MATCH_NOMATCH); 5329: } 5330: break; 5331: 5332: case OP_VSPACE: 5333: switch(c) 5334: { 5335: default: RRETURN(MATCH_NOMATCH); 5336: VSPACE_BYTE_CASES: 5337: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 5338: VSPACE_MULTIBYTE_CASES: 5339: #endif 5340: break; 5341: } 5342: break; 5343: 5344: case OP_NOT_DIGIT: 5345: if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); 5346: break; 5347: 5348: case OP_DIGIT: 5349: if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); 5350: break; 5351: 5352: case OP_NOT_WHITESPACE: 5353: if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); 5354: break; 5355: 5356: case OP_WHITESPACE: 5357: if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); 5358: break; 5359: 5360: case OP_NOT_WORDCHAR: 5361: if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); 5362: break; 5363: 5364: case OP_WORDCHAR: 5365: if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); 5366: break; 5367: 5368: default: 5369: RRETURN(PCRE_ERROR_INTERNAL); 5370: } 5371: } 5372: } 5373: /* Control never gets here */ 5374: } 5375: 5376: /* If maximizing, it is worth using inline code for speed, doing the type 5377: test once at the start (i.e. keep it out of the loop). Again, keep the 5378: UTF-8 and UCP stuff separate. 
*/ 5379: 5380: else 5381: { 5382: pp = eptr; /* Remember where we started */ 5383: 5384: #ifdef SUPPORT_UCP 5385: if (prop_type >= 0) 5386: { 5387: switch(prop_type) 5388: { 5389: case PT_ANY: 5390: for (i = min; i < max; i++) 5391: { 5392: int len = 1; 5393: if (eptr >= md->end_subject) 5394: { 5395: SCHECK_PARTIAL(); 5396: break; 5397: } 5398: GETCHARLENTEST(c, eptr, len); 5399: if (prop_fail_result) break; 5400: eptr+= len; 5401: } 5402: break; 5403: 5404: case PT_LAMP: 5405: for (i = min; i < max; i++) 5406: { 5407: int chartype; 5408: int len = 1; 5409: if (eptr >= md->end_subject) 5410: { 5411: SCHECK_PARTIAL(); 5412: break; 5413: } 5414: GETCHARLENTEST(c, eptr, len); 5415: chartype = UCD_CHARTYPE(c); 5416: if ((chartype == ucp_Lu || 5417: chartype == ucp_Ll || 5418: chartype == ucp_Lt) == prop_fail_result) 5419: break; 5420: eptr+= len; 5421: } 5422: break; 5423: 5424: case PT_GC: 5425: for (i = min; i < max; i++) 5426: { 5427: int len = 1; 5428: if (eptr >= md->end_subject) 5429: { 5430: SCHECK_PARTIAL(); 5431: break; 5432: } 5433: GETCHARLENTEST(c, eptr, len); 5434: if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break; 5435: eptr+= len; 5436: } 5437: break; 5438: 5439: case PT_PC: 5440: for (i = min; i < max; i++) 5441: { 5442: int len = 1; 5443: if (eptr >= md->end_subject) 5444: { 5445: SCHECK_PARTIAL(); 5446: break; 5447: } 5448: GETCHARLENTEST(c, eptr, len); 5449: if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break; 5450: eptr+= len; 5451: } 5452: break; 5453: 5454: case PT_SC: 5455: for (i = min; i < max; i++) 5456: { 5457: int len = 1; 5458: if (eptr >= md->end_subject) 5459: { 5460: SCHECK_PARTIAL(); 5461: break; 5462: } 5463: GETCHARLENTEST(c, eptr, len); 5464: if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break; 5465: eptr+= len; 5466: } 5467: break; 5468: 5469: case PT_ALNUM: 5470: for (i = min; i < max; i++) 5471: { 5472: int category; 5473: int len = 1; 5474: if (eptr >= md->end_subject) 5475: { 5476: 
SCHECK_PARTIAL(); 5477: break; 5478: } 5479: GETCHARLENTEST(c, eptr, len); 5480: category = UCD_CATEGORY(c); 5481: if ((category == ucp_L || category == ucp_N) == prop_fail_result) 5482: break; 5483: eptr+= len; 5484: } 5485: break; 5486: 5487: /* Perl space used to exclude VT, but from Perl 5.18 it is included, 5488: which means that Perl space and POSIX space are now identical. PCRE 5489: was changed at release 8.34. */ 5490: 5491: case PT_SPACE: /* Perl space */ 5492: case PT_PXSPACE: /* POSIX space */ 5493: for (i = min; i < max; i++) 5494: { 5495: int len = 1; 5496: if (eptr >= md->end_subject) 5497: { 5498: SCHECK_PARTIAL(); 5499: break; 5500: } 5501: GETCHARLENTEST(c, eptr, len); 5502: switch(c) 5503: { 5504: HSPACE_CASES: 5505: VSPACE_CASES: 5506: if (prop_fail_result) goto ENDLOOP99; /* Break the loop */ 5507: break; 5508: 5509: default: 5510: if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result) 5511: goto ENDLOOP99; /* Break the loop */ 5512: break; 5513: } 5514: eptr+= len; 5515: } 5516: ENDLOOP99: 5517: break; 5518: 5519: case PT_WORD: 5520: for (i = min; i < max; i++) 5521: { 5522: int category; 5523: int len = 1; 5524: if (eptr >= md->end_subject) 5525: { 5526: SCHECK_PARTIAL(); 5527: break; 5528: } 5529: GETCHARLENTEST(c, eptr, len); 5530: category = UCD_CATEGORY(c); 5531: if ((category == ucp_L || category == ucp_N || 5532: c == CHAR_UNDERSCORE) == prop_fail_result) 5533: break; 5534: eptr+= len; 5535: } 5536: break; 5537: 5538: case PT_CLIST: 5539: for (i = min; i < max; i++) 5540: { 5541: const pcre_uint32 *cp; 5542: int len = 1; 5543: if (eptr >= md->end_subject) 5544: { 5545: SCHECK_PARTIAL(); 5546: break; 5547: } 5548: GETCHARLENTEST(c, eptr, len); 5549: cp = PRIV(ucd_caseless_sets) + prop_value; 5550: for (;;) 5551: { 5552: if (c < *cp) 5553: { if (prop_fail_result) break; else goto GOT_MAX; } 5554: if (c == *cp++) 5555: { if (prop_fail_result) goto GOT_MAX; else break; } 5556: } 5557: eptr += len; 5558: } 5559: GOT_MAX: 5560: break; 5561: 
5562: case PT_UCNC: 5563: for (i = min; i < max; i++) 5564: { 5565: int len = 1; 5566: if (eptr >= md->end_subject) 5567: { 5568: SCHECK_PARTIAL(); 5569: break; 5570: } 5571: GETCHARLENTEST(c, eptr, len); 5572: if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 5573: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 5574: c >= 0xe000) == prop_fail_result) 5575: break; 5576: eptr += len; 5577: } 5578: break; 5579: 5580: default: 5581: RRETURN(PCRE_ERROR_INTERNAL); 5582: } 5583: 5584: /* eptr is now past the end of the maximum run */ 5585: 5586: if (possessive) continue; /* No backtracking */ 5587: for(;;) 5588: { 5589: if (eptr == pp) goto TAIL_RECURSE; 5590: RMATCH(eptr, ecode, offset_top, md, eptrb, RM44); 5591: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5592: eptr--; 5593: if (utf) BACKCHAR(eptr); 5594: } 5595: } 5596: 5597: /* Match extended Unicode grapheme clusters. We will get here only if the 5598: support is in the binary; otherwise a compile-time error occurs. */ 5599: 5600: else if (ctype == OP_EXTUNI) 5601: { 5602: for (i = min; i < max; i++) 5603: { 5604: if (eptr >= md->end_subject) 5605: { 5606: SCHECK_PARTIAL(); 5607: break; 5608: } 5609: else 5610: { 5611: int lgb, rgb; 5612: GETCHARINCTEST(c, eptr); 5613: lgb = UCD_GRAPHBREAK(c); 5614: while (eptr < md->end_subject) 5615: { 5616: int len = 1; 5617: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } 5618: rgb = UCD_GRAPHBREAK(c); 5619: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 5620: lgb = rgb; 5621: eptr += len; 5622: } 5623: } 5624: CHECK_PARTIAL(); 5625: } 5626: 5627: /* eptr is now past the end of the maximum run */ 5628: 5629: if (possessive) continue; /* No backtracking */ 5630: 5631: for(;;) 5632: { 5633: int lgb, rgb; 5634: PCRE_PUCHAR fptr; 5635: 5636: if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */ 5637: RMATCH(eptr, ecode, offset_top, md, eptrb, RM45); 5638: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5639: 5640: /* Backtracking over an extended 
grapheme cluster involves inspecting 5641: the previous two characters (if present) to see if a break is 5642: permitted between them. */ 5643: 5644: eptr--; 5645: if (!utf) c = *eptr; else 5646: { 5647: BACKCHAR(eptr); 5648: GETCHAR(c, eptr); 5649: } 5650: rgb = UCD_GRAPHBREAK(c); 5651: 5652: for (;;) 5653: { 5654: if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */ 5655: fptr = eptr - 1; 5656: if (!utf) c = *fptr; else 5657: { 5658: BACKCHAR(fptr); 5659: GETCHAR(c, fptr); 5660: } 5661: lgb = UCD_GRAPHBREAK(c); 5662: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 5663: eptr = fptr; 5664: rgb = lgb; 5665: } 5666: } 5667: } 5668: 5669: else 5670: #endif /* SUPPORT_UCP */ 5671: 5672: #ifdef SUPPORT_UTF 5673: if (utf) 5674: { 5675: switch(ctype) 5676: { 5677: case OP_ANY: 5678: if (max < INT_MAX) 5679: { 5680: for (i = min; i < max; i++) 5681: { 5682: if (eptr >= md->end_subject) 5683: { 5684: SCHECK_PARTIAL(); 5685: break; 5686: } 5687: if (IS_NEWLINE(eptr)) break; 5688: if (md->partial != 0 && /* Take care with CRLF partial */ 5689: eptr + 1 >= md->end_subject && 5690: NLBLOCK->nltype == NLTYPE_FIXED && 5691: NLBLOCK->nllen == 2 && 5692: RAWUCHAR(eptr) == NLBLOCK->nl[0]) 5693: { 5694: md->hitend = TRUE; 5695: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 5696: } 5697: eptr++; 5698: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 5699: } 5700: } 5701: 5702: /* Handle unlimited UTF-8 repeat */ 5703: 5704: else 5705: { 5706: for (i = min; i < max; i++) 5707: { 5708: if (eptr >= md->end_subject) 5709: { 5710: SCHECK_PARTIAL(); 5711: break; 5712: } 5713: if (IS_NEWLINE(eptr)) break; 5714: if (md->partial != 0 && /* Take care with CRLF partial */ 5715: eptr + 1 >= md->end_subject && 5716: NLBLOCK->nltype == NLTYPE_FIXED && 5717: NLBLOCK->nllen == 2 && 5718: RAWUCHAR(eptr) == NLBLOCK->nl[0]) 5719: { 5720: md->hitend = TRUE; 5721: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 5722: } 5723: eptr++; 5724: ACROSSCHAR(eptr < md->end_subject, *eptr, 
eptr++); 5725: } 5726: } 5727: break; 5728: 5729: case OP_ALLANY: 5730: if (max < INT_MAX) 5731: { 5732: for (i = min; i < max; i++) 5733: { 5734: if (eptr >= md->end_subject) 5735: { 5736: SCHECK_PARTIAL(); 5737: break; 5738: } 5739: eptr++; 5740: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 5741: } 5742: } 5743: else 5744: { 5745: eptr = md->end_subject; /* Unlimited UTF-8 repeat */ 5746: SCHECK_PARTIAL(); 5747: } 5748: break; 5749: 5750: /* The byte case is the same as non-UTF8 */ 5751: 5752: case OP_ANYBYTE: 5753: c = max - min; 5754: if (c > (unsigned int)(md->end_subject - eptr)) 5755: { 5756: eptr = md->end_subject; 5757: SCHECK_PARTIAL(); 5758: } 5759: else eptr += c; 5760: break; 5761: 5762: case OP_ANYNL: 5763: for (i = min; i < max; i++) 5764: { 5765: int len = 1; 5766: if (eptr >= md->end_subject) 5767: { 5768: SCHECK_PARTIAL(); 5769: break; 5770: } 5771: GETCHARLEN(c, eptr, len); 5772: if (c == CHAR_CR) 5773: { 5774: if (++eptr >= md->end_subject) break; 5775: if (RAWUCHAR(eptr) == CHAR_LF) eptr++; 5776: } 5777: else 5778: { 5779: if (c != CHAR_LF && 5780: (md->bsr_anycrlf || 5781: (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL 5782: #ifndef EBCDIC 5783: && c != 0x2028 && c != 0x2029 5784: #endif /* Not EBCDIC */ 5785: ))) 5786: break; 5787: eptr += len; 5788: } 5789: } 5790: break; 5791: 5792: case OP_NOT_HSPACE: 5793: case OP_HSPACE: 5794: for (i = min; i < max; i++) 5795: { 5796: BOOL gotspace; 5797: int len = 1; 5798: if (eptr >= md->end_subject) 5799: { 5800: SCHECK_PARTIAL(); 5801: break; 5802: } 5803: GETCHARLEN(c, eptr, len); 5804: switch(c) 5805: { 5806: HSPACE_CASES: gotspace = TRUE; break; 5807: default: gotspace = FALSE; break; 5808: } 5809: if (gotspace == (ctype == OP_NOT_HSPACE)) break; 5810: eptr += len; 5811: } 5812: break; 5813: 5814: case OP_NOT_VSPACE: 5815: case OP_VSPACE: 5816: for (i = min; i < max; i++) 5817: { 5818: BOOL gotspace; 5819: int len = 1; 5820: if (eptr >= md->end_subject) 5821: { 5822: SCHECK_PARTIAL(); 5823: 
break; 5824: } 5825: GETCHARLEN(c, eptr, len); 5826: switch(c) 5827: { 5828: VSPACE_CASES: gotspace = TRUE; break; 5829: default: gotspace = FALSE; break; 5830: } 5831: if (gotspace == (ctype == OP_NOT_VSPACE)) break; 5832: eptr += len; 5833: } 5834: break; 5835: 5836: case OP_NOT_DIGIT: 5837: for (i = min; i < max; i++) 5838: { 5839: int len = 1; 5840: if (eptr >= md->end_subject) 5841: { 5842: SCHECK_PARTIAL(); 5843: break; 5844: } 5845: GETCHARLEN(c, eptr, len); 5846: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break; 5847: eptr+= len; 5848: } 5849: break; 5850: 5851: case OP_DIGIT: 5852: for (i = min; i < max; i++) 5853: { 5854: int len = 1; 5855: if (eptr >= md->end_subject) 5856: { 5857: SCHECK_PARTIAL(); 5858: break; 5859: } 5860: GETCHARLEN(c, eptr, len); 5861: if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break; 5862: eptr+= len; 5863: } 5864: break; 5865: 5866: case OP_NOT_WHITESPACE: 5867: for (i = min; i < max; i++) 5868: { 5869: int len = 1; 5870: if (eptr >= md->end_subject) 5871: { 5872: SCHECK_PARTIAL(); 5873: break; 5874: } 5875: GETCHARLEN(c, eptr, len); 5876: if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break; 5877: eptr+= len; 5878: } 5879: break; 5880: 5881: case OP_WHITESPACE: 5882: for (i = min; i < max; i++) 5883: { 5884: int len = 1; 5885: if (eptr >= md->end_subject) 5886: { 5887: SCHECK_PARTIAL(); 5888: break; 5889: } 5890: GETCHARLEN(c, eptr, len); 5891: if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break; 5892: eptr+= len; 5893: } 5894: break; 5895: 5896: case OP_NOT_WORDCHAR: 5897: for (i = min; i < max; i++) 5898: { 5899: int len = 1; 5900: if (eptr >= md->end_subject) 5901: { 5902: SCHECK_PARTIAL(); 5903: break; 5904: } 5905: GETCHARLEN(c, eptr, len); 5906: if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break; 5907: eptr+= len; 5908: } 5909: break; 5910: 5911: case OP_WORDCHAR: 5912: for (i = min; i < max; i++) 5913: { 5914: int len = 1; 5915: if (eptr >= md->end_subject) 5916: { 5917: SCHECK_PARTIAL(); 
5918: break; 5919: } 5920: GETCHARLEN(c, eptr, len); 5921: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break; 5922: eptr+= len; 5923: } 5924: break; 5925: 5926: default: 5927: RRETURN(PCRE_ERROR_INTERNAL); 5928: } 5929: 5930: if (possessive) continue; /* No backtracking */ 5931: for(;;) 5932: { 5933: if (eptr == pp) goto TAIL_RECURSE; 5934: RMATCH(eptr, ecode, offset_top, md, eptrb, RM46); 5935: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5936: eptr--; 5937: BACKCHAR(eptr); 5938: if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL && 5939: RAWUCHAR(eptr - 1) == CHAR_CR) eptr--; 5940: } 5941: } 5942: else 5943: #endif /* SUPPORT_UTF */ 5944: /* Not UTF mode */ 5945: { 5946: switch(ctype) 5947: { 5948: case OP_ANY: 5949: for (i = min; i < max; i++) 5950: { 5951: if (eptr >= md->end_subject) 5952: { 5953: SCHECK_PARTIAL(); 5954: break; 5955: } 5956: if (IS_NEWLINE(eptr)) break; 5957: if (md->partial != 0 && /* Take care with CRLF partial */ 5958: eptr + 1 >= md->end_subject && 5959: NLBLOCK->nltype == NLTYPE_FIXED && 5960: NLBLOCK->nllen == 2 && 5961: *eptr == NLBLOCK->nl[0]) 5962: { 5963: md->hitend = TRUE; 5964: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 5965: } 5966: eptr++; 5967: } 5968: break; 5969: 5970: case OP_ALLANY: 5971: case OP_ANYBYTE: 5972: c = max - min; 5973: if (c > (unsigned int)(md->end_subject - eptr)) 5974: { 5975: eptr = md->end_subject; 5976: SCHECK_PARTIAL(); 5977: } 5978: else eptr += c; 5979: break; 5980: 5981: case OP_ANYNL: 5982: for (i = min; i < max; i++) 5983: { 5984: if (eptr >= md->end_subject) 5985: { 5986: SCHECK_PARTIAL(); 5987: break; 5988: } 5989: c = *eptr; 5990: if (c == CHAR_CR) 5991: { 5992: if (++eptr >= md->end_subject) break; 5993: if (*eptr == CHAR_LF) eptr++; 5994: } 5995: else 5996: { 5997: if (c != CHAR_LF && (md->bsr_anycrlf || 5998: (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL 5999: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 6000: && c != 0x2028 && c != 0x2029 6001: #endif 6002: ))) 
break; 6003: eptr++; 6004: } 6005: } 6006: break; 6007: 6008: case OP_NOT_HSPACE: 6009: for (i = min; i < max; i++) 6010: { 6011: if (eptr >= md->end_subject) 6012: { 6013: SCHECK_PARTIAL(); 6014: break; 6015: } 6016: switch(*eptr) 6017: { 6018: default: eptr++; break; 6019: HSPACE_BYTE_CASES: 6020: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 6021: HSPACE_MULTIBYTE_CASES: 6022: #endif 6023: goto ENDLOOP00; 6024: } 6025: } 6026: ENDLOOP00: 6027: break; 6028: 6029: case OP_HSPACE: 6030: for (i = min; i < max; i++) 6031: { 6032: if (eptr >= md->end_subject) 6033: { 6034: SCHECK_PARTIAL(); 6035: break; 6036: } 6037: switch(*eptr) 6038: { 6039: default: goto ENDLOOP01; 6040: HSPACE_BYTE_CASES: 6041: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 6042: HSPACE_MULTIBYTE_CASES: 6043: #endif 6044: eptr++; break; 6045: } 6046: } 6047: ENDLOOP01: 6048: break; 6049: 6050: case OP_NOT_VSPACE: 6051: for (i = min; i < max; i++) 6052: { 6053: if (eptr >= md->end_subject) 6054: { 6055: SCHECK_PARTIAL(); 6056: break; 6057: } 6058: switch(*eptr) 6059: { 6060: default: eptr++; break; 6061: VSPACE_BYTE_CASES: 6062: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 6063: VSPACE_MULTIBYTE_CASES: 6064: #endif 6065: goto ENDLOOP02; 6066: } 6067: } 6068: ENDLOOP02: 6069: break; 6070: 6071: case OP_VSPACE: 6072: for (i = min; i < max; i++) 6073: { 6074: if (eptr >= md->end_subject) 6075: { 6076: SCHECK_PARTIAL(); 6077: break; 6078: } 6079: switch(*eptr) 6080: { 6081: default: goto ENDLOOP03; 6082: VSPACE_BYTE_CASES: 6083: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 6084: VSPACE_MULTIBYTE_CASES: 6085: #endif 6086: eptr++; break; 6087: } 6088: } 6089: ENDLOOP03: 6090: break; 6091: 6092: case OP_NOT_DIGIT: 6093: for (i = min; i < max; i++) 6094: { 6095: if (eptr >= md->end_subject) 6096: { 6097: SCHECK_PARTIAL(); 6098: break; 6099: } 6100: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break; 6101: eptr++; 6102: } 6103: break; 6104: 6105: case OP_DIGIT: 
6106: for (i = min; i < max; i++) 6107: { 6108: if (eptr >= md->end_subject) 6109: { 6110: SCHECK_PARTIAL(); 6111: break; 6112: } 6113: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break; 6114: eptr++; 6115: } 6116: break; 6117: 6118: case OP_NOT_WHITESPACE: 6119: for (i = min; i < max; i++) 6120: { 6121: if (eptr >= md->end_subject) 6122: { 6123: SCHECK_PARTIAL(); 6124: break; 6125: } 6126: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break; 6127: eptr++; 6128: } 6129: break; 6130: 6131: case OP_WHITESPACE: 6132: for (i = min; i < max; i++) 6133: { 6134: if (eptr >= md->end_subject) 6135: { 6136: SCHECK_PARTIAL(); 6137: break; 6138: } 6139: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break; 6140: eptr++; 6141: } 6142: break; 6143: 6144: case OP_NOT_WORDCHAR: 6145: for (i = min; i < max; i++) 6146: { 6147: if (eptr >= md->end_subject) 6148: { 6149: SCHECK_PARTIAL(); 6150: break; 6151: } 6152: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break; 6153: eptr++; 6154: } 6155: break; 6156: 6157: case OP_WORDCHAR: 6158: for (i = min; i < max; i++) 6159: { 6160: if (eptr >= md->end_subject) 6161: { 6162: SCHECK_PARTIAL(); 6163: break; 6164: } 6165: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break; 6166: eptr++; 6167: } 6168: break; 6169: 6170: default: 6171: RRETURN(PCRE_ERROR_INTERNAL); 6172: } 6173: 6174: if (possessive) continue; /* No backtracking */ 6175: for (;;) 6176: { 6177: if (eptr == pp) goto TAIL_RECURSE; 6178: RMATCH(eptr, ecode, offset_top, md, eptrb, RM47); 6179: if (rrc != MATCH_NOMATCH) RRETURN(rrc); 6180: eptr--; 6181: if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF && 6182: eptr[-1] == CHAR_CR) eptr--; 6183: } 6184: } 6185: 6186: /* Control never gets here */ 6187: } 6188: 6189: /* There's been some horrible disaster. Arrival here can only mean there is 6190: something seriously wrong in the code above or the OP_xxx definitions. 
*/ 6191: 6192: default: 6193: DPRINTF(("Unknown opcode %d\n", *ecode)); 6194: RRETURN(PCRE_ERROR_UNKNOWN_OPCODE); 6195: } 6196: 6197: /* Do not stick any code in here without much thought; it is assumed 6198: that "continue" in the code above comes out to here to repeat the main 6199: loop. */ 6200: 6201: } /* End of main loop */ 6202: /* Control never reaches here */ 6203: 6204: 6205: /* When compiling to use the heap rather than the stack for recursive calls to 6206: match(), the RRETURN() macro jumps here. The number that is saved in 6207: frame->Xwhere indicates which label we actually want to return to. */ 6208: 6209: #ifdef NO_RECURSE 6210: #define LBL(val) case val: goto L_RM##val; 6211: HEAP_RETURN: 6212: switch (frame->Xwhere) 6213: { 6214: LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8) 6215: LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17) 6216: LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33) 6217: LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52) 6218: LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64) 6219: LBL(65) LBL(66) 6220: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 6221: LBL(20) LBL(21) 6222: #endif 6223: #ifdef SUPPORT_UTF 6224: LBL(16) LBL(18) 6225: LBL(22) LBL(23) LBL(28) LBL(30) 6226: LBL(32) LBL(34) LBL(42) LBL(46) 6227: #ifdef SUPPORT_UCP 6228: LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45) 6229: LBL(59) LBL(60) LBL(61) LBL(62) LBL(67) 6230: #endif /* SUPPORT_UCP */ 6231: #endif /* SUPPORT_UTF */ 6232: default: 6233: DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere)); 6234: return PCRE_ERROR_INTERNAL; 6235: } 6236: #undef LBL 6237: #endif /* NO_RECURSE */ 6238: } 6239: 6240: 6241: /*************************************************************************** 6242: **************************************************************************** 6243: RECURSION IN THE match() FUNCTION 6244: 6245: Undefine all the macros that 
were defined above to handle this. */ 6246: 6247: #ifdef NO_RECURSE 6248: #undef eptr 6249: #undef ecode 6250: #undef mstart 6251: #undef offset_top 6252: #undef eptrb 6253: #undef flags 6254: 6255: #undef callpat 6256: #undef charptr 6257: #undef data 6258: #undef next 6259: #undef pp 6260: #undef prev 6261: #undef saved_eptr 6262: 6263: #undef new_recursive 6264: 6265: #undef cur_is_word 6266: #undef condition 6267: #undef prev_is_word 6268: 6269: #undef ctype 6270: #undef length 6271: #undef max 6272: #undef min 6273: #undef number 6274: #undef offset 6275: #undef op 6276: #undef save_capture_last 6277: #undef save_offset1 6278: #undef save_offset2 6279: #undef save_offset3 6280: #undef stacksave 6281: 6282: #undef newptrb 6283: 6284: #endif 6285: 6286: /* These two are defined as macros in both cases */ 6287: 6288: #undef fc 6289: #undef fi 6290: 6291: /*************************************************************************** 6292: ***************************************************************************/ 6293: 6294: 6295: #ifdef NO_RECURSE 6296: /************************************************* 6297: * Release allocated heap frames * 6298: *************************************************/ 6299: 6300: /* This function releases all the allocated frames. The base frame is on the 6301: machine stack, and so must not be freed. 
6302: 6303: Argument: the address of the base frame 6304: Returns: nothing 6305: */ 6306: /* release_match_heapframes walks the Xnextframe chain that hangs off frame_base and passes every frame it finds to PUBL(stack_free). frame_base itself is only read, never released. */ 6307: static void 6308: release_match_heapframes (heapframe *frame_base) 6309: { 6310: heapframe *nextframe = frame_base->Xnextframe; /* start at the frame after the base */ 6311: while (nextframe != NULL) 6312: { 6313: heapframe *oldframe = nextframe; 6314: nextframe = nextframe->Xnextframe; /* read the link before oldframe is released */ 6315: (PUBL(stack_free))(oldframe); 6316: } /* NOTE(review): frame_base->Xnextframe is not reset here, so after the loop it still holds the address of a released frame; confirm that no caller walks the chain again */ 6317: } 6318: #endif 6319: 6320: 6321: /************************************************* 6322: * Execute a Regular Expression * 6323: *************************************************/ 6324: 6325: /* This function applies a compiled re to a subject string and picks out 6326: portions of the string if it matches. Two elements in the vector are set for 6327: each substring: the offsets to the start and end of the substring. 6328: 6329: Arguments: 6330: argument_re points to the compiled expression 6331: extra_data points to extra data or is NULL 6332: subject points to the subject string 6333: length length of subject string (may contain binary zeros) 6334: start_offset where to start in the subject string 6335: options option bits 6336: offsets points to a vector of ints to be filled in with offsets 6337: offsetcount the number of elements in the vector 6338: 6339: Returns: > 0 => success; value is the number of elements filled in 6340: = 0 => success, but offsets is not big enough 6341: -1 => failed to match 6342: < -1 => some kind of unexpected problem 6343: */ 6344: 6345: #if defined COMPILE_PCRE8 6346: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION 6347: pcre_exec(const pcre *argument_re, const pcre_extra *extra_data, 6348: PCRE_SPTR subject, int length, int start_offset, int options, int *offsets, 6349: int offsetcount) 6350: #elif defined COMPILE_PCRE16 6351: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION 6352: pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data, 6353: PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets, 6354: int offsetcount) 6355: 
#elif defined COMPILE_PCRE32 6356: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION 6357: pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data, 6358: PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets, 6359: int offsetcount) 6360: #endif 6361: { 6362: int rc, ocount, arg_offset_max; 6363: int newline; 6364: BOOL using_temporary_offsets = FALSE; 6365: BOOL anchored; 6366: BOOL startline; 6367: BOOL firstline; 6368: BOOL utf; 6369: BOOL has_first_char = FALSE; 6370: BOOL has_req_char = FALSE; 6371: pcre_uchar first_char = 0; 6372: pcre_uchar first_char2 = 0; 6373: pcre_uchar req_char = 0; 6374: pcre_uchar req_char2 = 0; 6375: match_data match_block; 6376: match_data *md = &match_block; 6377: const pcre_uint8 *tables; 6378: const pcre_uint8 *start_bits = NULL; 6379: PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset; 6380: PCRE_PUCHAR end_subject; 6381: PCRE_PUCHAR start_partial = NULL; 6382: PCRE_PUCHAR match_partial = NULL; 6383: PCRE_PUCHAR req_char_ptr = start_match - 1; 6384: 6385: const pcre_study_data *study; 6386: const REAL_PCRE *re = (const REAL_PCRE *)argument_re; 6387: 6388: #ifdef NO_RECURSE 6389: heapframe frame_zero; 6390: frame_zero.Xprevframe = NULL; /* Marks the top level */ 6391: frame_zero.Xnextframe = NULL; /* None are allocated yet */ 6392: md->match_frames_base = &frame_zero; 6393: #endif 6394: 6395: /* Check for the special magic call that measures the size of the stack used 6396: per recursive call of match(). Without the funny casting for sizeof, a Windows 6397: compiler gave this error: "unary minus operator applied to unsigned type, 6398: result still unsigned". Hopefully the cast fixes that. 
*/ 6399: 6400: if (re == NULL && extra_data == NULL && subject == NULL && length == -999 && 6401: start_offset == -999) 6402: #ifdef NO_RECURSE 6403: return -((int)sizeof(heapframe)); 6404: #else 6405: return match(NULL, NULL, NULL, 0, NULL, NULL, 0); 6406: #endif 6407: 6408: /* Plausibility checks */ 6409: 6410: if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; 6411: if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0)) 6412: return PCRE_ERROR_NULL; 6413: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; 6414: if (length < 0) return PCRE_ERROR_BADLENGTH; 6415: if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET; 6416: 6417: /* Check that the first field in the block is the magic number. If it is not, 6418: return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to 6419: REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which 6420: means that the pattern is likely compiled with different endianness. */ 6421: 6422: if (re->magic_number != MAGIC_NUMBER) 6423: return re->magic_number == REVERSED_MAGIC_NUMBER? 6424: PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC; 6425: if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE; 6426: 6427: /* These two settings are used in the code for checking a UTF-8 string that 6428: follows immediately afterwards. Other values in the md block are used only 6429: during "normal" pcre_exec() processing, not when the JIT support is in use, 6430: so they are set up later. */ 6431: 6432: /* PCRE_UTF16 has the same value as PCRE_UTF8. */ 6433: utf = md->utf = (re->options & PCRE_UTF8) != 0; 6434: md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 : 6435: ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0; 6436: 6437: /* Check a UTF-8 string if required. Pass back the character offset and error 6438: code for an invalid string if a results vector is available. 
*/ 6439: 6440: #ifdef SUPPORT_UTF 6441: if (utf && (options & PCRE_NO_UTF8_CHECK) == 0) 6442: { 6443: int erroroffset; 6444: int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset); 6445: if (errorcode != 0) 6446: { 6447: if (offsetcount >= 2) 6448: { 6449: offsets[0] = erroroffset; 6450: offsets[1] = errorcode; 6451: } 6452: #if defined COMPILE_PCRE8 6453: return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)? 6454: PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8; 6455: #elif defined COMPILE_PCRE16 6456: return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)? 6457: PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16; 6458: #elif defined COMPILE_PCRE32 6459: return PCRE_ERROR_BADUTF32; 6460: #endif 6461: } 6462: #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16 6463: /* Check that a start_offset points to the start of a UTF character. */ 6464: if (start_offset > 0 && start_offset < length && 6465: NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset])) 6466: return PCRE_ERROR_BADUTF8_OFFSET; 6467: #endif 6468: } 6469: #endif 6470: 6471: /* If the pattern was successfully studied with JIT support, run the JIT 6472: executable instead of the rest of this function. Most options must be set at 6473: compile time for the JIT code to be usable. Fallback to the normal code path if 6474: an unsupported flag is set. */ 6475: 6476: #ifdef SUPPORT_JIT 6477: if (extra_data != NULL 6478: && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT | 6479: PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT 6480: && extra_data->executable_jit != NULL 6481: && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0) 6482: { 6483: rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length, 6484: start_offset, options, offsets, offsetcount); 6485: 6486: /* PCRE_ERROR_NULL means that the selected normal or partial matching 6487: mode is not compiled. In this case we simply fallback to interpreter. 
*/ 6488: 6489: if (rc != PCRE_ERROR_JIT_BADOPTION) return rc; 6490: } 6491: #endif 6492: 6493: /* Carry on with non-JIT matching. This information is for finding all the 6494: numbers associated with a given name, for condition testing. */ 6495: 6496: md->name_table = (pcre_uchar *)re + re->name_table_offset; 6497: md->name_count = re->name_count; 6498: md->name_entry_size = re->name_entry_size; 6499: 6500: /* Fish out the optional data from the extra_data structure, first setting 6501: the default values. */ 6502: 6503: study = NULL; 6504: md->match_limit = MATCH_LIMIT; 6505: md->match_limit_recursion = MATCH_LIMIT_RECURSION; 6506: md->callout_data = NULL; 6507: 6508: /* The table pointer is always in native byte order. */ 6509: 6510: tables = re->tables; 6511: 6512: /* The two limit values override the defaults, whatever their value. */ 6513: 6514: if (extra_data != NULL) 6515: { 6516: register unsigned int flags = extra_data->flags; 6517: if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) 6518: study = (const pcre_study_data *)extra_data->study_data; 6519: if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) 6520: md->match_limit = extra_data->match_limit; 6521: if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) 6522: md->match_limit_recursion = extra_data->match_limit_recursion; 6523: if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) 6524: md->callout_data = extra_data->callout_data; 6525: if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables; 6526: } 6527: 6528: /* Limits in the regex override only if they are smaller. */ 6529: 6530: if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit) 6531: md->match_limit = re->limit_match; 6532: 6533: if ((re->flags & PCRE_RLSET) != 0 && 6534: re->limit_recursion < md->match_limit_recursion) 6535: md->match_limit_recursion = re->limit_recursion; 6536: 6537: /* If the exec call supplied NULL for tables, use the inbuilt ones. 
This 6538: is a feature that makes it possible to save compiled regex and re-use them 6539: in other programs later. */ 6540: 6541: if (tables == NULL) tables = PRIV(default_tables); 6542: 6543: /* Set up other data */ 6544: 6545: anchored = ((re->options | options) & PCRE_ANCHORED) != 0; 6546: startline = (re->flags & PCRE_STARTLINE) != 0; 6547: firstline = (re->options & PCRE_FIRSTLINE) != 0; 6548: 6549: /* The code starts after the real_pcre block and the capture name table. */ 6550: 6551: md->start_code = (const pcre_uchar *)re + re->name_table_offset + 6552: re->name_count * re->name_entry_size; 6553: 6554: md->start_subject = (PCRE_PUCHAR)subject; 6555: md->start_offset = start_offset; 6556: md->end_subject = md->start_subject + length; 6557: end_subject = md->end_subject; 6558: 6559: md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; 6560: md->use_ucp = (re->options & PCRE_UCP) != 0; 6561: md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0; 6562: md->ignore_skip_arg = 0; 6563: 6564: /* Some options are unpacked into BOOL variables in the hope that testing 6565: them will be faster than individual option bits. */ 6566: 6567: md->notbol = (options & PCRE_NOTBOL) != 0; 6568: md->noteol = (options & PCRE_NOTEOL) != 0; 6569: md->notempty = (options & PCRE_NOTEMPTY) != 0; 6570: md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0; 6571: 6572: md->hitend = FALSE; 6573: md->mark = md->nomatch_mark = NULL; /* In case never set */ 6574: 6575: md->recursive = NULL; /* No recursion at top level */ 6576: md->hasthen = (re->flags & PCRE_HASTHEN) != 0; 6577: 6578: md->lcc = tables + lcc_offset; 6579: md->fcc = tables + fcc_offset; 6580: md->ctypes = tables + ctypes_offset; 6581: 6582: /* Handle different \R options. 
*/ 6583: 6584: switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) 6585: { 6586: case 0: 6587: if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0) 6588: md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0; 6589: else 6590: #ifdef BSR_ANYCRLF 6591: md->bsr_anycrlf = TRUE; 6592: #else 6593: md->bsr_anycrlf = FALSE; 6594: #endif 6595: break; 6596: 6597: case PCRE_BSR_ANYCRLF: 6598: md->bsr_anycrlf = TRUE; 6599: break; 6600: 6601: case PCRE_BSR_UNICODE: 6602: md->bsr_anycrlf = FALSE; 6603: break; 6604: 6605: default: return PCRE_ERROR_BADNEWLINE; 6606: } 6607: 6608: /* Handle different types of newline. The three bits give eight cases. If 6609: nothing is set at run time, whatever was used at compile time applies. */ 6610: 6611: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : 6612: (pcre_uint32)options) & PCRE_NEWLINE_BITS) 6613: { 6614: case 0: newline = NEWLINE; break; /* Compile-time default */ 6615: case PCRE_NEWLINE_CR: newline = CHAR_CR; break; 6616: case PCRE_NEWLINE_LF: newline = CHAR_NL; break; 6617: case PCRE_NEWLINE_CR+ 6618: PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break; 6619: case PCRE_NEWLINE_ANY: newline = -1; break; 6620: case PCRE_NEWLINE_ANYCRLF: newline = -2; break; 6621: default: return PCRE_ERROR_BADNEWLINE; 6622: } 6623: 6624: if (newline == -2) 6625: { 6626: md->nltype = NLTYPE_ANYCRLF; 6627: } 6628: else if (newline < 0) 6629: { 6630: md->nltype = NLTYPE_ANY; 6631: } 6632: else 6633: { 6634: md->nltype = NLTYPE_FIXED; 6635: if (newline > 255) 6636: { 6637: md->nllen = 2; 6638: md->nl[0] = (newline >> 8) & 255; 6639: md->nl[1] = newline & 255; 6640: } 6641: else 6642: { 6643: md->nllen = 1; 6644: md->nl[0] = newline; 6645: } 6646: } 6647: 6648: /* Partial matching was originally supported only for a restricted set of 6649: regexes; from release 8.00 there are no restrictions, but the bits are still 6650: defined (though never set). So there's no harm in leaving this code. 
*/ 6651: 6652: if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0) 6653: return PCRE_ERROR_BADPARTIAL; 6654: 6655: /* If the expression has got more back references than the offsets supplied can 6656: hold, we get a temporary chunk of working store to use during the matching. 6657: Otherwise, we can use the vector supplied, rounding down its size to a multiple 6658: of 3. */ 6659: 6660: ocount = offsetcount - (offsetcount % 3); 6661: arg_offset_max = (2*ocount)/3; 6662: 6663: if (re->top_backref > 0 && re->top_backref >= ocount/3) 6664: { 6665: ocount = re->top_backref * 3 + 3; 6666: md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int)); 6667: if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY; 6668: using_temporary_offsets = TRUE; 6669: DPRINTF(("Got memory to hold back references\n")); 6670: } 6671: else md->offset_vector = offsets; 6672: md->offset_end = ocount; 6673: md->offset_max = (2*ocount)/3; 6674: md->capture_last = 0; 6675: 6676: /* Reset the working variable associated with each extraction. These should 6677: never be used unless previously set, but they get saved and restored, and so we 6678: initialize them to avoid reading uninitialized locations. Also, unset the 6679: offsets for the matched string. This is really just for tidiness with callouts, 6680: in case they inspect these fields. */ 6681: 6682: if (md->offset_vector != NULL) 6683: { 6684: register int *iptr = md->offset_vector + ocount; 6685: register int *iend = iptr - re->top_bracket; 6686: if (iend < md->offset_vector + 2) iend = md->offset_vector + 2; 6687: while (--iptr >= iend) *iptr = -1; 6688: md->offset_vector[0] = md->offset_vector[1] = -1; 6689: } 6690: 6691: /* Set up the first character to match, if available. The first_char value is 6692: never set for an anchored regular expression, but the anchoring may be forced 6693: at run time, so we have to test for anchoring. The first char may be unset for 6694: an unanchored pattern, of course. 
If there's no first char and the pattern was 6695: studied, there may be a bitmap of possible first characters. */ 6696: 6697: if (!anchored) 6698: { 6699: if ((re->flags & PCRE_FIRSTSET) != 0) 6700: { 6701: has_first_char = TRUE; 6702: first_char = first_char2 = (pcre_uchar)(re->first_char); 6703: if ((re->flags & PCRE_FCH_CASELESS) != 0) 6704: { 6705: first_char2 = TABLE_GET(first_char, md->fcc, first_char); 6706: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) 6707: if (utf && first_char > 127) 6708: first_char2 = UCD_OTHERCASE(first_char); 6709: #endif 6710: } 6711: } 6712: else 6713: if (!startline && study != NULL && 6714: (study->flags & PCRE_STUDY_MAPPED) != 0) 6715: start_bits = study->start_bits; 6716: } 6717: 6718: /* For anchored or unanchored matches, there may be a "last known required 6719: character" set. */ 6720: 6721: if ((re->flags & PCRE_REQCHSET) != 0) 6722: { 6723: has_req_char = TRUE; 6724: req_char = req_char2 = (pcre_uchar)(re->req_char); 6725: if ((re->flags & PCRE_RCH_CASELESS) != 0) 6726: { 6727: req_char2 = TABLE_GET(req_char, md->fcc, req_char); 6728: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) 6729: if (utf && req_char > 127) 6730: req_char2 = UCD_OTHERCASE(req_char); 6731: #endif 6732: } 6733: } 6734: 6735: 6736: /* ==========================================================================*/ 6737: 6738: /* Loop for handling unanchored repeated matching attempts; for anchored regexs 6739: the loop runs just once. */ 6740: 6741: for(;;) 6742: { 6743: PCRE_PUCHAR save_end_subject = end_subject; 6744: PCRE_PUCHAR new_start_match; 6745: 6746: /* If firstline is TRUE, the start of the match is constrained to the first 6747: line of a multiline string. That is, the match must be before or at the first 6748: newline. Implement this by temporarily adjusting end_subject so that we stop 6749: scanning at a newline. If the match fails at the newline, later code breaks 6750: this loop. 
*/ 6751: 6752: if (firstline) 6753: { 6754: PCRE_PUCHAR t = start_match; 6755: #ifdef SUPPORT_UTF 6756: if (utf) 6757: { 6758: while (t < md->end_subject && !IS_NEWLINE(t)) 6759: { 6760: t++; 6761: ACROSSCHAR(t < end_subject, *t, t++); 6762: } 6763: } 6764: else 6765: #endif 6766: while (t < md->end_subject && !IS_NEWLINE(t)) t++; 6767: end_subject = t; 6768: } 6769: 6770: /* There are some optimizations that avoid running the match if a known 6771: starting point is not found, or if a known later character is not present. 6772: However, there is an option that disables these, for testing and for ensuring 6773: that all callouts do actually occur. The option can be set in the regex by 6774: (*NO_START_OPT) or passed in match-time options. */ 6775: 6776: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0) 6777: { 6778: /* Advance to a unique first char if there is one. */ 6779: 6780: if (has_first_char) 6781: { 6782: pcre_uchar smc; 6783: 6784: if (first_char != first_char2) 6785: while (start_match < end_subject && 6786: (smc = RAWUCHARTEST(start_match)) != first_char && smc != first_char2) 6787: start_match++; 6788: else 6789: while (start_match < end_subject && RAWUCHARTEST(start_match) != first_char) 6790: start_match++; 6791: } 6792: 6793: /* Or to just after a linebreak for a multiline match */ 6794: 6795: else if (startline) 6796: { 6797: if (start_match > md->start_subject + start_offset) 6798: { 6799: #ifdef SUPPORT_UTF 6800: if (utf) 6801: { 6802: while (start_match < end_subject && !WAS_NEWLINE(start_match)) 6803: { 6804: start_match++; 6805: ACROSSCHAR(start_match < end_subject, *start_match, 6806: start_match++); 6807: } 6808: } 6809: else 6810: #endif 6811: while (start_match < end_subject && !WAS_NEWLINE(start_match)) 6812: start_match++; 6813: 6814: /* If we have just passed a CR and the newline option is ANY or ANYCRLF, 6815: and we are now at a LF, advance the match position by one more character. 
6816: */ 6817: 6818: if (start_match[-1] == CHAR_CR && 6819: (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && 6820: start_match < end_subject && 6821: RAWUCHARTEST(start_match) == CHAR_NL) 6822: start_match++; 6823: } 6824: } 6825: 6826: /* Or to a non-unique first byte after study */ 6827: 6828: else if (start_bits != NULL) 6829: { 6830: while (start_match < end_subject) 6831: { 6832: register pcre_uint32 c = RAWUCHARTEST(start_match); 6833: #ifndef COMPILE_PCRE8 6834: if (c > 255) c = 255; 6835: #endif 6836: if ((start_bits[c/8] & (1 << (c&7))) == 0) 6837: { 6838: start_match++; 6839: #if defined SUPPORT_UTF && defined COMPILE_PCRE8 6840: /* In non 8-bit mode, the iteration will stop for 6841: characters > 255 at the beginning or not stop at all. */ 6842: if (utf) 6843: ACROSSCHAR(start_match < end_subject, *start_match, 6844: start_match++); 6845: #endif 6846: } 6847: else break; 6848: } 6849: } 6850: } /* Starting optimizations */ 6851: 6852: /* Restore fudged end_subject */ 6853: 6854: end_subject = save_end_subject; 6855: 6856: /* The following two optimizations are disabled for partial matching or if 6857: disabling is explicitly requested. */ 6858: 6859: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial) 6860: { 6861: /* If the pattern was studied, a minimum subject length may be set. This is 6862: a lower bound; no actual string of that length may actually match the 6863: pattern. Although the value is, strictly, in characters, we treat it as 6864: bytes to avoid spending too much time in this optimization. */ 6865: 6866: if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 && 6867: (pcre_uint32)(end_subject - start_match) < study->minlength) 6868: { 6869: rc = MATCH_NOMATCH; 6870: break; 6871: } 6872: 6873: /* If req_char is set, we know that that character must appear in the 6874: subject for the match to succeed. 
If the first character is set, req_char 6875: must be later in the subject; otherwise the test starts at the match point. 6876: This optimization can save a huge amount of backtracking in patterns with 6877: nested unlimited repeats that aren't going to match. Writing separate code 6878: for cased/caseless versions makes it go faster, as does using an 6879: autoincrement and backing off on a match. 6880: 6881: HOWEVER: when the subject string is very, very long, searching to its end 6882: can take a long time, and give bad performance on quite ordinary patterns. 6883: This showed up when somebody was matching something like /^\d+C/ on a 6884: 32-megabyte string... so we don't do this when the string is sufficiently 6885: long. */ 6886: 6887: if (has_req_char && end_subject - start_match < REQ_BYTE_MAX) 6888: { 6889: register PCRE_PUCHAR p = start_match + (has_first_char? 1:0); 6890: 6891: /* We don't need to repeat the search if we haven't yet reached the 6892: place we found it at last time. */ 6893: 6894: if (p > req_char_ptr) 6895: { 6896: if (req_char != req_char2) 6897: { 6898: while (p < end_subject) 6899: { 6900: register pcre_uint32 pp = RAWUCHARINCTEST(p); 6901: if (pp == req_char || pp == req_char2) { p--; break; } 6902: } 6903: } 6904: else 6905: { 6906: while (p < end_subject) 6907: { 6908: if (RAWUCHARINCTEST(p) == req_char) { p--; break; } 6909: } 6910: } 6911: 6912: /* If we can't find the required character, break the matching loop, 6913: forcing a match failure. */ 6914: 6915: if (p >= end_subject) 6916: { 6917: rc = MATCH_NOMATCH; 6918: break; 6919: } 6920: 6921: /* If we have found the required character, save the point where we 6922: found it, so that we don't search again next time round the loop if 6923: the start hasn't passed this character yet. */ 6924: 6925: req_char_ptr = p; 6926: } 6927: } 6928: } 6929: 6930: #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. 
*/ 6931: printf(">>>> Match against: "); 6932: pchars(start_match, end_subject - start_match, TRUE, md); 6933: printf("\n"); 6934: #endif 6935: 6936: /* OK, we can now run the match. If "hitend" is set afterwards, remember the 6937: first starting point for which a partial match was found. */ 6938: 6939: md->start_match_ptr = start_match; 6940: md->start_used_ptr = start_match; 6941: md->match_call_count = 0; 6942: md->match_function_type = 0; 6943: md->end_offset_top = 0; 6944: md->skip_arg_count = 0; 6945: rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0); 6946: if (md->hitend && start_partial == NULL) 6947: { 6948: start_partial = md->start_used_ptr; 6949: match_partial = start_match; 6950: } 6951: 6952: switch(rc) 6953: { 6954: /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched 6955: the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP 6956: entirely. The only way we can do that is to re-do the match at the same 6957: point, with a flag to force SKIP with an argument to be ignored. Just 6958: treating this case as NOMATCH does not work because it does not check other 6959: alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */ 6960: 6961: case MATCH_SKIP_ARG: 6962: new_start_match = start_match; 6963: md->ignore_skip_arg = md->skip_arg_count; 6964: break; 6965: 6966: /* SKIP passes back the next starting point explicitly, but if it is no 6967: greater than the match we have just done, treat it as NOMATCH. */ 6968: 6969: case MATCH_SKIP: 6970: if (md->start_match_ptr > start_match) 6971: { 6972: new_start_match = md->start_match_ptr; 6973: break; 6974: } 6975: /* Fall through */ 6976: 6977: /* NOMATCH and PRUNE advance by one character. THEN at this level acts 6978: exactly like PRUNE. Unset ignore SKIP-with-argument. 
*/ 6979: 6980: case MATCH_NOMATCH: 6981: case MATCH_PRUNE: 6982: case MATCH_THEN: 6983: md->ignore_skip_arg = 0; 6984: new_start_match = start_match + 1; 6985: #ifdef SUPPORT_UTF 6986: if (utf) 6987: ACROSSCHAR(new_start_match < end_subject, *new_start_match, 6988: new_start_match++); 6989: #endif 6990: break; 6991: 6992: /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */ 6993: 6994: case MATCH_COMMIT: 6995: rc = MATCH_NOMATCH; 6996: goto ENDLOOP; 6997: 6998: /* Any other return is either a match, or some kind of error. */ 6999: 7000: default: 7001: goto ENDLOOP; 7002: } 7003: 7004: /* Control reaches here for the various types of "no match at this point" 7005: result. Reset the code to MATCH_NOMATCH for subsequent checking. */ 7006: 7007: rc = MATCH_NOMATCH; 7008: 7009: /* If PCRE_FIRSTLINE is set, the match must happen before or at the first 7010: newline in the subject (though it may continue over the newline). Therefore, 7011: if we have just failed to match, starting at a newline, do not continue. */ 7012: 7013: if (firstline && IS_NEWLINE(start_match)) break; 7014: 7015: /* Advance to new matching position */ 7016: 7017: start_match = new_start_match; 7018: 7019: /* Break the loop if the pattern is anchored or if we have passed the end of 7020: the subject. */ 7021: 7022: if (anchored || start_match > end_subject) break; 7023: 7024: /* If we have just passed a CR and we are now at a LF, and the pattern does 7025: not contain any explicit matches for \r or \n, and the newline option is CRLF 7026: or ANY or ANYCRLF, advance the match position by one more character. In 7027: normal matching start_match will always be greater than the first position at 7028: this stage, but a failed *SKIP can cause a return at the same point, which is 7029: why the first test exists. 
*/ 7030: 7031: if (start_match > (PCRE_PUCHAR)subject + start_offset && 7032: start_match[-1] == CHAR_CR && 7033: start_match < end_subject && 7034: *start_match == CHAR_NL && 7035: (re->flags & PCRE_HASCRORLF) == 0 && 7036: (md->nltype == NLTYPE_ANY || 7037: md->nltype == NLTYPE_ANYCRLF || 7038: md->nllen == 2)) 7039: start_match++; 7040: 7041: md->mark = NULL; /* Reset for start of next match attempt */ 7042: } /* End of for(;;) "bumpalong" loop */ 7043: 7044: /* ==========================================================================*/ 7045: 7046: /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping 7047: conditions is true: 7048: 7049: (1) The pattern is anchored or the match was failed by (*COMMIT); 7050: 7051: (2) We are past the end of the subject; 7052: 7053: (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because 7054: this option requests that a match occur at or before the first newline in 7055: the subject. 7056: 7057: When we have a match and the offset vector is big enough to deal with any 7058: backreferences, captured substring offsets will already be set up. In the case 7059: where we had to get some local store to hold offsets for backreference 7060: processing, copy those that we can. In this case there need not be overflow if 7061: certain parts of the pattern were not used, even though there are more 7062: capturing parentheses than vector slots. 
*/ 7063: 7064: ENDLOOP: 7065: 7066: if (rc == MATCH_MATCH || rc == MATCH_ACCEPT) 7067: { 7068: if (using_temporary_offsets) 7069: { 7070: if (arg_offset_max >= 4) 7071: { 7072: memcpy(offsets + 2, md->offset_vector + 2, 7073: (arg_offset_max - 2) * sizeof(int)); 7074: DPRINTF(("Copied offsets from temporary memory\n")); 7075: } 7076: if (md->end_offset_top > arg_offset_max) md->capture_last |= OVFLBIT; 7077: DPRINTF(("Freeing temporary memory\n")); 7078: (PUBL(free))(md->offset_vector); 7079: } 7080: 7081: /* Set the return code to the number of captured strings, or 0 if there were 7082: too many to fit into the vector. */ 7083: 7084: rc = ((md->capture_last & OVFLBIT) != 0 && 7085: md->end_offset_top >= arg_offset_max)? 7086: 0 : md->end_offset_top/2; 7087: 7088: /* If there is space in the offset vector, set any unused pairs at the end of 7089: the pattern to -1 for backwards compatibility. It is documented that this 7090: happens. In earlier versions, the whole set of potential capturing offsets 7091: was set to -1 each time round the loop, but this is handled differently now. 7092: "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only 7093: those at the end that need unsetting here. We can't just unset them all at 7094: the start of the whole thing because they may get set in one branch that is 7095: not the final matching branch. */ 7096: 7097: if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL) 7098: { 7099: register int *iptr, *iend; 7100: int resetcount = 2 + re->top_bracket * 2; 7101: if (resetcount > offsetcount) resetcount = offsetcount; 7102: iptr = offsets + md->end_offset_top; 7103: iend = offsets + resetcount; 7104: while (iptr < iend) *iptr++ = -1; 7105: } 7106: 7107: /* If there is space, set up the whole thing as substring 0. The value of 7108: md->start_match_ptr might be modified if \K was encountered on the success 7109: matching path. 
*/ 7110: 7111: if (offsetcount < 2) rc = 0; else 7112: { 7113: offsets[0] = (int)(md->start_match_ptr - md->start_subject); 7114: offsets[1] = (int)(md->end_match_ptr - md->start_subject); 7115: } 7116: 7117: /* Return MARK data if requested */ 7118: 7119: if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0) 7120: *(extra_data->mark) = (pcre_uchar *)md->mark; 7121: DPRINTF((">>>> returning %d\n", rc)); 7122: #ifdef NO_RECURSE 7123: release_match_heapframes(&frame_zero); 7124: #endif 7125: return rc; 7126: } 7127: 7128: /* Control gets here if there has been an error, or if the overall match 7129: attempt has failed at all permitted starting positions. */ 7130: 7131: if (using_temporary_offsets) 7132: { 7133: DPRINTF(("Freeing temporary memory\n")); 7134: (PUBL(free))(md->offset_vector); 7135: } 7136: 7137: /* For anything other than nomatch or partial match, just return the code. */ 7138: 7139: if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL) 7140: { 7141: DPRINTF((">>>> error: returning %d\n", rc)); 7142: #ifdef NO_RECURSE 7143: release_match_heapframes(&frame_zero); 7144: #endif 7145: return rc; 7146: } 7147: 7148: /* Handle partial matches - disable any mark data */ 7149: 7150: if (match_partial != NULL) 7151: { 7152: DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); 7153: md->mark = NULL; 7154: if (offsetcount > 1) 7155: { 7156: offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject); 7157: offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject); 7158: if (offsetcount > 2) 7159: offsets[2] = (int)(match_partial - (PCRE_PUCHAR)subject); 7160: } 7161: rc = PCRE_ERROR_PARTIAL; 7162: } 7163: 7164: /* This is the classic nomatch case */ 7165: 7166: else 7167: { 7168: DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); 7169: rc = PCRE_ERROR_NOMATCH; 7170: } 7171: 7172: /* Return the MARK data if it has been requested. 
*/ 7173: 7174: if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0) 7175: *(extra_data->mark) = (pcre_uchar *)md->nomatch_mark; 7176: #ifdef NO_RECURSE 7177: release_match_heapframes(&frame_zero); 7178: #endif 7179: return rc; 7180: } 7181: 7182: /* End of pcre_exec.c */