embedaddon/pcre/pcre_dfa_exec.c - view

File: [ELWIX - Embedded LightWeight unIX -] / embedaddon / pcre / pcre_dfa_exec.c
Revision 1.1.1.5 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Sun Jun 15 19:46:04 2014 UTC (10 years ago) by misho
Branches: pcre, MAIN
CVS tags: v8_34, HEAD

pcre 8.34

1: /************************************************* 2: * Perl-Compatible Regular Expressions * 3: *************************************************/ 4: 5: /* PCRE is a library of functions to support regular expressions whose syntax 6: and semantics are as close as possible to those of the Perl 5 language (but see 7: below for why this module is different). 8: 9: Written by Philip Hazel 10: Copyright (c) 1997-2013 University of Cambridge 11: 12: ----------------------------------------------------------------------------- 13: Redistribution and use in source and binary forms, with or without 14: modification, are permitted provided that the following conditions are met: 15: 16: * Redistributions of source code must retain the above copyright notice, 17: this list of conditions and the following disclaimer. 18: 19: * Redistributions in binary form must reproduce the above copyright 20: notice, this list of conditions and the following disclaimer in the 21: documentation and/or other materials provided with the distribution. 22: 23: * Neither the name of the University of Cambridge nor the names of its 24: contributors may be used to endorse or promote products derived from 25: this software without specific prior written permission. 26: 27: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30: ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37: POSSIBILITY OF SUCH DAMAGE. 38: ----------------------------------------------------------------------------- 39: */ 40: 41: /* This module contains the external function pcre_dfa_exec(), which is an 42: alternative matching function that uses a sort of DFA algorithm (not a true 43: FSM). This is NOT Perl-compatible, but it has advantages in certain 44: applications. */ 45: 46: 47: /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved 48: the performance of his patterns greatly. I could not use it as it stood, as it 49: was not thread safe, and made assumptions about pattern sizes. Also, it caused 50: test 7 to loop, and test 9 to crash with a segfault. 51: 52: The issue is the check for duplicate states, which is done by a simple linear 53: search up the state list. (Grep for "duplicate" below to find the code.) For 54: many patterns, there will never be many states active at one time, so a simple 55: linear search is fine. In patterns that have many active states, it might be a 56: bottleneck. The suggested code used an indexing scheme to remember which states 57: had previously been used for each character, and avoided the linear search when 58: it knew there was no chance of a duplicate. This was implemented when adding 59: states to the state lists. 
60: 61: I wrote some thread-safe, not-limited code to try something similar at the time 62: of checking for duplicates (instead of when adding states), using index vectors 63: on the stack. It did give a 13% improvement with one specially constructed 64: pattern for certain subject strings, but on other strings and on many of the 65: simpler patterns in the test suite it did worse. The major problem, I think, 66: was the extra time to initialize the index. This had to be done for each call 67: of internal_dfa_exec(). (The supplied patch used a static vector, initialized 68: only once - I suspect this was the cause of the problems with the tests.) 69: 70: Overall, I concluded that the gains in some cases did not outweigh the losses 71: in others, so I abandoned this code. */ 72: 73: 74: 75: #ifdef HAVE_CONFIG_H 76: #include "config.h" 77: #endif 78: 79: #define NLBLOCK md /* Block containing newline information */ 80: #define PSSTART start_subject /* Field containing processed string start */ 81: #define PSEND end_subject /* Field containing processed string end */ 82: 83: #include "pcre_internal.h" 84: 85: 86: /* For use to indent debugging output */ 87: 88: #define SP " " 89: 90: 91: /************************************************* 92: * Code parameters and static tables * 93: *************************************************/ 94: 95: /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes 96: into others, under special conditions. A gap of 20 between the blocks should be 97: enough. The resulting opcodes don't have to be less than 256 because they are 98: never stored, so we push them well clear of the normal opcodes. */ 99: 100: #define OP_PROP_EXTRA 300 101: #define OP_EXTUNI_EXTRA 320 102: #define OP_ANYNL_EXTRA 340 103: #define OP_HSPACE_EXTRA 360 104: #define OP_VSPACE_EXTRA 380 105: 106: 107: /* This table identifies those opcodes that are followed immediately by a 108: character that is to be tested in some way. 
This makes it possible to 109: centralize the loading of these characters. In the case of Type * etc, the 110: "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a 111: small value. Non-zero values in the table are the offsets from the opcode where 112: the character is to be found. ***NOTE*** If the start of this table is 113: modified, the three tables that follow must also be modified. */ 114: 115: static const pcre_uint8 coptable[] = { 116: 0, /* End */ 117: 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */ 118: 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */ 119: 0, 0, 0, /* Any, AllAny, Anybyte */ 120: 0, 0, /* \P, \p */ 121: 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */ 122: 0, /* \X */ 123: 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */ 124: 1, /* Char */ 125: 1, /* Chari */ 126: 1, /* not */ 127: 1, /* noti */ 128: /* Positive single-char repeats */ 129: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ 130: 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */ 131: 1+IMM2_SIZE, /* exact */ 132: 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */ 133: 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */ 134: 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */ 135: 1+IMM2_SIZE, /* exact I */ 136: 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */ 137: /* Negative single-char repeats - only for chars < 256 */ 138: 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */ 139: 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */ 140: 1+IMM2_SIZE, /* NOT exact */ 141: 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */ 142: 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */ 143: 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */ 144: 1+IMM2_SIZE, /* NOT exact I */ 145: 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */ 146: /* Positive type repeats */ 147: 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? 
*/ 148: 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */ 149: 1+IMM2_SIZE, /* Type exact */ 150: 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */ 151: /* Character class & ref repeats */ 152: 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */ 153: 0, 0, /* CRRANGE, CRMINRANGE */ 154: 0, 0, 0, 0, /* Possessive *+, ++, ?+, CRPOSRANGE */ 155: 0, /* CLASS */ 156: 0, /* NCLASS */ 157: 0, /* XCLASS - variable length */ 158: 0, /* REF */ 159: 0, /* REFI */ 160: 0, /* DNREF */ 161: 0, /* DNREFI */ 162: 0, /* RECURSE */ 163: 0, /* CALLOUT */ 164: 0, /* Alt */ 165: 0, /* Ket */ 166: 0, /* KetRmax */ 167: 0, /* KetRmin */ 168: 0, /* KetRpos */ 169: 0, /* Reverse */ 170: 0, /* Assert */ 171: 0, /* Assert not */ 172: 0, /* Assert behind */ 173: 0, /* Assert behind not */ 174: 0, 0, /* ONCE, ONCE_NC */ 175: 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */ 176: 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */ 177: 0, 0, /* CREF, DNCREF */ 178: 0, 0, /* RREF, DNRREF */ 179: 0, /* DEF */ 180: 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ 181: 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */ 182: 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */ 183: 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */ 184: 0, 0 /* CLOSE, SKIPZERO */ 185: }; 186: 187: /* This table identifies those opcodes that inspect a character. It is used to 188: remember the fact that a character could have been inspected when the end of 189: the subject is reached. ***NOTE*** If the start of this table is modified, the 190: two tables that follow must also be modified. 
*/ 191: 192: static const pcre_uint8 poptable[] = { 193: 0, /* End */ 194: 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */ 195: 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ 196: 1, 1, 1, /* Any, AllAny, Anybyte */ 197: 1, 1, /* \P, \p */ 198: 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ 199: 1, /* \X */ 200: 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */ 201: 1, /* Char */ 202: 1, /* Chari */ 203: 1, /* not */ 204: 1, /* noti */ 205: /* Positive single-char repeats */ 206: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ 207: 1, 1, 1, /* upto, minupto, exact */ 208: 1, 1, 1, 1, /* *+, ++, ?+, upto+ */ 209: 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */ 210: 1, 1, 1, /* upto I, minupto I, exact I */ 211: 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */ 212: /* Negative single-char repeats - only for chars < 256 */ 213: 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */ 214: 1, 1, 1, /* NOT upto, minupto, exact */ 215: 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */ 216: 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */ 217: 1, 1, 1, /* NOT upto I, minupto I, exact I */ 218: 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */ 219: /* Positive type repeats */ 220: 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */ 221: 1, 1, 1, /* Type upto, minupto, exact */ 222: 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */ 223: /* Character class & ref repeats */ 224: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? 
*/ 225: 1, 1, /* CRRANGE, CRMINRANGE */ 226: 1, 1, 1, 1, /* Possessive *+, ++, ?+, CRPOSRANGE */ 227: 1, /* CLASS */ 228: 1, /* NCLASS */ 229: 1, /* XCLASS - variable length */ 230: 0, /* REF */ 231: 0, /* REFI */ 232: 0, /* DNREF */ 233: 0, /* DNREFI */ 234: 0, /* RECURSE */ 235: 0, /* CALLOUT */ 236: 0, /* Alt */ 237: 0, /* Ket */ 238: 0, /* KetRmax */ 239: 0, /* KetRmin */ 240: 0, /* KetRpos */ 241: 0, /* Reverse */ 242: 0, /* Assert */ 243: 0, /* Assert not */ 244: 0, /* Assert behind */ 245: 0, /* Assert behind not */ 246: 0, 0, /* ONCE, ONCE_NC */ 247: 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */ 248: 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */ 249: 0, 0, /* CREF, DNCREF */ 250: 0, 0, /* RREF, DNRREF */ 251: 0, /* DEF */ 252: 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ 253: 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */ 254: 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */ 255: 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */ 256: 0, 0 /* CLOSE, SKIPZERO */ 257: }; 258: 259: /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W, 260: and \w */ 261: 262: static const pcre_uint8 toptable1[] = { 263: 0, 0, 0, 0, 0, 0, 264: ctype_digit, ctype_digit, 265: ctype_space, ctype_space, 266: ctype_word, ctype_word, 267: 0, 0 /* OP_ANY, OP_ALLANY */ 268: }; 269: 270: static const pcre_uint8 toptable2[] = { 271: 0, 0, 0, 0, 0, 0, 272: ctype_digit, 0, 273: ctype_space, 0, 274: ctype_word, 0, 275: 1, 1 /* OP_ANY, OP_ALLANY */ 276: }; 277: 278: 279: /* Structure for holding data about a particular state, which is in effect the 280: current data for an active path through the match tree. It must consist 281: entirely of ints because the working vector we are passed, and which we put 282: these structures in, is a vector of ints. 
*/ 283: 284: typedef struct stateblock { 285: int offset; /* Offset to opcode */ 286: int count; /* Count for repeats */ 287: int data; /* Some use extra data */ 288: } stateblock; 289: 290: #define INTS_PER_STATEBLOCK (int)(sizeof(stateblock)/sizeof(int)) 291: 292: 293: #ifdef PCRE_DEBUG 294: /************************************************* 295: * Print character string * 296: *************************************************/ 297: 298: /* Character string printing function for debugging. 299: 300: Arguments: 301: p points to string 302: length number of bytes 303: f where to print 304: 305: Returns: nothing 306: */ 307: 308: static void 309: pchars(const pcre_uchar *p, int length, FILE *f) 310: { 311: pcre_uint32 c; 312: while (length-- > 0) 313: { 314: if (isprint(c = *(p++))) 315: fprintf(f, "%c", c); 316: else 317: fprintf(f, "\\x{%02x}", c); 318: } 319: } 320: #endif 321: 322: 323: 324: /************************************************* 325: * Execute a Regular Expression - DFA engine * 326: *************************************************/ 327: 328: /* This internal function applies a compiled pattern to a subject string, 329: starting at a given point, using a DFA engine. This function is called from the 330: external one, possibly multiple times if the pattern is not anchored. The 331: function calls itself recursively for some kinds of subpattern. 
332: 333: Arguments: 334: md the match_data block with fixed information 335: this_start_code the opening bracket of this subexpression's code 336: current_subject where we currently are in the subject string 337: start_offset start offset in the subject string 338: offsets vector to contain the matching string offsets 339: offsetcount size of same 340: workspace vector of workspace 341: wscount size of same 342: rlevel function call recursion level 343: 344: Returns: > 0 => number of match offset pairs placed in offsets 345: = 0 => offsets overflowed; longest matches are present 346: -1 => failed to match 347: < -1 => some kind of unexpected problem 348: 349: The following macros are used for adding states to the two state vectors (one 350: for the current character, one for the following character). */ 351: 352: #define ADD_ACTIVE(x,y) \ 353: if (active_count++ < wscount) \ 354: { \ 355: next_active_state->offset = (x); \ 356: next_active_state->count = (y); \ 357: next_active_state++; \ 358: DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \ 359: } \ 360: else return PCRE_ERROR_DFA_WSSIZE 361: 362: #define ADD_ACTIVE_DATA(x,y,z) \ 363: if (active_count++ < wscount) \ 364: { \ 365: next_active_state->offset = (x); \ 366: next_active_state->count = (y); \ 367: next_active_state->data = (z); \ 368: next_active_state++; \ 369: DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \ 370: } \ 371: else return PCRE_ERROR_DFA_WSSIZE 372: 373: #define ADD_NEW(x,y) \ 374: if (new_count++ < wscount) \ 375: { \ 376: next_new_state->offset = (x); \ 377: next_new_state->count = (y); \ 378: next_new_state++; \ 379: DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \ 380: } \ 381: else return PCRE_ERROR_DFA_WSSIZE 382: 383: #define ADD_NEW_DATA(x,y,z) \ 384: if (new_count++ < wscount) \ 385: { \ 386: next_new_state->offset = (x); \ 387: next_new_state->count = (y); \ 388: next_new_state->data = (z); \ 389: next_new_state++; \ 
390: DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \ 391: (x), (y), (z), __LINE__)); \ 392: } \ 393: else return PCRE_ERROR_DFA_WSSIZE 394: 395: /* And now, here is the code */ 396: 397: static int 398: internal_dfa_exec( 399: dfa_match_data *md, 400: const pcre_uchar *this_start_code, 401: const pcre_uchar *current_subject, 402: int start_offset, 403: int *offsets, 404: int offsetcount, 405: int *workspace, 406: int wscount, 407: int rlevel) 408: { 409: stateblock *active_states, *new_states, *temp_states; 410: stateblock *next_active_state, *next_new_state; 411: 412: const pcre_uint8 *ctypes, *lcc, *fcc; 413: const pcre_uchar *ptr; 414: const pcre_uchar *end_code, *first_op; 415: 416: dfa_recursion_info new_recursive; 417: 418: int active_count, new_count, match_count; 419: 420: /* Some fields in the md block are frequently referenced, so we load them into 421: independent variables in the hope that this will perform better. */ 422: 423: const pcre_uchar *start_subject = md->start_subject; 424: const pcre_uchar *end_subject = md->end_subject; 425: const pcre_uchar *start_code = md->start_code; 426: 427: #ifdef SUPPORT_UTF 428: BOOL utf = (md->poptions & PCRE_UTF8) != 0; 429: #else 430: BOOL utf = FALSE; 431: #endif 432: 433: BOOL reset_could_continue = FALSE; 434: 435: rlevel++; 436: offsetcount &= (-2); 437: 438: wscount -= 2; 439: wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) / 440: (2 * INTS_PER_STATEBLOCK); 441: 442: DPRINTF(("\n%.*s---------------------\n" 443: "%.*sCall to internal_dfa_exec f=%d\n", 444: rlevel*2-2, SP, rlevel*2-2, SP, rlevel)); 445: 446: ctypes = md->tables + ctypes_offset; 447: lcc = md->tables + lcc_offset; 448: fcc = md->tables + fcc_offset; 449: 450: match_count = PCRE_ERROR_NOMATCH; /* A negative number */ 451: 452: active_states = (stateblock *)(workspace + 2); 453: next_new_state = new_states = active_states + wscount; 454: new_count = 0; 455: 456: first_op = this_start_code + 1 + LINK_SIZE + 457: 
((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA || 458: *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS) 459: ? IMM2_SIZE:0); 460: 461: /* The first thing in any (sub) pattern is a bracket of some sort. Push all 462: the alternative states onto the list, and find out where the end is. This 463: makes is possible to use this function recursively, when we want to stop at a 464: matching internal ket rather than at the end. 465: 466: If the first opcode in the first alternative is OP_REVERSE, we are dealing with 467: a backward assertion. In that case, we have to find out the maximum amount to 468: move back, and set up each alternative appropriately. */ 469: 470: if (*first_op == OP_REVERSE) 471: { 472: int max_back = 0; 473: int gone_back; 474: 475: end_code = this_start_code; 476: do 477: { 478: int back = GET(end_code, 2+LINK_SIZE); 479: if (back > max_back) max_back = back; 480: end_code += GET(end_code, 1); 481: } 482: while (*end_code == OP_ALT); 483: 484: /* If we can't go back the amount required for the longest lookbehind 485: pattern, go back as far as we can; some alternatives may still be viable. */ 486: 487: #ifdef SUPPORT_UTF 488: /* In character mode we have to step back character by character */ 489: 490: if (utf) 491: { 492: for (gone_back = 0; gone_back < max_back; gone_back++) 493: { 494: if (current_subject <= start_subject) break; 495: current_subject--; 496: ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--); 497: } 498: } 499: else 500: #endif 501: 502: /* In byte-mode we can do this quickly. */ 503: 504: { 505: gone_back = (current_subject - max_back < start_subject)? 506: (int)(current_subject - start_subject) : max_back; 507: current_subject -= gone_back; 508: } 509: 510: /* Save the earliest consulted character */ 511: 512: if (current_subject < md->start_used_ptr) 513: md->start_used_ptr = current_subject; 514: 515: /* Now we can process the individual branches. 
*/ 516: 517: end_code = this_start_code; 518: do 519: { 520: int back = GET(end_code, 2+LINK_SIZE); 521: if (back <= gone_back) 522: { 523: int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE); 524: ADD_NEW_DATA(-bstate, 0, gone_back - back); 525: } 526: end_code += GET(end_code, 1); 527: } 528: while (*end_code == OP_ALT); 529: } 530: 531: /* This is the code for a "normal" subpattern (not a backward assertion). The 532: start of a whole pattern is always one of these. If we are at the top level, 533: we may be asked to restart matching from the same point that we reached for a 534: previous partial match. We still have to scan through the top-level branches to 535: find the end state. */ 536: 537: else 538: { 539: end_code = this_start_code; 540: 541: /* Restarting */ 542: 543: if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0) 544: { 545: do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT); 546: new_count = workspace[1]; 547: if (!workspace[0]) 548: memcpy(new_states, active_states, new_count * sizeof(stateblock)); 549: } 550: 551: /* Not restarting */ 552: 553: else 554: { 555: int length = 1 + LINK_SIZE + 556: ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA || 557: *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS) 558: ? 
IMM2_SIZE:0); 559: do 560: { 561: ADD_NEW((int)(end_code - start_code + length), 0); 562: end_code += GET(end_code, 1); 563: length = 1 + LINK_SIZE; 564: } 565: while (*end_code == OP_ALT); 566: } 567: } 568: 569: workspace[0] = 0; /* Bit indicating which vector is current */ 570: 571: DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code))); 572: 573: /* Loop for scanning the subject */ 574: 575: ptr = current_subject; 576: for (;;) 577: { 578: int i, j; 579: int clen, dlen; 580: pcre_uint32 c, d; 581: int forced_fail = 0; 582: BOOL partial_newline = FALSE; 583: BOOL could_continue = reset_could_continue; 584: reset_could_continue = FALSE; 585: 586: /* Make the new state list into the active state list and empty the 587: new state list. */ 588: 589: temp_states = active_states; 590: active_states = new_states; 591: new_states = temp_states; 592: active_count = new_count; 593: new_count = 0; 594: 595: workspace[0] ^= 1; /* Remember for the restarting feature */ 596: workspace[1] = active_count; 597: 598: #ifdef PCRE_DEBUG 599: printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP); 600: pchars(ptr, STRLEN_UC(ptr), stdout); 601: printf("\"\n"); 602: 603: printf("%.*sActive states: ", rlevel*2-2, SP); 604: for (i = 0; i < active_count; i++) 605: printf("%d/%d ", active_states[i].offset, active_states[i].count); 606: printf("\n"); 607: #endif 608: 609: /* Set the pointers for adding new states */ 610: 611: next_active_state = active_states + active_count; 612: next_new_state = new_states; 613: 614: /* Load the current character from the subject outside the loop, as many 615: different states may want to look at it, and we assume that at least one 616: will. 
*/ 617: 618: if (ptr < end_subject) 619: { 620: clen = 1; /* Number of data items in the character */ 621: #ifdef SUPPORT_UTF 622: GETCHARLENTEST(c, ptr, clen); 623: #else 624: c = *ptr; 625: #endif /* SUPPORT_UTF */ 626: } 627: else 628: { 629: clen = 0; /* This indicates the end of the subject */ 630: c = NOTACHAR; /* This value should never actually be used */ 631: } 632: 633: /* Scan up the active states and act on each one. The result of an action 634: may be to add more states to the currently active list (e.g. on hitting a 635: parenthesis) or it may be to put states on the new list, for considering 636: when we move the character pointer on. */ 637: 638: for (i = 0; i < active_count; i++) 639: { 640: stateblock *current_state = active_states + i; 641: BOOL caseless = FALSE; 642: const pcre_uchar *code; 643: int state_offset = current_state->offset; 644: int codevalue, rrc; 645: int count; 646: 647: #ifdef PCRE_DEBUG 648: printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset); 649: if (clen == 0) printf("EOL\n"); 650: else if (c > 32 && c < 127) printf("'%c'\n", c); 651: else printf("0x%02x\n", c); 652: #endif 653: 654: /* A negative offset is a special case meaning "hold off going to this 655: (negated) state until the number of characters in the data field have 656: been skipped". If the could_continue flag was passed over from a previous 657: state, arrange for it to passed on. */ 658: 659: if (state_offset < 0) 660: { 661: if (current_state->data > 0) 662: { 663: DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP)); 664: ADD_NEW_DATA(state_offset, current_state->count, 665: current_state->data - 1); 666: if (could_continue) reset_could_continue = TRUE; 667: continue; 668: } 669: else 670: { 671: current_state->offset = state_offset = -state_offset; 672: } 673: } 674: 675: /* Check for a duplicate state with the same count, and skip if found. 
676: See the note at the head of this module about the possibility of improving 677: performance here. */ 678: 679: for (j = 0; j < i; j++) 680: { 681: if (active_states[j].offset == state_offset && 682: active_states[j].count == current_state->count) 683: { 684: DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP)); 685: goto NEXT_ACTIVE_STATE; 686: } 687: } 688: 689: /* The state offset is the offset to the opcode */ 690: 691: code = start_code + state_offset; 692: codevalue = *code; 693: 694: /* If this opcode inspects a character, but we are at the end of the 695: subject, remember the fact for use when testing for a partial match. */ 696: 697: if (clen == 0 && poptable[codevalue] != 0) 698: could_continue = TRUE; 699: 700: /* If this opcode is followed by an inline character, load it. It is 701: tempting to test for the presence of a subject character here, but that 702: is wrong, because sometimes zero repetitions of the subject are 703: permitted. 704: 705: We also use this mechanism for opcodes such as OP_TYPEPLUS that take an 706: argument that is not a data character - but is always one byte long because 707: the values are small. We have to take special action to deal with \P, \p, 708: \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert 709: these ones to new opcodes. 
*/ 710: 711: if (coptable[codevalue] > 0) 712: { 713: dlen = 1; 714: #ifdef SUPPORT_UTF 715: if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else 716: #endif /* SUPPORT_UTF */ 717: d = code[coptable[codevalue]]; 718: if (codevalue >= OP_TYPESTAR) 719: { 720: switch(d) 721: { 722: case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM; 723: case OP_NOTPROP: 724: case OP_PROP: codevalue += OP_PROP_EXTRA; break; 725: case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break; 726: case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break; 727: case OP_NOT_HSPACE: 728: case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break; 729: case OP_NOT_VSPACE: 730: case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break; 731: default: break; 732: } 733: } 734: } 735: else 736: { 737: dlen = 0; /* Not strictly necessary, but compilers moan */ 738: d = NOTACHAR; /* if these variables are not set. */ 739: } 740: 741: 742: /* Now process the individual opcodes */ 743: 744: switch (codevalue) 745: { 746: /* ========================================================================== */ 747: /* These cases are never obeyed. This is a fudge that causes a compile- 748: time error if the vectors coptable or poptable, which are indexed by 749: opcode, are not the correct length. It seems to be the only way to do 750: such a check at compile time, as the sizeof() operator does not work 751: in the C preprocessor. */ 752: 753: case OP_TABLE_LENGTH: 754: case OP_TABLE_LENGTH + 755: ((sizeof(coptable) == OP_TABLE_LENGTH) && 756: (sizeof(poptable) == OP_TABLE_LENGTH)): 757: break; 758: 759: /* ========================================================================== */ 760: /* Reached a closing bracket. If not at the end of the pattern, carry 761: on with the next opcode. For repeating opcodes, also add the repeat 762: state. Note that KETRPOS will always be encountered at the end of the 763: subpattern, because the possessive subpattern repeats are always handled 764: using recursive calls. 
Thus, it never adds any new states. 765: 766: At the end of the (sub)pattern, unless we have an empty string and 767: PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the 768: start of the subject, save the match data, shifting up all previous 769: matches so we always have the longest first. */ 770: 771: case OP_KET: 772: case OP_KETRMIN: 773: case OP_KETRMAX: 774: case OP_KETRPOS: 775: if (code != end_code) 776: { 777: ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0); 778: if (codevalue != OP_KET) 779: { 780: ADD_ACTIVE(state_offset - GET(code, 1), 0); 781: } 782: } 783: else 784: { 785: if (ptr > current_subject || 786: ((md->moptions & PCRE_NOTEMPTY) == 0 && 787: ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 || 788: current_subject > start_subject + md->start_offset))) 789: { 790: if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0; 791: else if (match_count > 0 && ++match_count * 2 > offsetcount) 792: match_count = 0; 793: count = ((match_count == 0)? offsetcount : match_count * 2) - 2; 794: if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int)); 795: if (offsetcount >= 2) 796: { 797: offsets[0] = (int)(current_subject - start_subject); 798: offsets[1] = (int)(ptr - start_subject); 799: DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP, 800: offsets[1] - offsets[0], (char *)current_subject)); 801: } 802: if ((md->moptions & PCRE_DFA_SHORTEST) != 0) 803: { 804: DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" 805: "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, 806: match_count, rlevel*2-2, SP)); 807: return match_count; 808: } 809: } 810: } 811: break; 812: 813: /* ========================================================================== */ 814: /* These opcodes add to the current list of states without looking 815: at the current character. 
*/ 816: 817: /*-----------------------------------------------------------------*/ 818: case OP_ALT: 819: do { code += GET(code, 1); } while (*code == OP_ALT); 820: ADD_ACTIVE((int)(code - start_code), 0); 821: break; 822: 823: /*-----------------------------------------------------------------*/ 824: case OP_BRA: 825: case OP_SBRA: 826: do 827: { 828: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); 829: code += GET(code, 1); 830: } 831: while (*code == OP_ALT); 832: break; 833: 834: /*-----------------------------------------------------------------*/ 835: case OP_CBRA: 836: case OP_SCBRA: 837: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0); 838: code += GET(code, 1); 839: while (*code == OP_ALT) 840: { 841: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); 842: code += GET(code, 1); 843: } 844: break; 845: 846: /*-----------------------------------------------------------------*/ 847: case OP_BRAZERO: 848: case OP_BRAMINZERO: 849: ADD_ACTIVE(state_offset + 1, 0); 850: code += 1 + GET(code, 2); 851: while (*code == OP_ALT) code += GET(code, 1); 852: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); 853: break; 854: 855: /*-----------------------------------------------------------------*/ 856: case OP_SKIPZERO: 857: code += 1 + GET(code, 2); 858: while (*code == OP_ALT) code += GET(code, 1); 859: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); 860: break; 861: 862: /*-----------------------------------------------------------------*/ 863: case OP_CIRC: 864: if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) 865: { ADD_ACTIVE(state_offset + 1, 0); } 866: break; 867: 868: /*-----------------------------------------------------------------*/ 869: case OP_CIRCM: 870: if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) || 871: (ptr != end_subject && WAS_NEWLINE(ptr))) 872: { ADD_ACTIVE(state_offset + 1, 0); } 873: break; 874: 875: 
/*-----------------------------------------------------------------*/ 876: case OP_EOD: 877: if (ptr >= end_subject) 878: { 879: if ((md->moptions & PCRE_PARTIAL_HARD) != 0) 880: could_continue = TRUE; 881: else { ADD_ACTIVE(state_offset + 1, 0); } 882: } 883: break; 884: 885: /*-----------------------------------------------------------------*/ 886: case OP_SOD: 887: if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); } 888: break; 889: 890: /*-----------------------------------------------------------------*/ 891: case OP_SOM: 892: if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); } 893: break; 894: 895: 896: /* ========================================================================== */ 897: /* These opcodes inspect the next subject character, and sometimes 898: the previous one as well, but do not have an argument. The variable 899: clen contains the length of the current character and is zero if we are 900: at the end of the subject. */ 901: 902: /*-----------------------------------------------------------------*/ 903: case OP_ANY: 904: if (clen > 0 && !IS_NEWLINE(ptr)) 905: { 906: if (ptr + 1 >= md->end_subject && 907: (md->moptions & (PCRE_PARTIAL_HARD)) != 0 && 908: NLBLOCK->nltype == NLTYPE_FIXED && 909: NLBLOCK->nllen == 2 && 910: c == NLBLOCK->nl[0]) 911: { 912: could_continue = partial_newline = TRUE; 913: } 914: else 915: { 916: ADD_NEW(state_offset + 1, 0); 917: } 918: } 919: break; 920: 921: /*-----------------------------------------------------------------*/ 922: case OP_ALLANY: 923: if (clen > 0) 924: { ADD_NEW(state_offset + 1, 0); } 925: break; 926: 927: /*-----------------------------------------------------------------*/ 928: case OP_EODN: 929: if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0) 930: could_continue = TRUE; 931: else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen)) 932: { ADD_ACTIVE(state_offset + 1, 0); } 933: break; 934: 935: 
/*-----------------------------------------------------------------*/ 936: case OP_DOLL: 937: if ((md->moptions & PCRE_NOTEOL) == 0) 938: { 939: if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0) 940: could_continue = TRUE; 941: else if (clen == 0 || 942: ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) && 943: (ptr == end_subject - md->nllen) 944: )) 945: { ADD_ACTIVE(state_offset + 1, 0); } 946: else if (ptr + 1 >= md->end_subject && 947: (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 && 948: NLBLOCK->nltype == NLTYPE_FIXED && 949: NLBLOCK->nllen == 2 && 950: c == NLBLOCK->nl[0]) 951: { 952: if ((md->moptions & PCRE_PARTIAL_HARD) != 0) 953: { 954: reset_could_continue = TRUE; 955: ADD_NEW_DATA(-(state_offset + 1), 0, 1); 956: } 957: else could_continue = partial_newline = TRUE; 958: } 959: } 960: break; 961: 962: /*-----------------------------------------------------------------*/ 963: case OP_DOLLM: 964: if ((md->moptions & PCRE_NOTEOL) == 0) 965: { 966: if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0) 967: could_continue = TRUE; 968: else if (clen == 0 || 969: ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr))) 970: { ADD_ACTIVE(state_offset + 1, 0); } 971: else if (ptr + 1 >= md->end_subject && 972: (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 && 973: NLBLOCK->nltype == NLTYPE_FIXED && 974: NLBLOCK->nllen == 2 && 975: c == NLBLOCK->nl[0]) 976: { 977: if ((md->moptions & PCRE_PARTIAL_HARD) != 0) 978: { 979: reset_could_continue = TRUE; 980: ADD_NEW_DATA(-(state_offset + 1), 0, 1); 981: } 982: else could_continue = partial_newline = TRUE; 983: } 984: } 985: else if (IS_NEWLINE(ptr)) 986: { ADD_ACTIVE(state_offset + 1, 0); } 987: break; 988: 989: /*-----------------------------------------------------------------*/ 990: 991: case OP_DIGIT: 992: case OP_WHITESPACE: 993: case OP_WORDCHAR: 994: if (clen > 0 && c < 256 && 995: ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0) 996: { 
ADD_NEW(state_offset + 1, 0); } 997: break; 998: 999: /*-----------------------------------------------------------------*/ 1000: case OP_NOT_DIGIT: 1001: case OP_NOT_WHITESPACE: 1002: case OP_NOT_WORDCHAR: 1003: if (clen > 0 && (c >= 256 || 1004: ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)) 1005: { ADD_NEW(state_offset + 1, 0); } 1006: break; 1007: 1008: /*-----------------------------------------------------------------*/ 1009: case OP_WORD_BOUNDARY: 1010: case OP_NOT_WORD_BOUNDARY: 1011: { 1012: int left_word, right_word; 1013: 1014: if (ptr > start_subject) 1015: { 1016: const pcre_uchar *temp = ptr - 1; 1017: if (temp < md->start_used_ptr) md->start_used_ptr = temp; 1018: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32 1019: if (utf) { BACKCHAR(temp); } 1020: #endif 1021: GETCHARTEST(d, temp); 1022: #ifdef SUPPORT_UCP 1023: if ((md->poptions & PCRE_UCP) != 0) 1024: { 1025: if (d == '_') left_word = TRUE; else 1026: { 1027: int cat = UCD_CATEGORY(d); 1028: left_word = (cat == ucp_L || cat == ucp_N); 1029: } 1030: } 1031: else 1032: #endif 1033: left_word = d < 256 && (ctypes[d] & ctype_word) != 0; 1034: } 1035: else left_word = FALSE; 1036: 1037: if (clen > 0) 1038: { 1039: #ifdef SUPPORT_UCP 1040: if ((md->poptions & PCRE_UCP) != 0) 1041: { 1042: if (c == '_') right_word = TRUE; else 1043: { 1044: int cat = UCD_CATEGORY(c); 1045: right_word = (cat == ucp_L || cat == ucp_N); 1046: } 1047: } 1048: else 1049: #endif 1050: right_word = c < 256 && (ctypes[c] & ctype_word) != 0; 1051: } 1052: else right_word = FALSE; 1053: 1054: if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY)) 1055: { ADD_ACTIVE(state_offset + 1, 0); } 1056: } 1057: break; 1058: 1059: 1060: /*-----------------------------------------------------------------*/ 1061: /* Check the next character by Unicode property. We will get here only 1062: if the support is in the binary; otherwise a compile-time error occurs. 
1063: */ 1064: 1065: #ifdef SUPPORT_UCP 1066: case OP_PROP: 1067: case OP_NOTPROP: 1068: if (clen > 0) 1069: { 1070: BOOL OK; 1071: const pcre_uint32 *cp; 1072: const ucd_record * prop = GET_UCD(c); 1073: switch(code[1]) 1074: { 1075: case PT_ANY: 1076: OK = TRUE; 1077: break; 1078: 1079: case PT_LAMP: 1080: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || 1081: prop->chartype == ucp_Lt; 1082: break; 1083: 1084: case PT_GC: 1085: OK = PRIV(ucp_gentype)[prop->chartype] == code[2]; 1086: break; 1087: 1088: case PT_PC: 1089: OK = prop->chartype == code[2]; 1090: break; 1091: 1092: case PT_SC: 1093: OK = prop->script == code[2]; 1094: break; 1095: 1096: /* These are specials for combination cases. */ 1097: 1098: case PT_ALNUM: 1099: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1100: PRIV(ucp_gentype)[prop->chartype] == ucp_N; 1101: break; 1102: 1103: /* Perl space used to exclude VT, but from Perl 5.18 it is included, 1104: which means that Perl space and POSIX space are now identical. PCRE 1105: was changed at release 8.34. */ 1106: 1107: case PT_SPACE: /* Perl space */ 1108: case PT_PXSPACE: /* POSIX space */ 1109: switch(c) 1110: { 1111: HSPACE_CASES: 1112: VSPACE_CASES: 1113: OK = TRUE; 1114: break; 1115: 1116: default: 1117: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; 1118: break; 1119: } 1120: break; 1121: 1122: case PT_WORD: 1123: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1124: PRIV(ucp_gentype)[prop->chartype] == ucp_N || 1125: c == CHAR_UNDERSCORE; 1126: break; 1127: 1128: case PT_CLIST: 1129: cp = PRIV(ucd_caseless_sets) + code[2]; 1130: for (;;) 1131: { 1132: if (c < *cp) { OK = FALSE; break; } 1133: if (c == *cp++) { OK = TRUE; break; } 1134: } 1135: break; 1136: 1137: case PT_UCNC: 1138: OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 1139: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 1140: c >= 0xe000; 1141: break; 1142: 1143: /* Should never occur, but keep compilers from grumbling. 
*/ 1144: 1145: default: 1146: OK = codevalue != OP_PROP; 1147: break; 1148: } 1149: 1150: if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); } 1151: } 1152: break; 1153: #endif 1154: 1155: 1156: 1157: /* ========================================================================== */ 1158: /* These opcodes likewise inspect the subject character, but have an 1159: argument that is not a data character. It is one of these opcodes: 1160: OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE, 1161: OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */ 1162: 1163: case OP_TYPEPLUS: 1164: case OP_TYPEMINPLUS: 1165: case OP_TYPEPOSPLUS: 1166: count = current_state->count; /* Already matched */ 1167: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1168: if (clen > 0) 1169: { 1170: if (d == OP_ANY && ptr + 1 >= md->end_subject && 1171: (md->moptions & (PCRE_PARTIAL_HARD)) != 0 && 1172: NLBLOCK->nltype == NLTYPE_FIXED && 1173: NLBLOCK->nllen == 2 && 1174: c == NLBLOCK->nl[0]) 1175: { 1176: could_continue = partial_newline = TRUE; 1177: } 1178: else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1179: (c < 256 && 1180: (d != OP_ANY || !IS_NEWLINE(ptr)) && 1181: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1182: { 1183: if (count > 0 && codevalue == OP_TYPEPOSPLUS) 1184: { 1185: active_count--; /* Remove non-match possibility */ 1186: next_active_state--; 1187: } 1188: count++; 1189: ADD_NEW(state_offset, count); 1190: } 1191: } 1192: break; 1193: 1194: /*-----------------------------------------------------------------*/ 1195: case OP_TYPEQUERY: 1196: case OP_TYPEMINQUERY: 1197: case OP_TYPEPOSQUERY: 1198: ADD_ACTIVE(state_offset + 2, 0); 1199: if (clen > 0) 1200: { 1201: if (d == OP_ANY && ptr + 1 >= md->end_subject && 1202: (md->moptions & (PCRE_PARTIAL_HARD)) != 0 && 1203: NLBLOCK->nltype == NLTYPE_FIXED && 1204: NLBLOCK->nllen == 2 && 1205: c == NLBLOCK->nl[0]) 1206: { 1207: could_continue = 
partial_newline = TRUE; 1208: } 1209: else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1210: (c < 256 && 1211: (d != OP_ANY || !IS_NEWLINE(ptr)) && 1212: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1213: { 1214: if (codevalue == OP_TYPEPOSQUERY) 1215: { 1216: active_count--; /* Remove non-match possibility */ 1217: next_active_state--; 1218: } 1219: ADD_NEW(state_offset + 2, 0); 1220: } 1221: } 1222: break; 1223: 1224: /*-----------------------------------------------------------------*/ 1225: case OP_TYPESTAR: 1226: case OP_TYPEMINSTAR: 1227: case OP_TYPEPOSSTAR: 1228: ADD_ACTIVE(state_offset + 2, 0); 1229: if (clen > 0) 1230: { 1231: if (d == OP_ANY && ptr + 1 >= md->end_subject && 1232: (md->moptions & (PCRE_PARTIAL_HARD)) != 0 && 1233: NLBLOCK->nltype == NLTYPE_FIXED && 1234: NLBLOCK->nllen == 2 && 1235: c == NLBLOCK->nl[0]) 1236: { 1237: could_continue = partial_newline = TRUE; 1238: } 1239: else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1240: (c < 256 && 1241: (d != OP_ANY || !IS_NEWLINE(ptr)) && 1242: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1243: { 1244: if (codevalue == OP_TYPEPOSSTAR) 1245: { 1246: active_count--; /* Remove non-match possibility */ 1247: next_active_state--; 1248: } 1249: ADD_NEW(state_offset, 0); 1250: } 1251: } 1252: break; 1253: 1254: /*-----------------------------------------------------------------*/ 1255: case OP_TYPEEXACT: 1256: count = current_state->count; /* Number already matched */ 1257: if (clen > 0) 1258: { 1259: if (d == OP_ANY && ptr + 1 >= md->end_subject && 1260: (md->moptions & (PCRE_PARTIAL_HARD)) != 0 && 1261: NLBLOCK->nltype == NLTYPE_FIXED && 1262: NLBLOCK->nllen == 2 && 1263: c == NLBLOCK->nl[0]) 1264: { 1265: could_continue = partial_newline = TRUE; 1266: } 1267: else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1268: (c < 256 && 1269: (d != OP_ANY || !IS_NEWLINE(ptr)) && 1270: ((ctypes[c] & 
toptable1[d]) ^ toptable2[d]) != 0)) 1271: { 1272: if (++count >= (int)GET2(code, 1)) 1273: { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); } 1274: else 1275: { ADD_NEW(state_offset, count); } 1276: } 1277: } 1278: break; 1279: 1280: /*-----------------------------------------------------------------*/ 1281: case OP_TYPEUPTO: 1282: case OP_TYPEMINUPTO: 1283: case OP_TYPEPOSUPTO: 1284: ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); 1285: count = current_state->count; /* Number already matched */ 1286: if (clen > 0) 1287: { 1288: if (d == OP_ANY && ptr + 1 >= md->end_subject && 1289: (md->moptions & (PCRE_PARTIAL_HARD)) != 0 && 1290: NLBLOCK->nltype == NLTYPE_FIXED && 1291: NLBLOCK->nllen == 2 && 1292: c == NLBLOCK->nl[0]) 1293: { 1294: could_continue = partial_newline = TRUE; 1295: } 1296: else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1297: (c < 256 && 1298: (d != OP_ANY || !IS_NEWLINE(ptr)) && 1299: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1300: { 1301: if (codevalue == OP_TYPEPOSUPTO) 1302: { 1303: active_count--; /* Remove non-match possibility */ 1304: next_active_state--; 1305: } 1306: if (++count >= (int)GET2(code, 1)) 1307: { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); } 1308: else 1309: { ADD_NEW(state_offset, count); } 1310: } 1311: } 1312: break; 1313: 1314: /* ========================================================================== */ 1315: /* These are virtual opcodes that are used when something like 1316: OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its 1317: argument. It keeps the code above fast for the other cases. The argument 1318: is in the d variable. 
*/ 1319: 1320: #ifdef SUPPORT_UCP 1321: case OP_PROP_EXTRA + OP_TYPEPLUS: 1322: case OP_PROP_EXTRA + OP_TYPEMINPLUS: 1323: case OP_PROP_EXTRA + OP_TYPEPOSPLUS: 1324: count = current_state->count; /* Already matched */ 1325: if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); } 1326: if (clen > 0) 1327: { 1328: BOOL OK; 1329: const pcre_uint32 *cp; 1330: const ucd_record * prop = GET_UCD(c); 1331: switch(code[2]) 1332: { 1333: case PT_ANY: 1334: OK = TRUE; 1335: break; 1336: 1337: case PT_LAMP: 1338: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || 1339: prop->chartype == ucp_Lt; 1340: break; 1341: 1342: case PT_GC: 1343: OK = PRIV(ucp_gentype)[prop->chartype] == code[3]; 1344: break; 1345: 1346: case PT_PC: 1347: OK = prop->chartype == code[3]; 1348: break; 1349: 1350: case PT_SC: 1351: OK = prop->script == code[3]; 1352: break; 1353: 1354: /* These are specials for combination cases. */ 1355: 1356: case PT_ALNUM: 1357: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1358: PRIV(ucp_gentype)[prop->chartype] == ucp_N; 1359: break; 1360: 1361: /* Perl space used to exclude VT, but from Perl 5.18 it is included, 1362: which means that Perl space and POSIX space are now identical. PCRE 1363: was changed at release 8.34. 
*/ 1364: 1365: case PT_SPACE: /* Perl space */ 1366: case PT_PXSPACE: /* POSIX space */ 1367: switch(c) 1368: { 1369: HSPACE_CASES: 1370: VSPACE_CASES: 1371: OK = TRUE; 1372: break; 1373: 1374: default: 1375: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; 1376: break; 1377: } 1378: break; 1379: 1380: case PT_WORD: 1381: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1382: PRIV(ucp_gentype)[prop->chartype] == ucp_N || 1383: c == CHAR_UNDERSCORE; 1384: break; 1385: 1386: case PT_CLIST: 1387: cp = PRIV(ucd_caseless_sets) + code[3]; 1388: for (;;) 1389: { 1390: if (c < *cp) { OK = FALSE; break; } 1391: if (c == *cp++) { OK = TRUE; break; } 1392: } 1393: break; 1394: 1395: case PT_UCNC: 1396: OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 1397: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 1398: c >= 0xe000; 1399: break; 1400: 1401: /* Should never occur, but keep compilers from grumbling. */ 1402: 1403: default: 1404: OK = codevalue != OP_PROP; 1405: break; 1406: } 1407: 1408: if (OK == (d == OP_PROP)) 1409: { 1410: if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS) 1411: { 1412: active_count--; /* Remove non-match possibility */ 1413: next_active_state--; 1414: } 1415: count++; 1416: ADD_NEW(state_offset, count); 1417: } 1418: } 1419: break; 1420: 1421: /*-----------------------------------------------------------------*/ 1422: case OP_EXTUNI_EXTRA + OP_TYPEPLUS: 1423: case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS: 1424: case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS: 1425: count = current_state->count; /* Already matched */ 1426: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1427: if (clen > 0) 1428: { 1429: int lgb, rgb; 1430: const pcre_uchar *nptr = ptr + clen; 1431: int ncount = 0; 1432: if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS) 1433: { 1434: active_count--; /* Remove non-match possibility */ 1435: next_active_state--; 1436: } 1437: lgb = UCD_GRAPHBREAK(c); 1438: while (nptr < end_subject) 1439: { 1440: dlen = 1; 
1441: if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } 1442: rgb = UCD_GRAPHBREAK(d); 1443: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 1444: ncount++; 1445: lgb = rgb; 1446: nptr += dlen; 1447: } 1448: count++; 1449: ADD_NEW_DATA(-state_offset, count, ncount); 1450: } 1451: break; 1452: #endif 1453: 1454: /*-----------------------------------------------------------------*/ 1455: case OP_ANYNL_EXTRA + OP_TYPEPLUS: 1456: case OP_ANYNL_EXTRA + OP_TYPEMINPLUS: 1457: case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS: 1458: count = current_state->count; /* Already matched */ 1459: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1460: if (clen > 0) 1461: { 1462: int ncount = 0; 1463: switch (c) 1464: { 1465: case CHAR_VT: 1466: case CHAR_FF: 1467: case CHAR_NEL: 1468: #ifndef EBCDIC 1469: case 0x2028: 1470: case 0x2029: 1471: #endif /* Not EBCDIC */ 1472: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; 1473: goto ANYNL01; 1474: 1475: case CHAR_CR: 1476: if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1; 1477: /* Fall through */ 1478: 1479: ANYNL01: 1480: case CHAR_LF: 1481: if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS) 1482: { 1483: active_count--; /* Remove non-match possibility */ 1484: next_active_state--; 1485: } 1486: count++; 1487: ADD_NEW_DATA(-state_offset, count, ncount); 1488: break; 1489: 1490: default: 1491: break; 1492: } 1493: } 1494: break; 1495: 1496: /*-----------------------------------------------------------------*/ 1497: case OP_VSPACE_EXTRA + OP_TYPEPLUS: 1498: case OP_VSPACE_EXTRA + OP_TYPEMINPLUS: 1499: case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS: 1500: count = current_state->count; /* Already matched */ 1501: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1502: if (clen > 0) 1503: { 1504: BOOL OK; 1505: switch (c) 1506: { 1507: VSPACE_CASES: 1508: OK = TRUE; 1509: break; 1510: 1511: default: 1512: OK = FALSE; 1513: break; 1514: } 1515: 1516: if (OK == (d == OP_VSPACE)) 1517: { 1518: if (count 
> 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS) 1519: { 1520: active_count--; /* Remove non-match possibility */ 1521: next_active_state--; 1522: } 1523: count++; 1524: ADD_NEW_DATA(-state_offset, count, 0); 1525: } 1526: } 1527: break; 1528: 1529: /*-----------------------------------------------------------------*/ 1530: case OP_HSPACE_EXTRA + OP_TYPEPLUS: 1531: case OP_HSPACE_EXTRA + OP_TYPEMINPLUS: 1532: case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS: 1533: count = current_state->count; /* Already matched */ 1534: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1535: if (clen > 0) 1536: { 1537: BOOL OK; 1538: switch (c) 1539: { 1540: HSPACE_CASES: 1541: OK = TRUE; 1542: break; 1543: 1544: default: 1545: OK = FALSE; 1546: break; 1547: } 1548: 1549: if (OK == (d == OP_HSPACE)) 1550: { 1551: if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS) 1552: { 1553: active_count--; /* Remove non-match possibility */ 1554: next_active_state--; 1555: } 1556: count++; 1557: ADD_NEW_DATA(-state_offset, count, 0); 1558: } 1559: } 1560: break; 1561: 1562: /*-----------------------------------------------------------------*/ 1563: #ifdef SUPPORT_UCP 1564: case OP_PROP_EXTRA + OP_TYPEQUERY: 1565: case OP_PROP_EXTRA + OP_TYPEMINQUERY: 1566: case OP_PROP_EXTRA + OP_TYPEPOSQUERY: 1567: count = 4; 1568: goto QS1; 1569: 1570: case OP_PROP_EXTRA + OP_TYPESTAR: 1571: case OP_PROP_EXTRA + OP_TYPEMINSTAR: 1572: case OP_PROP_EXTRA + OP_TYPEPOSSTAR: 1573: count = 0; 1574: 1575: QS1: 1576: 1577: ADD_ACTIVE(state_offset + 4, 0); 1578: if (clen > 0) 1579: { 1580: BOOL OK; 1581: const pcre_uint32 *cp; 1582: const ucd_record * prop = GET_UCD(c); 1583: switch(code[2]) 1584: { 1585: case PT_ANY: 1586: OK = TRUE; 1587: break; 1588: 1589: case PT_LAMP: 1590: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || 1591: prop->chartype == ucp_Lt; 1592: break; 1593: 1594: case PT_GC: 1595: OK = PRIV(ucp_gentype)[prop->chartype] == code[3]; 1596: break; 1597: 1598: case PT_PC: 1599: OK = 
prop->chartype == code[3]; 1600: break; 1601: 1602: case PT_SC: 1603: OK = prop->script == code[3]; 1604: break; 1605: 1606: /* These are specials for combination cases. */ 1607: 1608: case PT_ALNUM: 1609: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1610: PRIV(ucp_gentype)[prop->chartype] == ucp_N; 1611: break; 1612: 1613: /* Perl space used to exclude VT, but from Perl 5.18 it is included, 1614: which means that Perl space and POSIX space are now identical. PCRE 1615: was changed at release 8.34. */ 1616: 1617: case PT_SPACE: /* Perl space */ 1618: case PT_PXSPACE: /* POSIX space */ 1619: switch(c) 1620: { 1621: HSPACE_CASES: 1622: VSPACE_CASES: 1623: OK = TRUE; 1624: break; 1625: 1626: default: 1627: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; 1628: break; 1629: } 1630: break; 1631: 1632: case PT_WORD: 1633: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1634: PRIV(ucp_gentype)[prop->chartype] == ucp_N || 1635: c == CHAR_UNDERSCORE; 1636: break; 1637: 1638: case PT_CLIST: 1639: cp = PRIV(ucd_caseless_sets) + code[3]; 1640: for (;;) 1641: { 1642: if (c < *cp) { OK = FALSE; break; } 1643: if (c == *cp++) { OK = TRUE; break; } 1644: } 1645: break; 1646: 1647: case PT_UCNC: 1648: OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 1649: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 1650: c >= 0xe000; 1651: break; 1652: 1653: /* Should never occur, but keep compilers from grumbling. 
*/ 1654: 1655: default: 1656: OK = codevalue != OP_PROP; 1657: break; 1658: } 1659: 1660: if (OK == (d == OP_PROP)) 1661: { 1662: if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR || 1663: codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY) 1664: { 1665: active_count--; /* Remove non-match possibility */ 1666: next_active_state--; 1667: } 1668: ADD_NEW(state_offset + count, 0); 1669: } 1670: } 1671: break; 1672: 1673: /*-----------------------------------------------------------------*/ 1674: case OP_EXTUNI_EXTRA + OP_TYPEQUERY: 1675: case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY: 1676: case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY: 1677: count = 2; 1678: goto QS2; 1679: 1680: case OP_EXTUNI_EXTRA + OP_TYPESTAR: 1681: case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR: 1682: case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR: 1683: count = 0; 1684: 1685: QS2: 1686: 1687: ADD_ACTIVE(state_offset + 2, 0); 1688: if (clen > 0) 1689: { 1690: int lgb, rgb; 1691: const pcre_uchar *nptr = ptr + clen; 1692: int ncount = 0; 1693: if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR || 1694: codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY) 1695: { 1696: active_count--; /* Remove non-match possibility */ 1697: next_active_state--; 1698: } 1699: lgb = UCD_GRAPHBREAK(c); 1700: while (nptr < end_subject) 1701: { 1702: dlen = 1; 1703: if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } 1704: rgb = UCD_GRAPHBREAK(d); 1705: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 1706: ncount++; 1707: lgb = rgb; 1708: nptr += dlen; 1709: } 1710: ADD_NEW_DATA(-(state_offset + count), 0, ncount); 1711: } 1712: break; 1713: #endif 1714: 1715: /*-----------------------------------------------------------------*/ 1716: case OP_ANYNL_EXTRA + OP_TYPEQUERY: 1717: case OP_ANYNL_EXTRA + OP_TYPEMINQUERY: 1718: case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY: 1719: count = 2; 1720: goto QS3; 1721: 1722: case OP_ANYNL_EXTRA + OP_TYPESTAR: 1723: case OP_ANYNL_EXTRA + OP_TYPEMINSTAR: 1724: case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR: 1725: count = 0; 1726: 1727: 
QS3: 1728: ADD_ACTIVE(state_offset + 2, 0); 1729: if (clen > 0) 1730: { 1731: int ncount = 0; 1732: switch (c) 1733: { 1734: case CHAR_VT: 1735: case CHAR_FF: 1736: case CHAR_NEL: 1737: #ifndef EBCDIC 1738: case 0x2028: 1739: case 0x2029: 1740: #endif /* Not EBCDIC */ 1741: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; 1742: goto ANYNL02; 1743: 1744: case CHAR_CR: 1745: if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1; 1746: /* Fall through */ 1747: 1748: ANYNL02: 1749: case CHAR_LF: 1750: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR || 1751: codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY) 1752: { 1753: active_count--; /* Remove non-match possibility */ 1754: next_active_state--; 1755: } 1756: ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount); 1757: break; 1758: 1759: default: 1760: break; 1761: } 1762: } 1763: break; 1764: 1765: /*-----------------------------------------------------------------*/ 1766: case OP_VSPACE_EXTRA + OP_TYPEQUERY: 1767: case OP_VSPACE_EXTRA + OP_TYPEMINQUERY: 1768: case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY: 1769: count = 2; 1770: goto QS4; 1771: 1772: case OP_VSPACE_EXTRA + OP_TYPESTAR: 1773: case OP_VSPACE_EXTRA + OP_TYPEMINSTAR: 1774: case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR: 1775: count = 0; 1776: 1777: QS4: 1778: ADD_ACTIVE(state_offset + 2, 0); 1779: if (clen > 0) 1780: { 1781: BOOL OK; 1782: switch (c) 1783: { 1784: VSPACE_CASES: 1785: OK = TRUE; 1786: break; 1787: 1788: default: 1789: OK = FALSE; 1790: break; 1791: } 1792: if (OK == (d == OP_VSPACE)) 1793: { 1794: if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR || 1795: codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY) 1796: { 1797: active_count--; /* Remove non-match possibility */ 1798: next_active_state--; 1799: } 1800: ADD_NEW_DATA(-(state_offset + (int)count), 0, 0); 1801: } 1802: } 1803: break; 1804: 1805: /*-----------------------------------------------------------------*/ 1806: case OP_HSPACE_EXTRA + OP_TYPEQUERY: 1807: case OP_HSPACE_EXTRA + 
OP_TYPEMINQUERY: 1808: case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY: 1809: count = 2; 1810: goto QS5; 1811: 1812: case OP_HSPACE_EXTRA + OP_TYPESTAR: 1813: case OP_HSPACE_EXTRA + OP_TYPEMINSTAR: 1814: case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR: 1815: count = 0; 1816: 1817: QS5: 1818: ADD_ACTIVE(state_offset + 2, 0); 1819: if (clen > 0) 1820: { 1821: BOOL OK; 1822: switch (c) 1823: { 1824: HSPACE_CASES: 1825: OK = TRUE; 1826: break; 1827: 1828: default: 1829: OK = FALSE; 1830: break; 1831: } 1832: 1833: if (OK == (d == OP_HSPACE)) 1834: { 1835: if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR || 1836: codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY) 1837: { 1838: active_count--; /* Remove non-match possibility */ 1839: next_active_state--; 1840: } 1841: ADD_NEW_DATA(-(state_offset + (int)count), 0, 0); 1842: } 1843: } 1844: break; 1845: 1846: /*-----------------------------------------------------------------*/ 1847: #ifdef SUPPORT_UCP 1848: case OP_PROP_EXTRA + OP_TYPEEXACT: 1849: case OP_PROP_EXTRA + OP_TYPEUPTO: 1850: case OP_PROP_EXTRA + OP_TYPEMINUPTO: 1851: case OP_PROP_EXTRA + OP_TYPEPOSUPTO: 1852: if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT) 1853: { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); } 1854: count = current_state->count; /* Number already matched */ 1855: if (clen > 0) 1856: { 1857: BOOL OK; 1858: const pcre_uint32 *cp; 1859: const ucd_record * prop = GET_UCD(c); 1860: switch(code[1 + IMM2_SIZE + 1]) 1861: { 1862: case PT_ANY: 1863: OK = TRUE; 1864: break; 1865: 1866: case PT_LAMP: 1867: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || 1868: prop->chartype == ucp_Lt; 1869: break; 1870: 1871: case PT_GC: 1872: OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2]; 1873: break; 1874: 1875: case PT_PC: 1876: OK = prop->chartype == code[1 + IMM2_SIZE + 2]; 1877: break; 1878: 1879: case PT_SC: 1880: OK = prop->script == code[1 + IMM2_SIZE + 2]; 1881: break; 1882: 1883: /* These are specials for combination cases. 
*/ 1884: 1885: case PT_ALNUM: 1886: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1887: PRIV(ucp_gentype)[prop->chartype] == ucp_N; 1888: break; 1889: 1890: /* Perl space used to exclude VT, but from Perl 5.18 it is included, 1891: which means that Perl space and POSIX space are now identical. PCRE 1892: was changed at release 8.34. */ 1893: 1894: case PT_SPACE: /* Perl space */ 1895: case PT_PXSPACE: /* POSIX space */ 1896: switch(c) 1897: { 1898: HSPACE_CASES: 1899: VSPACE_CASES: 1900: OK = TRUE; 1901: break; 1902: 1903: default: 1904: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; 1905: break; 1906: } 1907: break; 1908: 1909: case PT_WORD: 1910: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1911: PRIV(ucp_gentype)[prop->chartype] == ucp_N || 1912: c == CHAR_UNDERSCORE; 1913: break; 1914: 1915: case PT_CLIST: 1916: cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2]; 1917: for (;;) 1918: { 1919: if (c < *cp) { OK = FALSE; break; } 1920: if (c == *cp++) { OK = TRUE; break; } 1921: } 1922: break; 1923: 1924: case PT_UCNC: 1925: OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 1926: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 1927: c >= 0xe000; 1928: break; 1929: 1930: /* Should never occur, but keep compilers from grumbling. 
*/ 1931: 1932: default: 1933: OK = codevalue != OP_PROP; 1934: break; 1935: } 1936: 1937: if (OK == (d == OP_PROP)) 1938: { 1939: if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO) 1940: { 1941: active_count--; /* Remove non-match possibility */ 1942: next_active_state--; 1943: } 1944: if (++count >= (int)GET2(code, 1)) 1945: { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); } 1946: else 1947: { ADD_NEW(state_offset, count); } 1948: } 1949: } 1950: break; 1951: 1952: /*-----------------------------------------------------------------*/ 1953: case OP_EXTUNI_EXTRA + OP_TYPEEXACT: 1954: case OP_EXTUNI_EXTRA + OP_TYPEUPTO: 1955: case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO: 1956: case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO: 1957: if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT) 1958: { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } 1959: count = current_state->count; /* Number already matched */ 1960: if (clen > 0) 1961: { 1962: int lgb, rgb; 1963: const pcre_uchar *nptr = ptr + clen; 1964: int ncount = 0; 1965: if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO) 1966: { 1967: active_count--; /* Remove non-match possibility */ 1968: next_active_state--; 1969: } 1970: lgb = UCD_GRAPHBREAK(c); 1971: while (nptr < end_subject) 1972: { 1973: dlen = 1; 1974: if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } 1975: rgb = UCD_GRAPHBREAK(d); 1976: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 1977: ncount++; 1978: lgb = rgb; 1979: nptr += dlen; 1980: } 1981: if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0) 1982: reset_could_continue = TRUE; 1983: if (++count >= (int)GET2(code, 1)) 1984: { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); } 1985: else 1986: { ADD_NEW_DATA(-state_offset, count, ncount); } 1987: } 1988: break; 1989: #endif 1990: 1991: /*-----------------------------------------------------------------*/ 1992: case OP_ANYNL_EXTRA + OP_TYPEEXACT: 1993: case OP_ANYNL_EXTRA + OP_TYPEUPTO: 1994: case OP_ANYNL_EXTRA + OP_TYPEMINUPTO: 1995: case 
OP_ANYNL_EXTRA + OP_TYPEPOSUPTO: 1996: if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT) 1997: { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } 1998: count = current_state->count; /* Number already matched */ 1999: if (clen > 0) 2000: { 2001: int ncount = 0; 2002: switch (c) 2003: { 2004: case CHAR_VT: 2005: case CHAR_FF: 2006: case CHAR_NEL: 2007: #ifndef EBCDIC 2008: case 0x2028: 2009: case 0x2029: 2010: #endif /* Not EBCDIC */ 2011: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; 2012: goto ANYNL03; 2013: 2014: case CHAR_CR: 2015: if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1; 2016: /* Fall through */ 2017: 2018: ANYNL03: 2019: case CHAR_LF: 2020: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO) 2021: { 2022: active_count--; /* Remove non-match possibility */ 2023: next_active_state--; 2024: } 2025: if (++count >= (int)GET2(code, 1)) 2026: { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); } 2027: else 2028: { ADD_NEW_DATA(-state_offset, count, ncount); } 2029: break; 2030: 2031: default: 2032: break; 2033: } 2034: } 2035: break; 2036: 2037: /*-----------------------------------------------------------------*/ 2038: case OP_VSPACE_EXTRA + OP_TYPEEXACT: 2039: case OP_VSPACE_EXTRA + OP_TYPEUPTO: 2040: case OP_VSPACE_EXTRA + OP_TYPEMINUPTO: 2041: case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO: 2042: if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT) 2043: { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } 2044: count = current_state->count; /* Number already matched */ 2045: if (clen > 0) 2046: { 2047: BOOL OK; 2048: switch (c) 2049: { 2050: VSPACE_CASES: 2051: OK = TRUE; 2052: break; 2053: 2054: default: 2055: OK = FALSE; 2056: } 2057: 2058: if (OK == (d == OP_VSPACE)) 2059: { 2060: if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO) 2061: { 2062: active_count--; /* Remove non-match possibility */ 2063: next_active_state--; 2064: } 2065: if (++count >= (int)GET2(code, 1)) 2066: { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); } 2067: 
else 2068: { ADD_NEW_DATA(-state_offset, count, 0); } 2069: } 2070: } 2071: break; 2072: 2073: /*-----------------------------------------------------------------*/ 2074: case OP_HSPACE_EXTRA + OP_TYPEEXACT: 2075: case OP_HSPACE_EXTRA + OP_TYPEUPTO: 2076: case OP_HSPACE_EXTRA + OP_TYPEMINUPTO: 2077: case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO: 2078: if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT) 2079: { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } 2080: count = current_state->count; /* Number already matched */ 2081: if (clen > 0) 2082: { 2083: BOOL OK; 2084: switch (c) 2085: { 2086: HSPACE_CASES: 2087: OK = TRUE; 2088: break; 2089: 2090: default: 2091: OK = FALSE; 2092: break; 2093: } 2094: 2095: if (OK == (d == OP_HSPACE)) 2096: { 2097: if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO) 2098: { 2099: active_count--; /* Remove non-match possibility */ 2100: next_active_state--; 2101: } 2102: if (++count >= (int)GET2(code, 1)) 2103: { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); } 2104: else 2105: { ADD_NEW_DATA(-state_offset, count, 0); } 2106: } 2107: } 2108: break; 2109: 2110: /* ========================================================================== */ 2111: /* These opcodes are followed by a character that is usually compared 2112: to the current subject character; it is loaded into d. We still get 2113: here even if there is no subject character, because in some cases zero 2114: repetitions are permitted. 
*/ 2115: 2116: /*-----------------------------------------------------------------*/ 2117: case OP_CHAR: 2118: if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); } 2119: break; 2120: 2121: /*-----------------------------------------------------------------*/ 2122: case OP_CHARI: 2123: if (clen == 0) break; 2124: 2125: #ifdef SUPPORT_UTF 2126: if (utf) 2127: { 2128: if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else 2129: { 2130: unsigned int othercase; 2131: if (c < 128) 2132: othercase = fcc[c]; 2133: else 2134: /* If we have Unicode property support, we can use it to test the 2135: other case of the character. */ 2136: #ifdef SUPPORT_UCP 2137: othercase = UCD_OTHERCASE(c); 2138: #else 2139: othercase = NOTACHAR; 2140: #endif 2141: 2142: if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); } 2143: } 2144: } 2145: else 2146: #endif /* SUPPORT_UTF */ 2147: /* Not UTF mode */ 2148: { 2149: if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d)) 2150: { ADD_NEW(state_offset + 2, 0); } 2151: } 2152: break; 2153: 2154: 2155: #ifdef SUPPORT_UCP 2156: /*-----------------------------------------------------------------*/ 2157: /* This is a tricky one because it can match more than one character. 2158: Find out how many characters to skip, and then set up a negative state 2159: to wait for them to pass before continuing. 
*/ 2160: 2161: case OP_EXTUNI: 2162: if (clen > 0) 2163: { 2164: int lgb, rgb; 2165: const pcre_uchar *nptr = ptr + clen; 2166: int ncount = 0; 2167: lgb = UCD_GRAPHBREAK(c); 2168: while (nptr < end_subject) 2169: { 2170: dlen = 1; 2171: if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } 2172: rgb = UCD_GRAPHBREAK(d); 2173: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 2174: ncount++; 2175: lgb = rgb; 2176: nptr += dlen; 2177: } 2178: if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0) 2179: reset_could_continue = TRUE; 2180: ADD_NEW_DATA(-(state_offset + 1), 0, ncount); 2181: } 2182: break; 2183: #endif 2184: 2185: /*-----------------------------------------------------------------*/ 2186: /* This is a tricky one like EXTUNI because it too can match more than one 2187: character (when CR is followed by LF). In this case, set up a negative 2188: state to wait for one character to pass before continuing. */ 2189: 2190: case OP_ANYNL: 2191: if (clen > 0) switch(c) 2192: { 2193: case CHAR_VT: 2194: case CHAR_FF: 2195: case CHAR_NEL: 2196: #ifndef EBCDIC 2197: case 0x2028: 2198: case 0x2029: 2199: #endif /* Not EBCDIC */ 2200: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; 2201: 2202: case CHAR_LF: 2203: ADD_NEW(state_offset + 1, 0); 2204: break; 2205: 2206: case CHAR_CR: 2207: if (ptr + 1 >= end_subject) 2208: { 2209: ADD_NEW(state_offset + 1, 0); 2210: if ((md->moptions & PCRE_PARTIAL_HARD) != 0) 2211: reset_could_continue = TRUE; 2212: } 2213: else if (RAWUCHARTEST(ptr + 1) == CHAR_LF) 2214: { 2215: ADD_NEW_DATA(-(state_offset + 1), 0, 1); 2216: } 2217: else 2218: { 2219: ADD_NEW(state_offset + 1, 0); 2220: } 2221: break; 2222: } 2223: break; 2224: 2225: /*-----------------------------------------------------------------*/ 2226: case OP_NOT_VSPACE: 2227: if (clen > 0) switch(c) 2228: { 2229: VSPACE_CASES: 2230: break; 2231: 2232: default: 2233: ADD_NEW(state_offset + 1, 0); 2234: break; 2235: } 2236: break; 2237: 2238: 
/*-----------------------------------------------------------------*/ 2239: case OP_VSPACE: 2240: if (clen > 0) switch(c) 2241: { 2242: VSPACE_CASES: 2243: ADD_NEW(state_offset + 1, 0); 2244: break; 2245: 2246: default: 2247: break; 2248: } 2249: break; 2250: 2251: /*-----------------------------------------------------------------*/ 2252: case OP_NOT_HSPACE: 2253: if (clen > 0) switch(c) 2254: { 2255: HSPACE_CASES: 2256: break; 2257: 2258: default: 2259: ADD_NEW(state_offset + 1, 0); 2260: break; 2261: } 2262: break; 2263: 2264: /*-----------------------------------------------------------------*/ 2265: case OP_HSPACE: 2266: if (clen > 0) switch(c) 2267: { 2268: HSPACE_CASES: 2269: ADD_NEW(state_offset + 1, 0); 2270: break; 2271: 2272: default: 2273: break; 2274: } 2275: break; 2276: 2277: /*-----------------------------------------------------------------*/ 2278: /* Match a negated single character casefully. */ 2279: 2280: case OP_NOT: 2281: if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); } 2282: break; 2283: 2284: /*-----------------------------------------------------------------*/ 2285: /* Match a negated single character caselessly. 
*/ 2286: 2287: case OP_NOTI: 2288: if (clen > 0) 2289: { 2290: unsigned int otherd; 2291: #ifdef SUPPORT_UTF 2292: if (utf && d >= 128) 2293: { 2294: #ifdef SUPPORT_UCP 2295: otherd = UCD_OTHERCASE(d); 2296: #endif /* SUPPORT_UCP */ 2297: } 2298: else 2299: #endif /* SUPPORT_UTF */ 2300: otherd = TABLE_GET(d, fcc, d); 2301: if (c != d && c != otherd) 2302: { ADD_NEW(state_offset + dlen + 1, 0); } 2303: } 2304: break; 2305: 2306: /*-----------------------------------------------------------------*/ 2307: case OP_PLUSI: 2308: case OP_MINPLUSI: 2309: case OP_POSPLUSI: 2310: case OP_NOTPLUSI: 2311: case OP_NOTMINPLUSI: 2312: case OP_NOTPOSPLUSI: 2313: caseless = TRUE; 2314: codevalue -= OP_STARI - OP_STAR; 2315: 2316: /* Fall through */ 2317: case OP_PLUS: 2318: case OP_MINPLUS: 2319: case OP_POSPLUS: 2320: case OP_NOTPLUS: 2321: case OP_NOTMINPLUS: 2322: case OP_NOTPOSPLUS: 2323: count = current_state->count; /* Already matched */ 2324: if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); } 2325: if (clen > 0) 2326: { 2327: pcre_uint32 otherd = NOTACHAR; 2328: if (caseless) 2329: { 2330: #ifdef SUPPORT_UTF 2331: if (utf && d >= 128) 2332: { 2333: #ifdef SUPPORT_UCP 2334: otherd = UCD_OTHERCASE(d); 2335: #endif /* SUPPORT_UCP */ 2336: } 2337: else 2338: #endif /* SUPPORT_UTF */ 2339: otherd = TABLE_GET(d, fcc, d); 2340: } 2341: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2342: { 2343: if (count > 0 && 2344: (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS)) 2345: { 2346: active_count--; /* Remove non-match possibility */ 2347: next_active_state--; 2348: } 2349: count++; 2350: ADD_NEW(state_offset, count); 2351: } 2352: } 2353: break; 2354: 2355: /*-----------------------------------------------------------------*/ 2356: case OP_QUERYI: 2357: case OP_MINQUERYI: 2358: case OP_POSQUERYI: 2359: case OP_NOTQUERYI: 2360: case OP_NOTMINQUERYI: 2361: case OP_NOTPOSQUERYI: 2362: caseless = TRUE; 2363: codevalue -= OP_STARI - OP_STAR; 2364: /* Fall through 
*/ 2365: case OP_QUERY: 2366: case OP_MINQUERY: 2367: case OP_POSQUERY: 2368: case OP_NOTQUERY: 2369: case OP_NOTMINQUERY: 2370: case OP_NOTPOSQUERY: 2371: ADD_ACTIVE(state_offset + dlen + 1, 0); 2372: if (clen > 0) 2373: { 2374: pcre_uint32 otherd = NOTACHAR; 2375: if (caseless) 2376: { 2377: #ifdef SUPPORT_UTF 2378: if (utf && d >= 128) 2379: { 2380: #ifdef SUPPORT_UCP 2381: otherd = UCD_OTHERCASE(d); 2382: #endif /* SUPPORT_UCP */ 2383: } 2384: else 2385: #endif /* SUPPORT_UTF */ 2386: otherd = TABLE_GET(d, fcc, d); 2387: } 2388: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2389: { 2390: if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY) 2391: { 2392: active_count--; /* Remove non-match possibility */ 2393: next_active_state--; 2394: } 2395: ADD_NEW(state_offset + dlen + 1, 0); 2396: } 2397: } 2398: break; 2399: 2400: /*-----------------------------------------------------------------*/ 2401: case OP_STARI: 2402: case OP_MINSTARI: 2403: case OP_POSSTARI: 2404: case OP_NOTSTARI: 2405: case OP_NOTMINSTARI: 2406: case OP_NOTPOSSTARI: 2407: caseless = TRUE; 2408: codevalue -= OP_STARI - OP_STAR; 2409: /* Fall through */ 2410: case OP_STAR: 2411: case OP_MINSTAR: 2412: case OP_POSSTAR: 2413: case OP_NOTSTAR: 2414: case OP_NOTMINSTAR: 2415: case OP_NOTPOSSTAR: 2416: ADD_ACTIVE(state_offset + dlen + 1, 0); 2417: if (clen > 0) 2418: { 2419: pcre_uint32 otherd = NOTACHAR; 2420: if (caseless) 2421: { 2422: #ifdef SUPPORT_UTF 2423: if (utf && d >= 128) 2424: { 2425: #ifdef SUPPORT_UCP 2426: otherd = UCD_OTHERCASE(d); 2427: #endif /* SUPPORT_UCP */ 2428: } 2429: else 2430: #endif /* SUPPORT_UTF */ 2431: otherd = TABLE_GET(d, fcc, d); 2432: } 2433: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2434: { 2435: if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR) 2436: { 2437: active_count--; /* Remove non-match possibility */ 2438: next_active_state--; 2439: } 2440: ADD_NEW(state_offset, 0); 2441: } 2442: } 2443: break; 2444: 2445: 
/*-----------------------------------------------------------------*/ 2446: case OP_EXACTI: 2447: case OP_NOTEXACTI: 2448: caseless = TRUE; 2449: codevalue -= OP_STARI - OP_STAR; 2450: /* Fall through */ 2451: case OP_EXACT: 2452: case OP_NOTEXACT: 2453: count = current_state->count; /* Number already matched */ 2454: if (clen > 0) 2455: { 2456: pcre_uint32 otherd = NOTACHAR; 2457: if (caseless) 2458: { 2459: #ifdef SUPPORT_UTF 2460: if (utf && d >= 128) 2461: { 2462: #ifdef SUPPORT_UCP 2463: otherd = UCD_OTHERCASE(d); 2464: #endif /* SUPPORT_UCP */ 2465: } 2466: else 2467: #endif /* SUPPORT_UTF */ 2468: otherd = TABLE_GET(d, fcc, d); 2469: } 2470: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2471: { 2472: if (++count >= (int)GET2(code, 1)) 2473: { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); } 2474: else 2475: { ADD_NEW(state_offset, count); } 2476: } 2477: } 2478: break; 2479: 2480: /*-----------------------------------------------------------------*/ 2481: case OP_UPTOI: 2482: case OP_MINUPTOI: 2483: case OP_POSUPTOI: 2484: case OP_NOTUPTOI: 2485: case OP_NOTMINUPTOI: 2486: case OP_NOTPOSUPTOI: 2487: caseless = TRUE; 2488: codevalue -= OP_STARI - OP_STAR; 2489: /* Fall through */ 2490: case OP_UPTO: 2491: case OP_MINUPTO: 2492: case OP_POSUPTO: 2493: case OP_NOTUPTO: 2494: case OP_NOTMINUPTO: 2495: case OP_NOTPOSUPTO: 2496: ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0); 2497: count = current_state->count; /* Number already matched */ 2498: if (clen > 0) 2499: { 2500: pcre_uint32 otherd = NOTACHAR; 2501: if (caseless) 2502: { 2503: #ifdef SUPPORT_UTF 2504: if (utf && d >= 128) 2505: { 2506: #ifdef SUPPORT_UCP 2507: otherd = UCD_OTHERCASE(d); 2508: #endif /* SUPPORT_UCP */ 2509: } 2510: else 2511: #endif /* SUPPORT_UTF */ 2512: otherd = TABLE_GET(d, fcc, d); 2513: } 2514: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2515: { 2516: if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO) 2517: { 2518: active_count--; /* Remove 
non-match possibility */ 2519: next_active_state--; 2520: } 2521: if (++count >= (int)GET2(code, 1)) 2522: { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); } 2523: else 2524: { ADD_NEW(state_offset, count); } 2525: } 2526: } 2527: break; 2528: 2529: 2530: /* ========================================================================== */ 2531: /* These are the class-handling opcodes */ 2532: 2533: case OP_CLASS: 2534: case OP_NCLASS: 2535: case OP_XCLASS: 2536: { 2537: BOOL isinclass = FALSE; 2538: int next_state_offset; 2539: const pcre_uchar *ecode; 2540: 2541: /* For a simple class, there is always just a 32-byte table, and we 2542: can set isinclass from it. */ 2543: 2544: if (codevalue != OP_XCLASS) 2545: { 2546: ecode = code + 1 + (32 / sizeof(pcre_uchar)); 2547: if (clen > 0) 2548: { 2549: isinclass = (c > 255)? (codevalue == OP_NCLASS) : 2550: ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0); 2551: } 2552: } 2553: 2554: /* An extended class may have a table or a list of single characters, 2555: ranges, or both, and it may be positive or negative. There's a 2556: function that sorts all this out. */ 2557: 2558: else 2559: { 2560: ecode = code + GET(code, 1); 2561: if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf); 2562: } 2563: 2564: /* At this point, isinclass is set for all kinds of class, and ecode 2565: points to the byte after the end of the class. If there is a 2566: quantifier, this is where it will be. 
*/ 2567: 2568: next_state_offset = (int)(ecode - start_code); 2569: 2570: switch (*ecode) 2571: { 2572: case OP_CRSTAR: 2573: case OP_CRMINSTAR: 2574: case OP_CRPOSSTAR: 2575: ADD_ACTIVE(next_state_offset + 1, 0); 2576: if (isinclass) 2577: { 2578: if (*ecode == OP_CRPOSSTAR) 2579: { 2580: active_count--; /* Remove non-match possibility */ 2581: next_active_state--; 2582: } 2583: ADD_NEW(state_offset, 0); 2584: } 2585: break; 2586: 2587: case OP_CRPLUS: 2588: case OP_CRMINPLUS: 2589: case OP_CRPOSPLUS: 2590: count = current_state->count; /* Already matched */ 2591: if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); } 2592: if (isinclass) 2593: { 2594: if (count > 0 && *ecode == OP_CRPOSPLUS) 2595: { 2596: active_count--; /* Remove non-match possibility */ 2597: next_active_state--; 2598: } 2599: count++; 2600: ADD_NEW(state_offset, count); 2601: } 2602: break; 2603: 2604: case OP_CRQUERY: 2605: case OP_CRMINQUERY: 2606: case OP_CRPOSQUERY: 2607: ADD_ACTIVE(next_state_offset + 1, 0); 2608: if (isinclass) 2609: { 2610: if (*ecode == OP_CRPOSQUERY) 2611: { 2612: active_count--; /* Remove non-match possibility */ 2613: next_active_state--; 2614: } 2615: ADD_NEW(next_state_offset + 1, 0); 2616: } 2617: break; 2618: 2619: case OP_CRRANGE: 2620: case OP_CRMINRANGE: 2621: case OP_CRPOSRANGE: 2622: count = current_state->count; /* Already matched */ 2623: if (count >= (int)GET2(ecode, 1)) 2624: { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); } 2625: if (isinclass) 2626: { 2627: int max = (int)GET2(ecode, 1 + IMM2_SIZE); 2628: if (*ecode == OP_CRPOSRANGE) 2629: { 2630: active_count--; /* Remove non-match possibility */ 2631: next_active_state--; 2632: } 2633: if (++count >= max && max != 0) /* Max 0 => no limit */ 2634: { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); } 2635: else 2636: { ADD_NEW(state_offset, count); } 2637: } 2638: break; 2639: 2640: default: 2641: if (isinclass) { ADD_NEW(next_state_offset, 0); } 2642: break; 2643: } 2644: } 2645: break; 
2646: 2647: /* ========================================================================== */ 2648: /* These are the opcodes for fancy brackets of various kinds. We have 2649: to use recursion in order to handle them. The "always failing" assertion 2650: (?!) is optimised to OP_FAIL when compiling, so we have to support that, 2651: though the other "backtracking verbs" are not supported. */ 2652: 2653: case OP_FAIL: 2654: forced_fail++; /* Count FAILs for multiple states */ 2655: break; 2656: 2657: case OP_ASSERT: 2658: case OP_ASSERT_NOT: 2659: case OP_ASSERTBACK: 2660: case OP_ASSERTBACK_NOT: 2661: { 2662: int rc; 2663: int local_offsets[2]; 2664: int local_workspace[1000]; 2665: const pcre_uchar *endasscode = code + GET(code, 1); 2666: 2667: while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); 2668: 2669: rc = internal_dfa_exec( 2670: md, /* static match data */ 2671: code, /* this subexpression's code */ 2672: ptr, /* where we currently are */ 2673: (int)(ptr - start_subject), /* start offset */ 2674: local_offsets, /* offset vector */ 2675: sizeof(local_offsets)/sizeof(int), /* size of same */ 2676: local_workspace, /* workspace vector */ 2677: sizeof(local_workspace)/sizeof(int), /* size of same */ 2678: rlevel); /* function recursion level */ 2679: 2680: if (rc == PCRE_ERROR_DFA_UITEM) return rc; 2681: if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK)) 2682: { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); } 2683: } 2684: break; 2685: 2686: /*-----------------------------------------------------------------*/ 2687: case OP_COND: 2688: case OP_SCOND: 2689: { 2690: int local_offsets[1000]; 2691: int local_workspace[1000]; 2692: int codelink = GET(code, 1); 2693: int condcode; 2694: 2695: /* Because of the way auto-callout works during compile, a callout item 2696: is inserted between OP_COND and an assertion condition. This does not 2697: happen for the other conditions. 
*/ 2698: 2699: if (code[LINK_SIZE+1] == OP_CALLOUT) 2700: { 2701: rrc = 0; 2702: if (PUBL(callout) != NULL) 2703: { 2704: PUBL(callout_block) cb; 2705: cb.version = 1; /* Version 1 of the callout block */ 2706: cb.callout_number = code[LINK_SIZE+2]; 2707: cb.offset_vector = offsets; 2708: #if defined COMPILE_PCRE8 2709: cb.subject = (PCRE_SPTR)start_subject; 2710: #elif defined COMPILE_PCRE16 2711: cb.subject = (PCRE_SPTR16)start_subject; 2712: #elif defined COMPILE_PCRE32 2713: cb.subject = (PCRE_SPTR32)start_subject; 2714: #endif 2715: cb.subject_length = (int)(end_subject - start_subject); 2716: cb.start_match = (int)(current_subject - start_subject); 2717: cb.current_position = (int)(ptr - start_subject); 2718: cb.pattern_position = GET(code, LINK_SIZE + 3); 2719: cb.next_item_length = GET(code, 3 + 2*LINK_SIZE); 2720: cb.capture_top = 1; 2721: cb.capture_last = -1; 2722: cb.callout_data = md->callout_data; 2723: cb.mark = NULL; /* No (*MARK) support */ 2724: if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */ 2725: } 2726: if (rrc > 0) break; /* Fail this thread */ 2727: code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */ 2728: } 2729: 2730: condcode = code[LINK_SIZE+1]; 2731: 2732: /* Back reference conditions and duplicate named recursion conditions 2733: are not supported */ 2734: 2735: if (condcode == OP_CREF || condcode == OP_DNCREF || 2736: condcode == OP_DNRREF) 2737: return PCRE_ERROR_DFA_UCOND; 2738: 2739: /* The DEFINE condition is always false */ 2740: 2741: if (condcode == OP_DEF) 2742: { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } 2743: 2744: /* The only supported version of OP_RREF is for the value RREF_ANY, 2745: which means "test if in any recursion". We can't test for specifically 2746: recursed groups. 
*/ 2747: 2748: else if (condcode == OP_RREF) 2749: { 2750: int value = GET2(code, LINK_SIZE + 2); 2751: if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND; 2752: if (md->recursive != NULL) 2753: { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); } 2754: else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } 2755: } 2756: 2757: /* Otherwise, the condition is an assertion */ 2758: 2759: else 2760: { 2761: int rc; 2762: const pcre_uchar *asscode = code + LINK_SIZE + 1; 2763: const pcre_uchar *endasscode = asscode + GET(asscode, 1); 2764: 2765: while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); 2766: 2767: rc = internal_dfa_exec( 2768: md, /* fixed match data */ 2769: asscode, /* this subexpression's code */ 2770: ptr, /* where we currently are */ 2771: (int)(ptr - start_subject), /* start offset */ 2772: local_offsets, /* offset vector */ 2773: sizeof(local_offsets)/sizeof(int), /* size of same */ 2774: local_workspace, /* workspace vector */ 2775: sizeof(local_workspace)/sizeof(int), /* size of same */ 2776: rlevel); /* function recursion level */ 2777: 2778: if (rc == PCRE_ERROR_DFA_UITEM) return rc; 2779: if ((rc >= 0) == 2780: (condcode == OP_ASSERT || condcode == OP_ASSERTBACK)) 2781: { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); } 2782: else 2783: { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } 2784: } 2785: } 2786: break; 2787: 2788: /*-----------------------------------------------------------------*/ 2789: case OP_RECURSE: 2790: { 2791: dfa_recursion_info *ri; 2792: int local_offsets[1000]; 2793: int local_workspace[1000]; 2794: const pcre_uchar *callpat = start_code + GET(code, 1); 2795: int recno = (callpat == md->start_code)? 0 : 2796: GET2(callpat, 1 + LINK_SIZE); 2797: int rc; 2798: 2799: DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP)); 2800: 2801: /* Check for repeating a recursion without advancing the subject 2802: pointer. This should catch convoluted mutual recursions. 
(Some simple 2803: cases are caught at compile time.) */ 2804: 2805: for (ri = md->recursive; ri != NULL; ri = ri->prevrec) 2806: if (recno == ri->group_num && ptr == ri->subject_position) 2807: return PCRE_ERROR_RECURSELOOP; 2808: 2809: /* Remember this recursion and where we started it so as to 2810: catch infinite loops. */ 2811: 2812: new_recursive.group_num = recno; 2813: new_recursive.subject_position = ptr; 2814: new_recursive.prevrec = md->recursive; 2815: md->recursive = &new_recursive; 2816: 2817: rc = internal_dfa_exec( 2818: md, /* fixed match data */ 2819: callpat, /* this subexpression's code */ 2820: ptr, /* where we currently are */ 2821: (int)(ptr - start_subject), /* start offset */ 2822: local_offsets, /* offset vector */ 2823: sizeof(local_offsets)/sizeof(int), /* size of same */ 2824: local_workspace, /* workspace vector */ 2825: sizeof(local_workspace)/sizeof(int), /* size of same */ 2826: rlevel); /* function recursion level */ 2827: 2828: md->recursive = new_recursive.prevrec; /* Done this recursion */ 2829: 2830: DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP, 2831: rc)); 2832: 2833: /* Ran out of internal offsets */ 2834: 2835: if (rc == 0) return PCRE_ERROR_DFA_RECURSE; 2836: 2837: /* For each successful matched substring, set up the next state with a 2838: count of characters to skip before trying it. Note that the count is in 2839: characters, not bytes. 
*/ 2840: 2841: if (rc > 0) 2842: { 2843: for (rc = rc*2 - 2; rc >= 0; rc -= 2) 2844: { 2845: int charcount = local_offsets[rc+1] - local_offsets[rc]; 2846: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32 2847: if (utf) 2848: { 2849: const pcre_uchar *p = start_subject + local_offsets[rc]; 2850: const pcre_uchar *pp = start_subject + local_offsets[rc+1]; 2851: while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; 2852: } 2853: #endif 2854: if (charcount > 0) 2855: { 2856: ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1)); 2857: } 2858: else 2859: { 2860: ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0); 2861: } 2862: } 2863: } 2864: else if (rc != PCRE_ERROR_NOMATCH) return rc; 2865: } 2866: break; 2867: 2868: /*-----------------------------------------------------------------*/ 2869: case OP_BRAPOS: 2870: case OP_SBRAPOS: 2871: case OP_CBRAPOS: 2872: case OP_SCBRAPOS: 2873: case OP_BRAPOSZERO: 2874: { 2875: int charcount, matched_count; 2876: const pcre_uchar *local_ptr = ptr; 2877: BOOL allow_zero; 2878: 2879: if (codevalue == OP_BRAPOSZERO) 2880: { 2881: allow_zero = TRUE; 2882: codevalue = *(++code); /* Codevalue will be one of above BRAs */ 2883: } 2884: else allow_zero = FALSE; 2885: 2886: /* Loop to match the subpattern as many times as possible as if it were 2887: a complete pattern. 
*/ 2888: 2889: for (matched_count = 0;; matched_count++) 2890: { 2891: int local_offsets[2]; 2892: int local_workspace[1000]; 2893: 2894: int rc = internal_dfa_exec( 2895: md, /* fixed match data */ 2896: code, /* this subexpression's code */ 2897: local_ptr, /* where we currently are */ 2898: (int)(ptr - start_subject), /* start offset */ 2899: local_offsets, /* offset vector */ 2900: sizeof(local_offsets)/sizeof(int), /* size of same */ 2901: local_workspace, /* workspace vector */ 2902: sizeof(local_workspace)/sizeof(int), /* size of same */ 2903: rlevel); /* function recursion level */ 2904: 2905: /* Failed to match */ 2906: 2907: if (rc < 0) 2908: { 2909: if (rc != PCRE_ERROR_NOMATCH) return rc; 2910: break; 2911: } 2912: 2913: /* Matched: break the loop if zero characters matched. */ 2914: 2915: charcount = local_offsets[1] - local_offsets[0]; 2916: if (charcount == 0) break; 2917: local_ptr += charcount; /* Advance temporary position ptr */ 2918: } 2919: 2920: /* At this point we have matched the subpattern matched_count 2921: times, and local_ptr is pointing to the character after the end of the 2922: last match. */ 2923: 2924: if (matched_count > 0 || allow_zero) 2925: { 2926: const pcre_uchar *end_subpattern = code; 2927: int next_state_offset; 2928: 2929: do { end_subpattern += GET(end_subpattern, 1); } 2930: while (*end_subpattern == OP_ALT); 2931: next_state_offset = 2932: (int)(end_subpattern - start_code + LINK_SIZE + 1); 2933: 2934: /* Optimization: if there are no more active states, and there 2935: are no new states yet set up, then skip over the subject string 2936: right here, to save looping. Otherwise, set up the new state to swing 2937: into action when the end of the matched substring is reached. 
*/ 2938: 2939: if (i + 1 >= active_count && new_count == 0) 2940: { 2941: ptr = local_ptr; 2942: clen = 0; 2943: ADD_NEW(next_state_offset, 0); 2944: } 2945: else 2946: { 2947: const pcre_uchar *p = ptr; 2948: const pcre_uchar *pp = local_ptr; 2949: charcount = (int)(pp - p); 2950: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32 2951: if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; 2952: #endif 2953: ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1)); 2954: } 2955: } 2956: } 2957: break; 2958: 2959: /*-----------------------------------------------------------------*/ 2960: case OP_ONCE: 2961: case OP_ONCE_NC: 2962: { 2963: int local_offsets[2]; 2964: int local_workspace[1000]; 2965: 2966: int rc = internal_dfa_exec( 2967: md, /* fixed match data */ 2968: code, /* this subexpression's code */ 2969: ptr, /* where we currently are */ 2970: (int)(ptr - start_subject), /* start offset */ 2971: local_offsets, /* offset vector */ 2972: sizeof(local_offsets)/sizeof(int), /* size of same */ 2973: local_workspace, /* workspace vector */ 2974: sizeof(local_workspace)/sizeof(int), /* size of same */ 2975: rlevel); /* function recursion level */ 2976: 2977: if (rc >= 0) 2978: { 2979: const pcre_uchar *end_subpattern = code; 2980: int charcount = local_offsets[1] - local_offsets[0]; 2981: int next_state_offset, repeat_state_offset; 2982: 2983: do { end_subpattern += GET(end_subpattern, 1); } 2984: while (*end_subpattern == OP_ALT); 2985: next_state_offset = 2986: (int)(end_subpattern - start_code + LINK_SIZE + 1); 2987: 2988: /* If the end of this subpattern is KETRMAX or KETRMIN, we must 2989: arrange for the repeat state also to be added to the relevant list. 2990: Calculate the offset, or set -1 for no repeat. */ 2991: 2992: repeat_state_offset = (*end_subpattern == OP_KETRMAX || 2993: *end_subpattern == OP_KETRMIN)? 
2994: (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1; 2995: 2996: /* If we have matched an empty string, add the next state at the 2997: current character pointer. This is important so that the duplicate 2998: checking kicks in, which is what breaks infinite loops that match an 2999: empty string. */ 3000: 3001: if (charcount == 0) 3002: { 3003: ADD_ACTIVE(next_state_offset, 0); 3004: } 3005: 3006: /* Optimization: if there are no more active states, and there 3007: are no new states yet set up, then skip over the subject string 3008: right here, to save looping. Otherwise, set up the new state to swing 3009: into action when the end of the matched substring is reached. */ 3010: 3011: else if (i + 1 >= active_count && new_count == 0) 3012: { 3013: ptr += charcount; 3014: clen = 0; 3015: ADD_NEW(next_state_offset, 0); 3016: 3017: /* If we are adding a repeat state at the new character position, 3018: we must fudge things so that it is the only current state. 3019: Otherwise, it might be a duplicate of one we processed before, and 3020: that would cause it to be skipped. 
*/ 3021: 3022: if (repeat_state_offset >= 0) 3023: { 3024: next_active_state = active_states; 3025: active_count = 0; 3026: i = -1; 3027: ADD_ACTIVE(repeat_state_offset, 0); 3028: } 3029: } 3030: else 3031: { 3032: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32 3033: if (utf) 3034: { 3035: const pcre_uchar *p = start_subject + local_offsets[0]; 3036: const pcre_uchar *pp = start_subject + local_offsets[1]; 3037: while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; 3038: } 3039: #endif 3040: ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1)); 3041: if (repeat_state_offset >= 0) 3042: { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); } 3043: } 3044: } 3045: else if (rc != PCRE_ERROR_NOMATCH) return rc; 3046: } 3047: break; 3048: 3049: 3050: /* ========================================================================== */ 3051: /* Handle callouts */ 3052: 3053: case OP_CALLOUT: 3054: rrc = 0; 3055: if (PUBL(callout) != NULL) 3056: { 3057: PUBL(callout_block) cb; 3058: cb.version = 1; /* Version 1 of the callout block */ 3059: cb.callout_number = code[1]; 3060: cb.offset_vector = offsets; 3061: #if defined COMPILE_PCRE8 3062: cb.subject = (PCRE_SPTR)start_subject; 3063: #elif defined COMPILE_PCRE16 3064: cb.subject = (PCRE_SPTR16)start_subject; 3065: #elif defined COMPILE_PCRE32 3066: cb.subject = (PCRE_SPTR32)start_subject; 3067: #endif 3068: cb.subject_length = (int)(end_subject - start_subject); 3069: cb.start_match = (int)(current_subject - start_subject); 3070: cb.current_position = (int)(ptr - start_subject); 3071: cb.pattern_position = GET(code, 2); 3072: cb.next_item_length = GET(code, 2 + LINK_SIZE); 3073: cb.capture_top = 1; 3074: cb.capture_last = -1; 3075: cb.callout_data = md->callout_data; 3076: cb.mark = NULL; /* No (*MARK) support */ 3077: if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */ 3078: } 3079: if (rrc == 0) 3080: { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); } 3081: break; 3082: 3083: 3084: /* 
========================================================================== */ 3085: default: /* Unsupported opcode */ 3086: return PCRE_ERROR_DFA_UITEM; 3087: } 3088: 3089: NEXT_ACTIVE_STATE: continue; 3090: 3091: } /* End of loop scanning active states */ 3092: 3093: /* We have finished the processing at the current subject character. If no 3094: new states have been set for the next character, we have found all the 3095: matches that we are going to find. If we are at the top level and partial 3096: matching has been requested, check for appropriate conditions. 3097: 3098: The "forced_fail" variable counts the number of (*F) encountered for the 3099: character. If it is equal to the original active_count (saved in 3100: workspace[1]) it means that (*F) was found on every active state. In this 3101: case we don't want to give a partial match. 3102: 3103: The "could_continue" variable is true if a state could have continued but 3104: for the fact that the end of the subject was reached. */ 3105: 3106: if (new_count <= 0) 3107: { 3108: if (rlevel == 1 && /* Top level, and */ 3109: could_continue && /* Some could go on, and */ 3110: forced_fail != workspace[1] && /* Not all forced fail & */ 3111: ( /* either... */ 3112: (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */ 3113: || /* or... */ 3114: ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */ 3115: match_count < 0) /* no matches */ 3116: ) && /* And... */ 3117: ( 3118: partial_newline || /* Either partial NL */ 3119: ( /* or ... 
*/ 3120: ptr >= end_subject && /* End of subject and */ 3121: ptr > md->start_used_ptr) /* Inspected non-empty string */ 3122: ) 3123: ) 3124: match_count = PCRE_ERROR_PARTIAL; 3125: DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" 3126: "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count, 3127: rlevel*2-2, SP)); 3128: break; /* In effect, "return", but see the comment below */ 3129: } 3130: 3131: /* One or more states are active for the next character. */ 3132: 3133: ptr += clen; /* Advance to next subject character */ 3134: } /* Loop to move along the subject string */ 3135: 3136: /* Control gets here from "break" a few lines above. We do it this way because 3137: if we use "return" above, we have compiler trouble. Some compilers warn if 3138: there's nothing here because they think the function doesn't return a value. On 3139: the other hand, if we put a dummy statement here, some more clever compilers 3140: complain that it can't be reached. Sigh. */ 3141: 3142: return match_count; 3143: } 3144: 3145: 3146: 3147: 3148: /************************************************* 3149: * Execute a Regular Expression - DFA engine * 3150: *************************************************/ 3151: 3152: /* This external function applies a compiled re to a subject string using a DFA 3153: engine. This function calls the internal function multiple times if the pattern 3154: is not anchored. 
3155: 3156: Arguments: 3157: argument_re points to the compiled expression 3158: extra_data points to extra data or is NULL 3159: subject points to the subject string 3160: length length of subject string (may contain binary zeros) 3161: start_offset where to start in the subject string 3162: options option bits 3163: offsets vector of match offsets 3164: offsetcount size of same 3165: workspace workspace vector 3166: wscount size of same 3167: 3168: Returns: > 0 => number of match offset pairs placed in offsets 3169: = 0 => offsets overflowed; longest matches are present 3170: -1 => failed to match 3171: < -1 => some kind of unexpected problem 3172: */ 3173: 3174: #if defined COMPILE_PCRE8 3175: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION 3176: pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data, 3177: const char *subject, int length, int start_offset, int options, int *offsets, 3178: int offsetcount, int *workspace, int wscount) 3179: #elif defined COMPILE_PCRE16 3180: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION 3181: pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data, 3182: PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets, 3183: int offsetcount, int *workspace, int wscount) 3184: #elif defined COMPILE_PCRE32 3185: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION 3186: pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data, 3187: PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets, 3188: int offsetcount, int *workspace, int wscount) 3189: #endif 3190: { 3191: REAL_PCRE *re = (REAL_PCRE *)argument_re; 3192: dfa_match_data match_block; 3193: dfa_match_data *md = &match_block; 3194: BOOL utf, anchored, startline, firstline; 3195: const pcre_uchar *current_subject, *end_subject; 3196: const pcre_study_data *study = NULL; 3197: 3198: const pcre_uchar *req_char_ptr; 3199: const pcre_uint8 *start_bits = NULL; 3200: BOOL has_first_char = FALSE; 3201: BOOL has_req_char = FALSE; 
3202: pcre_uchar first_char = 0; 3203: pcre_uchar first_char2 = 0; 3204: pcre_uchar req_char = 0; 3205: pcre_uchar req_char2 = 0; 3206: int newline; 3207: 3208: /* Plausibility checks */ 3209: 3210: if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; 3211: if (re == NULL || subject == NULL || workspace == NULL || 3212: (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; 3213: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; 3214: if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE; 3215: if (length < 0) return PCRE_ERROR_BADLENGTH; 3216: if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET; 3217: 3218: /* Check that the first field in the block is the magic number. If it is not, 3219: return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to 3220: REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which 3221: means that the pattern is likely compiled with different endianness. */ 3222: 3223: if (re->magic_number != MAGIC_NUMBER) 3224: return re->magic_number == REVERSED_MAGIC_NUMBER? 3225: PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC; 3226: if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE; 3227: 3228: /* If restarting after a partial match, do some sanity checks on the contents 3229: of the workspace. 
*/ 3230: 3231: if ((options & PCRE_DFA_RESTART) != 0) 3232: { 3233: if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 || 3234: workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK) 3235: return PCRE_ERROR_DFA_BADRESTART; 3236: } 3237: 3238: /* Set up study, callout, and table data */ 3239: 3240: md->tables = re->tables; 3241: md->callout_data = NULL; 3242: 3243: if (extra_data != NULL) 3244: { 3245: unsigned int flags = extra_data->flags; 3246: if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) 3247: study = (const pcre_study_data *)extra_data->study_data; 3248: if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT; 3249: if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) 3250: return PCRE_ERROR_DFA_UMLIMIT; 3251: if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) 3252: md->callout_data = extra_data->callout_data; 3253: if ((flags & PCRE_EXTRA_TABLES) != 0) 3254: md->tables = extra_data->tables; 3255: } 3256: 3257: /* Set some local values */ 3258: 3259: current_subject = (const pcre_uchar *)subject + start_offset; 3260: end_subject = (const pcre_uchar *)subject + length; 3261: req_char_ptr = current_subject - 1; 3262: 3263: #ifdef SUPPORT_UTF 3264: /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */ 3265: utf = (re->options & PCRE_UTF8) != 0; 3266: #else 3267: utf = FALSE; 3268: #endif 3269: 3270: anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 || 3271: (re->options & PCRE_ANCHORED) != 0; 3272: 3273: /* The remaining fixed data for passing around. */ 3274: 3275: md->start_code = (const pcre_uchar *)argument_re + 3276: re->name_table_offset + re->name_count * re->name_entry_size; 3277: md->start_subject = (const pcre_uchar *)subject; 3278: md->end_subject = end_subject; 3279: md->start_offset = start_offset; 3280: md->moptions = options; 3281: md->poptions = re->options; 3282: 3283: /* If the BSR option is not set at match time, copy what was set 3284: at compile time. 
*/ 3285: 3286: if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0) 3287: { 3288: if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0) 3289: md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE); 3290: #ifdef BSR_ANYCRLF 3291: else md->moptions |= PCRE_BSR_ANYCRLF; 3292: #endif 3293: } 3294: 3295: /* Handle different types of newline. The three bits give eight cases. If 3296: nothing is set at run time, whatever was used at compile time applies. */ 3297: 3298: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) & 3299: PCRE_NEWLINE_BITS) 3300: { 3301: case 0: newline = NEWLINE; break; /* Compile-time default */ 3302: case PCRE_NEWLINE_CR: newline = CHAR_CR; break; 3303: case PCRE_NEWLINE_LF: newline = CHAR_NL; break; 3304: case PCRE_NEWLINE_CR+ 3305: PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break; 3306: case PCRE_NEWLINE_ANY: newline = -1; break; 3307: case PCRE_NEWLINE_ANYCRLF: newline = -2; break; 3308: default: return PCRE_ERROR_BADNEWLINE; 3309: } 3310: 3311: if (newline == -2) 3312: { 3313: md->nltype = NLTYPE_ANYCRLF; 3314: } 3315: else if (newline < 0) 3316: { 3317: md->nltype = NLTYPE_ANY; 3318: } 3319: else 3320: { 3321: md->nltype = NLTYPE_FIXED; 3322: if (newline > 255) 3323: { 3324: md->nllen = 2; 3325: md->nl[0] = (newline >> 8) & 255; 3326: md->nl[1] = newline & 255; 3327: } 3328: else 3329: { 3330: md->nllen = 1; 3331: md->nl[0] = newline; 3332: } 3333: } 3334: 3335: /* Check a UTF-8 string if required. Unfortunately there's no way of passing 3336: back the character offset. 
*/ 3337: 3338: #ifdef SUPPORT_UTF 3339: if (utf && (options & PCRE_NO_UTF8_CHECK) == 0) 3340: { 3341: int erroroffset; 3342: int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset); 3343: if (errorcode != 0) 3344: { 3345: if (offsetcount >= 2) 3346: { 3347: offsets[0] = erroroffset; 3348: offsets[1] = errorcode; 3349: } 3350: #if defined COMPILE_PCRE8 3351: return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ? 3352: PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8; 3353: #elif defined COMPILE_PCRE16 3354: return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ? 3355: PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16; 3356: #elif defined COMPILE_PCRE32 3357: return PCRE_ERROR_BADUTF32; 3358: #endif 3359: } 3360: #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16 3361: if (start_offset > 0 && start_offset < length && 3362: NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset])) 3363: return PCRE_ERROR_BADUTF8_OFFSET; 3364: #endif 3365: } 3366: #endif 3367: 3368: /* If the exec call supplied NULL for tables, use the inbuilt ones. This 3369: is a feature that makes it possible to save compiled regex and re-use them 3370: in other programs later. */ 3371: 3372: if (md->tables == NULL) md->tables = PRIV(default_tables); 3373: 3374: /* The "must be at the start of a line" flags are used in a loop when finding 3375: where to start. */ 3376: 3377: startline = (re->flags & PCRE_STARTLINE) != 0; 3378: firstline = (re->options & PCRE_FIRSTLINE) != 0; 3379: 3380: /* Set up the first character to match, if available. The first_byte value is 3381: never set for an anchored regular expression, but the anchoring may be forced 3382: at run time, so we have to test for anchoring. The first char may be unset for 3383: an unanchored pattern, of course. If there's no first char and the pattern was 3384: studied, there may be a bitmap of possible first characters. 
*/ 3385: 3386: if (!anchored) 3387: { 3388: if ((re->flags & PCRE_FIRSTSET) != 0) 3389: { 3390: has_first_char = TRUE; 3391: first_char = first_char2 = (pcre_uchar)(re->first_char); 3392: if ((re->flags & PCRE_FCH_CASELESS) != 0) 3393: { 3394: first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char); 3395: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) 3396: if (utf && first_char > 127) 3397: first_char2 = UCD_OTHERCASE(first_char); 3398: #endif 3399: } 3400: } 3401: else 3402: { 3403: if (!startline && study != NULL && 3404: (study->flags & PCRE_STUDY_MAPPED) != 0) 3405: start_bits = study->start_bits; 3406: } 3407: } 3408: 3409: /* For anchored or unanchored matches, there may be a "last known required 3410: character" set. */ 3411: 3412: if ((re->flags & PCRE_REQCHSET) != 0) 3413: { 3414: has_req_char = TRUE; 3415: req_char = req_char2 = (pcre_uchar)(re->req_char); 3416: if ((re->flags & PCRE_RCH_CASELESS) != 0) 3417: { 3418: req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char); 3419: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) 3420: if (utf && req_char > 127) 3421: req_char2 = UCD_OTHERCASE(req_char); 3422: #endif 3423: } 3424: } 3425: 3426: /* Call the main matching function, looping for a non-anchored regex after a 3427: failed match. If not restarting, perform certain optimizations at the start of 3428: a match. */ 3429: 3430: for (;;) 3431: { 3432: int rc; 3433: 3434: if ((options & PCRE_DFA_RESTART) == 0) 3435: { 3436: const pcre_uchar *save_end_subject = end_subject; 3437: 3438: /* If firstline is TRUE, the start of the match is constrained to the first 3439: line of a multiline string. Implement this by temporarily adjusting 3440: end_subject so that we stop scanning at a newline. If the match fails at 3441: the newline, later code breaks this loop. 
*/ 3442: 3443: if (firstline) 3444: { 3445: PCRE_PUCHAR t = current_subject; 3446: #ifdef SUPPORT_UTF 3447: if (utf) 3448: { 3449: while (t < md->end_subject && !IS_NEWLINE(t)) 3450: { 3451: t++; 3452: ACROSSCHAR(t < end_subject, *t, t++); 3453: } 3454: } 3455: else 3456: #endif 3457: while (t < md->end_subject && !IS_NEWLINE(t)) t++; 3458: end_subject = t; 3459: } 3460: 3461: /* There are some optimizations that avoid running the match if a known 3462: starting point is not found. However, there is an option that disables 3463: these, for testing and for ensuring that all callouts do actually occur. 3464: The option can be set in the regex by (*NO_START_OPT) or passed in 3465: match-time options. */ 3466: 3467: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0) 3468: { 3469: /* Advance to a known first char. */ 3470: 3471: if (has_first_char) 3472: { 3473: if (first_char != first_char2) 3474: { 3475: pcre_uchar csc; 3476: while (current_subject < end_subject && 3477: (csc = RAWUCHARTEST(current_subject)) != first_char && csc != first_char2) 3478: current_subject++; 3479: } 3480: else 3481: while (current_subject < end_subject && 3482: RAWUCHARTEST(current_subject) != first_char) 3483: current_subject++; 3484: } 3485: 3486: /* Or to just after a linebreak for a multiline match if possible */ 3487: 3488: else if (startline) 3489: { 3490: if (current_subject > md->start_subject + start_offset) 3491: { 3492: #ifdef SUPPORT_UTF 3493: if (utf) 3494: { 3495: while (current_subject < end_subject && 3496: !WAS_NEWLINE(current_subject)) 3497: { 3498: current_subject++; 3499: ACROSSCHAR(current_subject < end_subject, *current_subject, 3500: current_subject++); 3501: } 3502: } 3503: else 3504: #endif 3505: while (current_subject < end_subject && !WAS_NEWLINE(current_subject)) 3506: current_subject++; 3507: 3508: /* If we have just passed a CR and the newline option is ANY or 3509: ANYCRLF, and we are now at a LF, advance the match position by one 3510: more 
character. */ 3511: 3512: if (RAWUCHARTEST(current_subject - 1) == CHAR_CR && 3513: (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && 3514: current_subject < end_subject && 3515: RAWUCHARTEST(current_subject) == CHAR_NL) 3516: current_subject++; 3517: } 3518: } 3519: 3520: /* Or to a non-unique first char after study */ 3521: 3522: else if (start_bits != NULL) 3523: { 3524: while (current_subject < end_subject) 3525: { 3526: register pcre_uint32 c = RAWUCHARTEST(current_subject); 3527: #ifndef COMPILE_PCRE8 3528: if (c > 255) c = 255; 3529: #endif 3530: if ((start_bits[c/8] & (1 << (c&7))) == 0) 3531: { 3532: current_subject++; 3533: #if defined SUPPORT_UTF && defined COMPILE_PCRE8 3534: /* In non 8-bit mode, the iteration will stop for 3535: characters > 255 at the beginning or not stop at all. */ 3536: if (utf) 3537: ACROSSCHAR(current_subject < end_subject, *current_subject, 3538: current_subject++); 3539: #endif 3540: } 3541: else break; 3542: } 3543: } 3544: } 3545: 3546: /* Restore fudged end_subject */ 3547: 3548: end_subject = save_end_subject; 3549: 3550: /* The following two optimizations are disabled for partial matching or if 3551: disabling is explicitly requested (and of course, by the test above, this 3552: code is not obeyed when restarting after a partial match). */ 3553: 3554: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && 3555: (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0) 3556: { 3557: /* If the pattern was studied, a minimum subject length may be set. This 3558: is a lower bound; no actual string of that length may actually match the 3559: pattern. Although the value is, strictly, in characters, we treat it as 3560: bytes to avoid spending too much time in this optimization. 
*/ 3561: 3562: if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 && 3563: (pcre_uint32)(end_subject - current_subject) < study->minlength) 3564: return PCRE_ERROR_NOMATCH; 3565: 3566: /* If req_char is set, we know that that character must appear in the 3567: subject for the match to succeed. If the first character is set, req_char 3568: must be later in the subject; otherwise the test starts at the match 3569: point. This optimization can save a huge amount of work in patterns with 3570: nested unlimited repeats that aren't going to match. Writing separate 3571: code for cased/caseless versions makes it go faster, as does using an 3572: autoincrement and backing off on a match. 3573: 3574: HOWEVER: when the subject string is very, very long, searching to its end 3575: can take a long time, and give bad performance on quite ordinary 3576: patterns. This showed up when somebody was matching /^C/ on a 32-megabyte 3577: string... so we don't do this when the string is sufficiently long. */ 3578: 3579: if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX) 3580: { 3581: register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0); 3582: 3583: /* We don't need to repeat the search if we haven't yet reached the 3584: place we found it at last time. */ 3585: 3586: if (p > req_char_ptr) 3587: { 3588: if (req_char != req_char2) 3589: { 3590: while (p < end_subject) 3591: { 3592: register pcre_uint32 pp = RAWUCHARINCTEST(p); 3593: if (pp == req_char || pp == req_char2) { p--; break; } 3594: } 3595: } 3596: else 3597: { 3598: while (p < end_subject) 3599: { 3600: if (RAWUCHARINCTEST(p) == req_char) { p--; break; } 3601: } 3602: } 3603: 3604: /* If we can't find the required character, break the matching loop, 3605: which will cause a return or PCRE_ERROR_NOMATCH. 
*/ 3606: 3607: if (p >= end_subject) break; 3608: 3609: /* If we have found the required character, save the point where we 3610: found it, so that we don't search again next time round the loop if 3611: the start hasn't passed this character yet. */ 3612: 3613: req_char_ptr = p; 3614: } 3615: } 3616: } 3617: } /* End of optimizations that are done when not restarting */ 3618: 3619: /* OK, now we can do the business */ 3620: 3621: md->start_used_ptr = current_subject; 3622: md->recursive = NULL; 3623: 3624: rc = internal_dfa_exec( 3625: md, /* fixed match data */ 3626: md->start_code, /* this subexpression's code */ 3627: current_subject, /* where we currently are */ 3628: start_offset, /* start offset in subject */ 3629: offsets, /* offset vector */ 3630: offsetcount, /* size of same */ 3631: workspace, /* workspace vector */ 3632: wscount, /* size of same */ 3633: 0); /* function recurse level */ 3634: 3635: /* Anything other than "no match" means we are done, always; otherwise, carry 3636: on only if not anchored. */ 3637: 3638: if (rc != PCRE_ERROR_NOMATCH || anchored) 3639: { 3640: if (rc == PCRE_ERROR_PARTIAL && offsetcount >= 2) 3641: { 3642: offsets[0] = (int)(md->start_used_ptr - (PCRE_PUCHAR)subject); 3643: offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject); 3644: if (offsetcount > 2) 3645: offsets[2] = (int)(current_subject - (PCRE_PUCHAR)subject); 3646: } 3647: return rc; 3648: } 3649: 3650: /* Advance to the next subject character unless we are at the end of a line 3651: and firstline is set. 
*/ 3652: 3653: if (firstline && IS_NEWLINE(current_subject)) break; 3654: current_subject++; 3655: #ifdef SUPPORT_UTF 3656: if (utf) 3657: { 3658: ACROSSCHAR(current_subject < end_subject, *current_subject, 3659: current_subject++); 3660: } 3661: #endif 3662: if (current_subject > end_subject) break; 3663: 3664: /* If we have just passed a CR and we are now at a LF, and the pattern does 3665: not contain any explicit matches for \r or \n, and the newline option is CRLF 3666: or ANY or ANYCRLF, advance the match position by one more character. */ 3667: 3668: if (RAWUCHARTEST(current_subject - 1) == CHAR_CR && 3669: current_subject < end_subject && 3670: RAWUCHARTEST(current_subject) == CHAR_NL && 3671: (re->flags & PCRE_HASCRORLF) == 0 && 3672: (md->nltype == NLTYPE_ANY || 3673: md->nltype == NLTYPE_ANYCRLF || 3674: md->nllen == 2)) 3675: current_subject++; 3676: 3677: } /* "Bumpalong" loop */ 3678: 3679: return PCRE_ERROR_NOMATCH; 3680: } 3681: 3682: /* End of pcre_dfa_exec.c */