embedaddon/pcre/pcre_dfa_exec.c - view

File: [ELWIX - Embedded LightWeight unIX -] / embedaddon / pcre / pcre_dfa_exec.c
Revision 1.1.1.4 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Mon Jul 22 08:25:55 2013 UTC (11 years ago) by misho
Branches: pcre, MAIN
CVS tags: v8_33, HEAD

8.33

1: /************************************************* 2: * Perl-Compatible Regular Expressions * 3: *************************************************/ 4: 5: /* PCRE is a library of functions to support regular expressions whose syntax 6: and semantics are as close as possible to those of the Perl 5 language (but see 7: below for why this module is different). 8: 9: Written by Philip Hazel 10: Copyright (c) 1997-2013 University of Cambridge 11: 12: ----------------------------------------------------------------------------- 13: Redistribution and use in source and binary forms, with or without 14: modification, are permitted provided that the following conditions are met: 15: 16: * Redistributions of source code must retain the above copyright notice, 17: this list of conditions and the following disclaimer. 18: 19: * Redistributions in binary form must reproduce the above copyright 20: notice, this list of conditions and the following disclaimer in the 21: documentation and/or other materials provided with the distribution. 22: 23: * Neither the name of the University of Cambridge nor the names of its 24: contributors may be used to endorse or promote products derived from 25: this software without specific prior written permission. 26: 27: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30: ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37: POSSIBILITY OF SUCH DAMAGE. 38: ----------------------------------------------------------------------------- 39: */ 40: 41: /* This module contains the external function pcre_dfa_exec(), which is an 42: alternative matching function that uses a sort of DFA algorithm (not a true 43: FSM). This is NOT Perl-compatible, but it has advantages in certain 44: applications. */ 45: 46: 47: /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved 48: the performance of his patterns greatly. I could not use it as it stood, as it 49: was not thread safe, and made assumptions about pattern sizes. Also, it caused 50: test 7 to loop, and test 9 to crash with a segfault. 51: 52: The issue is the check for duplicate states, which is done by a simple linear 53: search up the state list. (Grep for "duplicate" below to find the code.) For 54: many patterns, there will never be many states active at one time, so a simple 55: linear search is fine. In patterns that have many active states, it might be a 56: bottleneck. The suggested code used an indexing scheme to remember which states 57: had previously been used for each character, and avoided the linear search when 58: it knew there was no chance of a duplicate. This was implemented when adding 59: states to the state lists. 
60: 61: I wrote some thread-safe, not-limited code to try something similar at the time 62: of checking for duplicates (instead of when adding states), using index vectors 63: on the stack. It did give a 13% improvement with one specially constructed 64: pattern for certain subject strings, but on other strings and on many of the 65: simpler patterns in the test suite it did worse. The major problem, I think, 66: was the extra time to initialize the index. This had to be done for each call 67: of internal_dfa_exec(). (The supplied patch used a static vector, initialized 68: only once - I suspect this was the cause of the problems with the tests.) 69: 70: Overall, I concluded that the gains in some cases did not outweigh the losses 71: in others, so I abandoned this code. */ 72: 73: 74: 75: #ifdef HAVE_CONFIG_H 76: #include "config.h" 77: #endif 78: 79: #define NLBLOCK md /* Block containing newline information */ 80: #define PSSTART start_subject /* Field containing processed string start */ 81: #define PSEND end_subject /* Field containing processed string end */ 82: 83: #include "pcre_internal.h" 84: 85: 86: /* For use to indent debugging output */ 87: 88: #define SP " " 89: 90: 91: /************************************************* 92: * Code parameters and static tables * 93: *************************************************/ 94: 95: /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes 96: into others, under special conditions. A gap of 20 between the blocks should be 97: enough. The resulting opcodes don't have to be less than 256 because they are 98: never stored, so we push them well clear of the normal opcodes. */ 99: 100: #define OP_PROP_EXTRA 300 101: #define OP_EXTUNI_EXTRA 320 102: #define OP_ANYNL_EXTRA 340 103: #define OP_HSPACE_EXTRA 360 104: #define OP_VSPACE_EXTRA 380 105: 106: 107: /* This table identifies those opcodes that are followed immediately by a 108: character that is to be tested in some way. 
This makes it possible to 109: centralize the loading of these characters. In the case of Type * etc, the 110: "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a 111: small value. Non-zero values in the table are the offsets from the opcode where 112: the character is to be found. ***NOTE*** If the start of this table is 113: modified, the three tables that follow must also be modified. */ 114: 115: static const pcre_uint8 coptable[] = { 116: 0, /* End */ 117: 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */ 118: 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */ 119: 0, 0, 0, /* Any, AllAny, Anybyte */ 120: 0, 0, /* \P, \p */ 121: 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */ 122: 0, /* \X */ 123: 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */ 124: 1, /* Char */ 125: 1, /* Chari */ 126: 1, /* not */ 127: 1, /* noti */ 128: /* Positive single-char repeats */ 129: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ 130: 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */ 131: 1+IMM2_SIZE, /* exact */ 132: 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */ 133: 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */ 134: 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */ 135: 1+IMM2_SIZE, /* exact I */ 136: 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */ 137: /* Negative single-char repeats - only for chars < 256 */ 138: 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */ 139: 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */ 140: 1+IMM2_SIZE, /* NOT exact */ 141: 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */ 142: 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */ 143: 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */ 144: 1+IMM2_SIZE, /* NOT exact I */ 145: 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */ 146: /* Positive type repeats */ 147: 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? 
*/ 148: 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */ 149: 1+IMM2_SIZE, /* Type exact */ 150: 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */ 151: /* Character class & ref repeats */ 152: 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */ 153: 0, 0, /* CRRANGE, CRMINRANGE */ 154: 0, /* CLASS */ 155: 0, /* NCLASS */ 156: 0, /* XCLASS - variable length */ 157: 0, /* REF */ 158: 0, /* REFI */ 159: 0, /* RECURSE */ 160: 0, /* CALLOUT */ 161: 0, /* Alt */ 162: 0, /* Ket */ 163: 0, /* KetRmax */ 164: 0, /* KetRmin */ 165: 0, /* KetRpos */ 166: 0, /* Reverse */ 167: 0, /* Assert */ 168: 0, /* Assert not */ 169: 0, /* Assert behind */ 170: 0, /* Assert behind not */ 171: 0, 0, /* ONCE, ONCE_NC */ 172: 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */ 173: 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */ 174: 0, 0, /* CREF, NCREF */ 175: 0, 0, /* RREF, NRREF */ 176: 0, /* DEF */ 177: 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ 178: 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */ 179: 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */ 180: 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */ 181: 0, 0 /* CLOSE, SKIPZERO */ 182: }; 183: 184: /* This table identifies those opcodes that inspect a character. It is used to 185: remember the fact that a character could have been inspected when the end of 186: the subject is reached. ***NOTE*** If the start of this table is modified, the 187: two tables that follow must also be modified. */ 188: 189: static const pcre_uint8 poptable[] = { 190: 0, /* End */ 191: 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */ 192: 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ 193: 1, 1, 1, /* Any, AllAny, Anybyte */ 194: 1, 1, /* \P, \p */ 195: 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ 196: 1, /* \X */ 197: 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */ 198: 1, /* Char */ 199: 1, /* Chari */ 200: 1, /* not */ 201: 1, /* noti */ 202: /* Positive single-char repeats */ 203: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? 
*/ 204: 1, 1, 1, /* upto, minupto, exact */ 205: 1, 1, 1, 1, /* *+, ++, ?+, upto+ */ 206: 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */ 207: 1, 1, 1, /* upto I, minupto I, exact I */ 208: 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */ 209: /* Negative single-char repeats - only for chars < 256 */ 210: 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */ 211: 1, 1, 1, /* NOT upto, minupto, exact */ 212: 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */ 213: 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */ 214: 1, 1, 1, /* NOT upto I, minupto I, exact I */ 215: 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */ 216: /* Positive type repeats */ 217: 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */ 218: 1, 1, 1, /* Type upto, minupto, exact */ 219: 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */ 220: /* Character class & ref repeats */ 221: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ 222: 1, 1, /* CRRANGE, CRMINRANGE */ 223: 1, /* CLASS */ 224: 1, /* NCLASS */ 225: 1, /* XCLASS - variable length */ 226: 0, /* REF */ 227: 0, /* REFI */ 228: 0, /* RECURSE */ 229: 0, /* CALLOUT */ 230: 0, /* Alt */ 231: 0, /* Ket */ 232: 0, /* KetRmax */ 233: 0, /* KetRmin */ 234: 0, /* KetRpos */ 235: 0, /* Reverse */ 236: 0, /* Assert */ 237: 0, /* Assert not */ 238: 0, /* Assert behind */ 239: 0, /* Assert behind not */ 240: 0, 0, /* ONCE, ONCE_NC */ 241: 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */ 242: 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */ 243: 0, 0, /* CREF, NCREF */ 244: 0, 0, /* RREF, NRREF */ 245: 0, /* DEF */ 246: 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ 247: 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */ 248: 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */ 249: 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */ 250: 0, 0 /* CLOSE, SKIPZERO */ 251: }; 252: 253: /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W, 254: and \w */ 255: 256: static const pcre_uint8 toptable1[] = { 257: 0, 0, 0, 0, 0, 0, 258: ctype_digit, ctype_digit, 259: ctype_space, 
ctype_space, 260: ctype_word, ctype_word, 261: 0, 0 /* OP_ANY, OP_ALLANY */ 262: }; 263: 264: static const pcre_uint8 toptable2[] = { 265: 0, 0, 0, 0, 0, 0, 266: ctype_digit, 0, 267: ctype_space, 0, 268: ctype_word, 0, 269: 1, 1 /* OP_ANY, OP_ALLANY */ 270: }; 271: 272: 273: /* Structure for holding data about a particular state, which is in effect the 274: current data for an active path through the match tree. It must consist 275: entirely of ints because the working vector we are passed, and which we put 276: these structures in, is a vector of ints. */ 277: 278: typedef struct stateblock { 279: int offset; /* Offset to opcode */ 280: int count; /* Count for repeats */ 281: int data; /* Some use extra data */ 282: } stateblock; 283: 284: #define INTS_PER_STATEBLOCK (int)(sizeof(stateblock)/sizeof(int)) 285: 286: 287: #ifdef PCRE_DEBUG 288: /************************************************* 289: * Print character string * 290: *************************************************/ 291: 292: /* Character string printing function for debugging. 293: 294: Arguments: 295: p points to string 296: length number of bytes 297: f where to print 298: 299: Returns: nothing 300: */ 301: 302: static void 303: pchars(const pcre_uchar *p, int length, FILE *f) 304: { 305: pcre_uint32 c; 306: while (length-- > 0) 307: { 308: if (isprint(c = *(p++))) 309: fprintf(f, "%c", c); 310: else 311: fprintf(f, "\\x{%02x}", c); 312: } 313: } 314: #endif 315: 316: 317: 318: /************************************************* 319: * Execute a Regular Expression - DFA engine * 320: *************************************************/ 321: 322: /* This internal function applies a compiled pattern to a subject string, 323: starting at a given point, using a DFA engine. This function is called from the 324: external one, possibly multiple times if the pattern is not anchored. The 325: function calls itself recursively for some kinds of subpattern. 
326: 327: Arguments: 328: md the match_data block with fixed information 329: this_start_code the opening bracket of this subexpression's code 330: current_subject where we currently are in the subject string 331: start_offset start offset in the subject string 332: offsets vector to contain the matching string offsets 333: offsetcount size of same 334: workspace vector of workspace 335: wscount size of same 336: rlevel function call recursion level 337: 338: Returns: > 0 => number of match offset pairs placed in offsets 339: = 0 => offsets overflowed; longest matches are present 340: -1 => failed to match 341: < -1 => some kind of unexpected problem 342: 343: The following macros are used for adding states to the two state vectors (one 344: for the current character, one for the following character). */ 345: 346: #define ADD_ACTIVE(x,y) \ 347: if (active_count++ < wscount) \ 348: { \ 349: next_active_state->offset = (x); \ 350: next_active_state->count = (y); \ 351: next_active_state++; \ 352: DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \ 353: } \ 354: else return PCRE_ERROR_DFA_WSSIZE 355: 356: #define ADD_ACTIVE_DATA(x,y,z) \ 357: if (active_count++ < wscount) \ 358: { \ 359: next_active_state->offset = (x); \ 360: next_active_state->count = (y); \ 361: next_active_state->data = (z); \ 362: next_active_state++; \ 363: DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \ 364: } \ 365: else return PCRE_ERROR_DFA_WSSIZE 366: 367: #define ADD_NEW(x,y) \ 368: if (new_count++ < wscount) \ 369: { \ 370: next_new_state->offset = (x); \ 371: next_new_state->count = (y); \ 372: next_new_state++; \ 373: DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \ 374: } \ 375: else return PCRE_ERROR_DFA_WSSIZE 376: 377: #define ADD_NEW_DATA(x,y,z) \ 378: if (new_count++ < wscount) \ 379: { \ 380: next_new_state->offset = (x); \ 381: next_new_state->count = (y); \ 382: next_new_state->data = (z); \ 383: next_new_state++; \ 
384: DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \ 385: (x), (y), (z), __LINE__)); \ 386: } \ 387: else return PCRE_ERROR_DFA_WSSIZE 388: 389: /* And now, here is the code */ 390: 391: static int 392: internal_dfa_exec( 393: dfa_match_data *md, 394: const pcre_uchar *this_start_code, 395: const pcre_uchar *current_subject, 396: int start_offset, 397: int *offsets, 398: int offsetcount, 399: int *workspace, 400: int wscount, 401: int rlevel) 402: { 403: stateblock *active_states, *new_states, *temp_states; 404: stateblock *next_active_state, *next_new_state; 405: 406: const pcre_uint8 *ctypes, *lcc, *fcc; 407: const pcre_uchar *ptr; 408: const pcre_uchar *end_code, *first_op; 409: 410: dfa_recursion_info new_recursive; 411: 412: int active_count, new_count, match_count; 413: 414: /* Some fields in the md block are frequently referenced, so we load them into 415: independent variables in the hope that this will perform better. */ 416: 417: const pcre_uchar *start_subject = md->start_subject; 418: const pcre_uchar *end_subject = md->end_subject; 419: const pcre_uchar *start_code = md->start_code; 420: 421: #ifdef SUPPORT_UTF 422: BOOL utf = (md->poptions & PCRE_UTF8) != 0; 423: #else 424: BOOL utf = FALSE; 425: #endif 426: 427: BOOL reset_could_continue = FALSE; 428: 429: rlevel++; 430: offsetcount &= (-2); 431: 432: wscount -= 2; 433: wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) / 434: (2 * INTS_PER_STATEBLOCK); 435: 436: DPRINTF(("\n%.*s---------------------\n" 437: "%.*sCall to internal_dfa_exec f=%d\n", 438: rlevel*2-2, SP, rlevel*2-2, SP, rlevel)); 439: 440: ctypes = md->tables + ctypes_offset; 441: lcc = md->tables + lcc_offset; 442: fcc = md->tables + fcc_offset; 443: 444: match_count = PCRE_ERROR_NOMATCH; /* A negative number */ 445: 446: active_states = (stateblock *)(workspace + 2); 447: next_new_state = new_states = active_states + wscount; 448: new_count = 0; 449: 450: first_op = this_start_code + 1 + LINK_SIZE + 451: 
((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA || 452: *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS) 453: ? IMM2_SIZE:0); 454: 455: /* The first thing in any (sub) pattern is a bracket of some sort. Push all 456: the alternative states onto the list, and find out where the end is. This 457: makes is possible to use this function recursively, when we want to stop at a 458: matching internal ket rather than at the end. 459: 460: If the first opcode in the first alternative is OP_REVERSE, we are dealing with 461: a backward assertion. In that case, we have to find out the maximum amount to 462: move back, and set up each alternative appropriately. */ 463: 464: if (*first_op == OP_REVERSE) 465: { 466: int max_back = 0; 467: int gone_back; 468: 469: end_code = this_start_code; 470: do 471: { 472: int back = GET(end_code, 2+LINK_SIZE); 473: if (back > max_back) max_back = back; 474: end_code += GET(end_code, 1); 475: } 476: while (*end_code == OP_ALT); 477: 478: /* If we can't go back the amount required for the longest lookbehind 479: pattern, go back as far as we can; some alternatives may still be viable. */ 480: 481: #ifdef SUPPORT_UTF 482: /* In character mode we have to step back character by character */ 483: 484: if (utf) 485: { 486: for (gone_back = 0; gone_back < max_back; gone_back++) 487: { 488: if (current_subject <= start_subject) break; 489: current_subject--; 490: ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--); 491: } 492: } 493: else 494: #endif 495: 496: /* In byte-mode we can do this quickly. */ 497: 498: { 499: gone_back = (current_subject - max_back < start_subject)? 500: (int)(current_subject - start_subject) : max_back; 501: current_subject -= gone_back; 502: } 503: 504: /* Save the earliest consulted character */ 505: 506: if (current_subject < md->start_used_ptr) 507: md->start_used_ptr = current_subject; 508: 509: /* Now we can process the individual branches. 
*/ 510: 511: end_code = this_start_code; 512: do 513: { 514: int back = GET(end_code, 2+LINK_SIZE); 515: if (back <= gone_back) 516: { 517: int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE); 518: ADD_NEW_DATA(-bstate, 0, gone_back - back); 519: } 520: end_code += GET(end_code, 1); 521: } 522: while (*end_code == OP_ALT); 523: } 524: 525: /* This is the code for a "normal" subpattern (not a backward assertion). The 526: start of a whole pattern is always one of these. If we are at the top level, 527: we may be asked to restart matching from the same point that we reached for a 528: previous partial match. We still have to scan through the top-level branches to 529: find the end state. */ 530: 531: else 532: { 533: end_code = this_start_code; 534: 535: /* Restarting */ 536: 537: if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0) 538: { 539: do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT); 540: new_count = workspace[1]; 541: if (!workspace[0]) 542: memcpy(new_states, active_states, new_count * sizeof(stateblock)); 543: } 544: 545: /* Not restarting */ 546: 547: else 548: { 549: int length = 1 + LINK_SIZE + 550: ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA || 551: *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS) 552: ? 
IMM2_SIZE:0); 553: do 554: { 555: ADD_NEW((int)(end_code - start_code + length), 0); 556: end_code += GET(end_code, 1); 557: length = 1 + LINK_SIZE; 558: } 559: while (*end_code == OP_ALT); 560: } 561: } 562: 563: workspace[0] = 0; /* Bit indicating which vector is current */ 564: 565: DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code))); 566: 567: /* Loop for scanning the subject */ 568: 569: ptr = current_subject; 570: for (;;) 571: { 572: int i, j; 573: int clen, dlen; 574: pcre_uint32 c, d; 575: int forced_fail = 0; 576: BOOL partial_newline = FALSE; 577: BOOL could_continue = reset_could_continue; 578: reset_could_continue = FALSE; 579: 580: /* Make the new state list into the active state list and empty the 581: new state list. */ 582: 583: temp_states = active_states; 584: active_states = new_states; 585: new_states = temp_states; 586: active_count = new_count; 587: new_count = 0; 588: 589: workspace[0] ^= 1; /* Remember for the restarting feature */ 590: workspace[1] = active_count; 591: 592: #ifdef PCRE_DEBUG 593: printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP); 594: pchars(ptr, STRLEN_UC(ptr), stdout); 595: printf("\"\n"); 596: 597: printf("%.*sActive states: ", rlevel*2-2, SP); 598: for (i = 0; i < active_count; i++) 599: printf("%d/%d ", active_states[i].offset, active_states[i].count); 600: printf("\n"); 601: #endif 602: 603: /* Set the pointers for adding new states */ 604: 605: next_active_state = active_states + active_count; 606: next_new_state = new_states; 607: 608: /* Load the current character from the subject outside the loop, as many 609: different states may want to look at it, and we assume that at least one 610: will. 
*/ 611: 612: if (ptr < end_subject) 613: { 614: clen = 1; /* Number of data items in the character */ 615: #ifdef SUPPORT_UTF 616: GETCHARLENTEST(c, ptr, clen); 617: #else 618: c = *ptr; 619: #endif /* SUPPORT_UTF */ 620: } 621: else 622: { 623: clen = 0; /* This indicates the end of the subject */ 624: c = NOTACHAR; /* This value should never actually be used */ 625: } 626: 627: /* Scan up the active states and act on each one. The result of an action 628: may be to add more states to the currently active list (e.g. on hitting a 629: parenthesis) or it may be to put states on the new list, for considering 630: when we move the character pointer on. */ 631: 632: for (i = 0; i < active_count; i++) 633: { 634: stateblock *current_state = active_states + i; 635: BOOL caseless = FALSE; 636: const pcre_uchar *code; 637: int state_offset = current_state->offset; 638: int codevalue, rrc; 639: int count; 640: 641: #ifdef PCRE_DEBUG 642: printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset); 643: if (clen == 0) printf("EOL\n"); 644: else if (c > 32 && c < 127) printf("'%c'\n", c); 645: else printf("0x%02x\n", c); 646: #endif 647: 648: /* A negative offset is a special case meaning "hold off going to this 649: (negated) state until the number of characters in the data field have 650: been skipped". If the could_continue flag was passed over from a previous 651: state, arrange for it to passed on. */ 652: 653: if (state_offset < 0) 654: { 655: if (current_state->data > 0) 656: { 657: DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP)); 658: ADD_NEW_DATA(state_offset, current_state->count, 659: current_state->data - 1); 660: if (could_continue) reset_could_continue = TRUE; 661: continue; 662: } 663: else 664: { 665: current_state->offset = state_offset = -state_offset; 666: } 667: } 668: 669: /* Check for a duplicate state with the same count, and skip if found. 
670: See the note at the head of this module about the possibility of improving 671: performance here. */ 672: 673: for (j = 0; j < i; j++) 674: { 675: if (active_states[j].offset == state_offset && 676: active_states[j].count == current_state->count) 677: { 678: DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP)); 679: goto NEXT_ACTIVE_STATE; 680: } 681: } 682: 683: /* The state offset is the offset to the opcode */ 684: 685: code = start_code + state_offset; 686: codevalue = *code; 687: 688: /* If this opcode inspects a character, but we are at the end of the 689: subject, remember the fact for use when testing for a partial match. */ 690: 691: if (clen == 0 && poptable[codevalue] != 0) 692: could_continue = TRUE; 693: 694: /* If this opcode is followed by an inline character, load it. It is 695: tempting to test for the presence of a subject character here, but that 696: is wrong, because sometimes zero repetitions of the subject are 697: permitted. 698: 699: We also use this mechanism for opcodes such as OP_TYPEPLUS that take an 700: argument that is not a data character - but is always one byte long because 701: the values are small. We have to take special action to deal with \P, \p, 702: \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert 703: these ones to new opcodes. 
*/ 704: 705: if (coptable[codevalue] > 0) 706: { 707: dlen = 1; 708: #ifdef SUPPORT_UTF 709: if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else 710: #endif /* SUPPORT_UTF */ 711: d = code[coptable[codevalue]]; 712: if (codevalue >= OP_TYPESTAR) 713: { 714: switch(d) 715: { 716: case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM; 717: case OP_NOTPROP: 718: case OP_PROP: codevalue += OP_PROP_EXTRA; break; 719: case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break; 720: case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break; 721: case OP_NOT_HSPACE: 722: case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break; 723: case OP_NOT_VSPACE: 724: case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break; 725: default: break; 726: } 727: } 728: } 729: else 730: { 731: dlen = 0; /* Not strictly necessary, but compilers moan */ 732: d = NOTACHAR; /* if these variables are not set. */ 733: } 734: 735: 736: /* Now process the individual opcodes */ 737: 738: switch (codevalue) 739: { 740: /* ========================================================================== */ 741: /* These cases are never obeyed. This is a fudge that causes a compile- 742: time error if the vectors coptable or poptable, which are indexed by 743: opcode, are not the correct length. It seems to be the only way to do 744: such a check at compile time, as the sizeof() operator does not work 745: in the C preprocessor. */ 746: 747: case OP_TABLE_LENGTH: 748: case OP_TABLE_LENGTH + 749: ((sizeof(coptable) == OP_TABLE_LENGTH) && 750: (sizeof(poptable) == OP_TABLE_LENGTH)): 751: break; 752: 753: /* ========================================================================== */ 754: /* Reached a closing bracket. If not at the end of the pattern, carry 755: on with the next opcode. For repeating opcodes, also add the repeat 756: state. Note that KETRPOS will always be encountered at the end of the 757: subpattern, because the possessive subpattern repeats are always handled 758: using recursive calls. 
Thus, it never adds any new states. 759: 760: At the end of the (sub)pattern, unless we have an empty string and 761: PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the 762: start of the subject, save the match data, shifting up all previous 763: matches so we always have the longest first. */ 764: 765: case OP_KET: 766: case OP_KETRMIN: 767: case OP_KETRMAX: 768: case OP_KETRPOS: 769: if (code != end_code) 770: { 771: ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0); 772: if (codevalue != OP_KET) 773: { 774: ADD_ACTIVE(state_offset - GET(code, 1), 0); 775: } 776: } 777: else 778: { 779: if (ptr > current_subject || 780: ((md->moptions & PCRE_NOTEMPTY) == 0 && 781: ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 || 782: current_subject > start_subject + md->start_offset))) 783: { 784: if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0; 785: else if (match_count > 0 && ++match_count * 2 > offsetcount) 786: match_count = 0; 787: count = ((match_count == 0)? offsetcount : match_count * 2) - 2; 788: if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int)); 789: if (offsetcount >= 2) 790: { 791: offsets[0] = (int)(current_subject - start_subject); 792: offsets[1] = (int)(ptr - start_subject); 793: DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP, 794: offsets[1] - offsets[0], (char *)current_subject)); 795: } 796: if ((md->moptions & PCRE_DFA_SHORTEST) != 0) 797: { 798: DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" 799: "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, 800: match_count, rlevel*2-2, SP)); 801: return match_count; 802: } 803: } 804: } 805: break; 806: 807: /* ========================================================================== */ 808: /* These opcodes add to the current list of states without looking 809: at the current character. 
*/ 810: 811: /*-----------------------------------------------------------------*/ 812: case OP_ALT: 813: do { code += GET(code, 1); } while (*code == OP_ALT); 814: ADD_ACTIVE((int)(code - start_code), 0); 815: break; 816: 817: /*-----------------------------------------------------------------*/ 818: case OP_BRA: 819: case OP_SBRA: 820: do 821: { 822: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); 823: code += GET(code, 1); 824: } 825: while (*code == OP_ALT); 826: break; 827: 828: /*-----------------------------------------------------------------*/ 829: case OP_CBRA: 830: case OP_SCBRA: 831: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0); 832: code += GET(code, 1); 833: while (*code == OP_ALT) 834: { 835: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); 836: code += GET(code, 1); 837: } 838: break; 839: 840: /*-----------------------------------------------------------------*/ 841: case OP_BRAZERO: 842: case OP_BRAMINZERO: 843: ADD_ACTIVE(state_offset + 1, 0); 844: code += 1 + GET(code, 2); 845: while (*code == OP_ALT) code += GET(code, 1); 846: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); 847: break; 848: 849: /*-----------------------------------------------------------------*/ 850: case OP_SKIPZERO: 851: code += 1 + GET(code, 2); 852: while (*code == OP_ALT) code += GET(code, 1); 853: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); 854: break; 855: 856: /*-----------------------------------------------------------------*/ 857: case OP_CIRC: 858: if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) 859: { ADD_ACTIVE(state_offset + 1, 0); } 860: break; 861: 862: /*-----------------------------------------------------------------*/ 863: case OP_CIRCM: 864: if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) || 865: (ptr != end_subject && WAS_NEWLINE(ptr))) 866: { ADD_ACTIVE(state_offset + 1, 0); } 867: break; 868: 869: 
/*-----------------------------------------------------------------*/ 870: case OP_EOD: 871: if (ptr >= end_subject) 872: { 873: if ((md->moptions & PCRE_PARTIAL_HARD) != 0) 874: could_continue = TRUE; 875: else { ADD_ACTIVE(state_offset + 1, 0); } 876: } 877: break; 878: 879: /*-----------------------------------------------------------------*/ 880: case OP_SOD: 881: if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); } 882: break; 883: 884: /*-----------------------------------------------------------------*/ 885: case OP_SOM: 886: if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); } 887: break; 888: 889: 890: /* ========================================================================== */ 891: /* These opcodes inspect the next subject character, and sometimes 892: the previous one as well, but do not have an argument. The variable 893: clen contains the length of the current character and is zero if we are 894: at the end of the subject. */ 895: 896: /*-----------------------------------------------------------------*/ 897: case OP_ANY: 898: if (clen > 0 && !IS_NEWLINE(ptr)) 899: { 900: if (ptr + 1 >= md->end_subject && 901: (md->moptions & (PCRE_PARTIAL_HARD)) != 0 && 902: NLBLOCK->nltype == NLTYPE_FIXED && 903: NLBLOCK->nllen == 2 && 904: c == NLBLOCK->nl[0]) 905: { 906: could_continue = partial_newline = TRUE; 907: } 908: else 909: { 910: ADD_NEW(state_offset + 1, 0); 911: } 912: } 913: break; 914: 915: /*-----------------------------------------------------------------*/ 916: case OP_ALLANY: 917: if (clen > 0) 918: { ADD_NEW(state_offset + 1, 0); } 919: break; 920: 921: /*-----------------------------------------------------------------*/ 922: case OP_EODN: 923: if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0) 924: could_continue = TRUE; 925: else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen)) 926: { ADD_ACTIVE(state_offset + 1, 0); } 927: break; 928: 929: 
/*-----------------------------------------------------------------*/ 930: case OP_DOLL: 931: if ((md->moptions & PCRE_NOTEOL) == 0) 932: { 933: if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0) 934: could_continue = TRUE; 935: else if (clen == 0 || 936: ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) && 937: (ptr == end_subject - md->nllen) 938: )) 939: { ADD_ACTIVE(state_offset + 1, 0); } 940: else if (ptr + 1 >= md->end_subject && 941: (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 && 942: NLBLOCK->nltype == NLTYPE_FIXED && 943: NLBLOCK->nllen == 2 && 944: c == NLBLOCK->nl[0]) 945: { 946: if ((md->moptions & PCRE_PARTIAL_HARD) != 0) 947: { 948: reset_could_continue = TRUE; 949: ADD_NEW_DATA(-(state_offset + 1), 0, 1); 950: } 951: else could_continue = partial_newline = TRUE; 952: } 953: } 954: break; 955: 956: /*-----------------------------------------------------------------*/ 957: case OP_DOLLM: 958: if ((md->moptions & PCRE_NOTEOL) == 0) 959: { 960: if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0) 961: could_continue = TRUE; 962: else if (clen == 0 || 963: ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr))) 964: { ADD_ACTIVE(state_offset + 1, 0); } 965: else if (ptr + 1 >= md->end_subject && 966: (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 && 967: NLBLOCK->nltype == NLTYPE_FIXED && 968: NLBLOCK->nllen == 2 && 969: c == NLBLOCK->nl[0]) 970: { 971: if ((md->moptions & PCRE_PARTIAL_HARD) != 0) 972: { 973: reset_could_continue = TRUE; 974: ADD_NEW_DATA(-(state_offset + 1), 0, 1); 975: } 976: else could_continue = partial_newline = TRUE; 977: } 978: } 979: else if (IS_NEWLINE(ptr)) 980: { ADD_ACTIVE(state_offset + 1, 0); } 981: break; 982: 983: /*-----------------------------------------------------------------*/ 984: 985: case OP_DIGIT: 986: case OP_WHITESPACE: 987: case OP_WORDCHAR: 988: if (clen > 0 && c < 256 && 989: ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0) 990: { 
ADD_NEW(state_offset + 1, 0); } 991: break; 992: 993: /*-----------------------------------------------------------------*/ 994: case OP_NOT_DIGIT: 995: case OP_NOT_WHITESPACE: 996: case OP_NOT_WORDCHAR: 997: if (clen > 0 && (c >= 256 || 998: ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)) 999: { ADD_NEW(state_offset + 1, 0); } 1000: break; 1001: 1002: /*-----------------------------------------------------------------*/ 1003: case OP_WORD_BOUNDARY: 1004: case OP_NOT_WORD_BOUNDARY: 1005: { 1006: int left_word, right_word; 1007: 1008: if (ptr > start_subject) 1009: { 1010: const pcre_uchar *temp = ptr - 1; 1011: if (temp < md->start_used_ptr) md->start_used_ptr = temp; 1012: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32 1013: if (utf) { BACKCHAR(temp); } 1014: #endif 1015: GETCHARTEST(d, temp); 1016: #ifdef SUPPORT_UCP 1017: if ((md->poptions & PCRE_UCP) != 0) 1018: { 1019: if (d == '_') left_word = TRUE; else 1020: { 1021: int cat = UCD_CATEGORY(d); 1022: left_word = (cat == ucp_L || cat == ucp_N); 1023: } 1024: } 1025: else 1026: #endif 1027: left_word = d < 256 && (ctypes[d] & ctype_word) != 0; 1028: } 1029: else left_word = FALSE; 1030: 1031: if (clen > 0) 1032: { 1033: #ifdef SUPPORT_UCP 1034: if ((md->poptions & PCRE_UCP) != 0) 1035: { 1036: if (c == '_') right_word = TRUE; else 1037: { 1038: int cat = UCD_CATEGORY(c); 1039: right_word = (cat == ucp_L || cat == ucp_N); 1040: } 1041: } 1042: else 1043: #endif 1044: right_word = c < 256 && (ctypes[c] & ctype_word) != 0; 1045: } 1046: else right_word = FALSE; 1047: 1048: if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY)) 1049: { ADD_ACTIVE(state_offset + 1, 0); } 1050: } 1051: break; 1052: 1053: 1054: /*-----------------------------------------------------------------*/ 1055: /* Check the next character by Unicode property. We will get here only 1056: if the support is in the binary; otherwise a compile-time error occurs. 
1057: */ 1058: 1059: #ifdef SUPPORT_UCP 1060: case OP_PROP: 1061: case OP_NOTPROP: 1062: if (clen > 0) 1063: { 1064: BOOL OK; 1065: const pcre_uint32 *cp; 1066: const ucd_record * prop = GET_UCD(c); 1067: switch(code[1]) 1068: { 1069: case PT_ANY: 1070: OK = TRUE; 1071: break; 1072: 1073: case PT_LAMP: 1074: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || 1075: prop->chartype == ucp_Lt; 1076: break; 1077: 1078: case PT_GC: 1079: OK = PRIV(ucp_gentype)[prop->chartype] == code[2]; 1080: break; 1081: 1082: case PT_PC: 1083: OK = prop->chartype == code[2]; 1084: break; 1085: 1086: case PT_SC: 1087: OK = prop->script == code[2]; 1088: break; 1089: 1090: /* These are specials for combination cases. */ 1091: 1092: case PT_ALNUM: 1093: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1094: PRIV(ucp_gentype)[prop->chartype] == ucp_N; 1095: break; 1096: 1097: case PT_SPACE: /* Perl space */ 1098: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || 1099: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; 1100: break; 1101: 1102: case PT_PXSPACE: /* POSIX space */ 1103: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || 1104: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || 1105: c == CHAR_FF || c == CHAR_CR; 1106: break; 1107: 1108: case PT_WORD: 1109: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1110: PRIV(ucp_gentype)[prop->chartype] == ucp_N || 1111: c == CHAR_UNDERSCORE; 1112: break; 1113: 1114: case PT_CLIST: 1115: cp = PRIV(ucd_caseless_sets) + code[2]; 1116: for (;;) 1117: { 1118: if (c < *cp) { OK = FALSE; break; } 1119: if (c == *cp++) { OK = TRUE; break; } 1120: } 1121: break; 1122: 1123: case PT_UCNC: 1124: OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 1125: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 1126: c >= 0xe000; 1127: break; 1128: 1129: /* Should never occur, but keep compilers from grumbling. 
*/ 1130: 1131: default: 1132: OK = codevalue != OP_PROP; 1133: break; 1134: } 1135: 1136: if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); } 1137: } 1138: break; 1139: #endif 1140: 1141: 1142: 1143: /* ========================================================================== */ 1144: /* These opcodes likewise inspect the subject character, but have an 1145: argument that is not a data character. It is one of these opcodes: 1146: OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, 1147: OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */ 1148: 1149: case OP_TYPEPLUS: 1150: case OP_TYPEMINPLUS: 1151: case OP_TYPEPOSPLUS: 1152: count = current_state->count; /* Already matched */ 1153: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1154: if (clen > 0) 1155: { 1156: if (d == OP_ANY && ptr + 1 >= md->end_subject && 1157: (md->moptions & (PCRE_PARTIAL_HARD)) != 0 && 1158: NLBLOCK->nltype == NLTYPE_FIXED && 1159: NLBLOCK->nllen == 2 && 1160: c == NLBLOCK->nl[0]) 1161: { 1162: could_continue = partial_newline = TRUE; 1163: } 1164: else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1165: (c < 256 && 1166: (d != OP_ANY || !IS_NEWLINE(ptr)) && 1167: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1168: { 1169: if (count > 0 && codevalue == OP_TYPEPOSPLUS) 1170: { 1171: active_count--; /* Remove non-match possibility */ 1172: next_active_state--; 1173: } 1174: count++; 1175: ADD_NEW(state_offset, count); 1176: } 1177: } 1178: break; 1179: 1180: /*-----------------------------------------------------------------*/ 1181: case OP_TYPEQUERY: 1182: case OP_TYPEMINQUERY: 1183: case OP_TYPEPOSQUERY: 1184: ADD_ACTIVE(state_offset + 2, 0); 1185: if (clen > 0) 1186: { 1187: if (d == OP_ANY && ptr + 1 >= md->end_subject && 1188: (md->moptions & (PCRE_PARTIAL_HARD)) != 0 && 1189: NLBLOCK->nltype == NLTYPE_FIXED && 1190: NLBLOCK->nllen == 2 && 1191: c == NLBLOCK->nl[0]) 1192: { 1193: could_continue = 
partial_newline = TRUE; 1194: } 1195: else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1196: (c < 256 && 1197: (d != OP_ANY || !IS_NEWLINE(ptr)) && 1198: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1199: { 1200: if (codevalue == OP_TYPEPOSQUERY) 1201: { 1202: active_count--; /* Remove non-match possibility */ 1203: next_active_state--; 1204: } 1205: ADD_NEW(state_offset + 2, 0); 1206: } 1207: } 1208: break; 1209: 1210: /*-----------------------------------------------------------------*/ 1211: case OP_TYPESTAR: 1212: case OP_TYPEMINSTAR: 1213: case OP_TYPEPOSSTAR: 1214: ADD_ACTIVE(state_offset + 2, 0); 1215: if (clen > 0) 1216: { 1217: if (d == OP_ANY && ptr + 1 >= md->end_subject && 1218: (md->moptions & (PCRE_PARTIAL_HARD)) != 0 && 1219: NLBLOCK->nltype == NLTYPE_FIXED && 1220: NLBLOCK->nllen == 2 && 1221: c == NLBLOCK->nl[0]) 1222: { 1223: could_continue = partial_newline = TRUE; 1224: } 1225: else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1226: (c < 256 && 1227: (d != OP_ANY || !IS_NEWLINE(ptr)) && 1228: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1229: { 1230: if (codevalue == OP_TYPEPOSSTAR) 1231: { 1232: active_count--; /* Remove non-match possibility */ 1233: next_active_state--; 1234: } 1235: ADD_NEW(state_offset, 0); 1236: } 1237: } 1238: break; 1239: 1240: /*-----------------------------------------------------------------*/ 1241: case OP_TYPEEXACT: 1242: count = current_state->count; /* Number already matched */ 1243: if (clen > 0) 1244: { 1245: if (d == OP_ANY && ptr + 1 >= md->end_subject && 1246: (md->moptions & (PCRE_PARTIAL_HARD)) != 0 && 1247: NLBLOCK->nltype == NLTYPE_FIXED && 1248: NLBLOCK->nllen == 2 && 1249: c == NLBLOCK->nl[0]) 1250: { 1251: could_continue = partial_newline = TRUE; 1252: } 1253: else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1254: (c < 256 && 1255: (d != OP_ANY || !IS_NEWLINE(ptr)) && 1256: ((ctypes[c] & 
toptable1[d]) ^ toptable2[d]) != 0)) 1257: { 1258: if (++count >= (int)GET2(code, 1)) 1259: { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); } 1260: else 1261: { ADD_NEW(state_offset, count); } 1262: } 1263: } 1264: break; 1265: 1266: /*-----------------------------------------------------------------*/ 1267: case OP_TYPEUPTO: 1268: case OP_TYPEMINUPTO: 1269: case OP_TYPEPOSUPTO: 1270: ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); 1271: count = current_state->count; /* Number already matched */ 1272: if (clen > 0) 1273: { 1274: if (d == OP_ANY && ptr + 1 >= md->end_subject && 1275: (md->moptions & (PCRE_PARTIAL_HARD)) != 0 && 1276: NLBLOCK->nltype == NLTYPE_FIXED && 1277: NLBLOCK->nllen == 2 && 1278: c == NLBLOCK->nl[0]) 1279: { 1280: could_continue = partial_newline = TRUE; 1281: } 1282: else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1283: (c < 256 && 1284: (d != OP_ANY || !IS_NEWLINE(ptr)) && 1285: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1286: { 1287: if (codevalue == OP_TYPEPOSUPTO) 1288: { 1289: active_count--; /* Remove non-match possibility */ 1290: next_active_state--; 1291: } 1292: if (++count >= (int)GET2(code, 1)) 1293: { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); } 1294: else 1295: { ADD_NEW(state_offset, count); } 1296: } 1297: } 1298: break; 1299: 1300: /* ========================================================================== */ 1301: /* These are virtual opcodes that are used when something like 1302: OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its 1303: argument. It keeps the code above fast for the other cases. The argument 1304: is in the d variable. 
*/ 1305: 1306: #ifdef SUPPORT_UCP 1307: case OP_PROP_EXTRA + OP_TYPEPLUS: 1308: case OP_PROP_EXTRA + OP_TYPEMINPLUS: 1309: case OP_PROP_EXTRA + OP_TYPEPOSPLUS: 1310: count = current_state->count; /* Already matched */ 1311: if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); } 1312: if (clen > 0) 1313: { 1314: BOOL OK; 1315: const pcre_uint32 *cp; 1316: const ucd_record * prop = GET_UCD(c); 1317: switch(code[2]) 1318: { 1319: case PT_ANY: 1320: OK = TRUE; 1321: break; 1322: 1323: case PT_LAMP: 1324: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || 1325: prop->chartype == ucp_Lt; 1326: break; 1327: 1328: case PT_GC: 1329: OK = PRIV(ucp_gentype)[prop->chartype] == code[3]; 1330: break; 1331: 1332: case PT_PC: 1333: OK = prop->chartype == code[3]; 1334: break; 1335: 1336: case PT_SC: 1337: OK = prop->script == code[3]; 1338: break; 1339: 1340: /* These are specials for combination cases. */ 1341: 1342: case PT_ALNUM: 1343: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1344: PRIV(ucp_gentype)[prop->chartype] == ucp_N; 1345: break; 1346: 1347: case PT_SPACE: /* Perl space */ 1348: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || 1349: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; 1350: break; 1351: 1352: case PT_PXSPACE: /* POSIX space */ 1353: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || 1354: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || 1355: c == CHAR_FF || c == CHAR_CR; 1356: break; 1357: 1358: case PT_WORD: 1359: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1360: PRIV(ucp_gentype)[prop->chartype] == ucp_N || 1361: c == CHAR_UNDERSCORE; 1362: break; 1363: 1364: case PT_CLIST: 1365: cp = PRIV(ucd_caseless_sets) + code[3]; 1366: for (;;) 1367: { 1368: if (c < *cp) { OK = FALSE; break; } 1369: if (c == *cp++) { OK = TRUE; break; } 1370: } 1371: break; 1372: 1373: case PT_UCNC: 1374: OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 1375: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 1376: c >= 0xe000; 
1377: break; 1378: 1379: /* Should never occur, but keep compilers from grumbling. */ 1380: 1381: default: 1382: OK = codevalue != OP_PROP; 1383: break; 1384: } 1385: 1386: if (OK == (d == OP_PROP)) 1387: { 1388: if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS) 1389: { 1390: active_count--; /* Remove non-match possibility */ 1391: next_active_state--; 1392: } 1393: count++; 1394: ADD_NEW(state_offset, count); 1395: } 1396: } 1397: break; 1398: 1399: /*-----------------------------------------------------------------*/ 1400: case OP_EXTUNI_EXTRA + OP_TYPEPLUS: 1401: case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS: 1402: case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS: 1403: count = current_state->count; /* Already matched */ 1404: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1405: if (clen > 0) 1406: { 1407: int lgb, rgb; 1408: const pcre_uchar *nptr = ptr + clen; 1409: int ncount = 0; 1410: if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS) 1411: { 1412: active_count--; /* Remove non-match possibility */ 1413: next_active_state--; 1414: } 1415: lgb = UCD_GRAPHBREAK(c); 1416: while (nptr < end_subject) 1417: { 1418: dlen = 1; 1419: if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } 1420: rgb = UCD_GRAPHBREAK(d); 1421: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 1422: ncount++; 1423: lgb = rgb; 1424: nptr += dlen; 1425: } 1426: count++; 1427: ADD_NEW_DATA(-state_offset, count, ncount); 1428: } 1429: break; 1430: #endif 1431: 1432: /*-----------------------------------------------------------------*/ 1433: case OP_ANYNL_EXTRA + OP_TYPEPLUS: 1434: case OP_ANYNL_EXTRA + OP_TYPEMINPLUS: 1435: case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS: 1436: count = current_state->count; /* Already matched */ 1437: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1438: if (clen > 0) 1439: { 1440: int ncount = 0; 1441: switch (c) 1442: { 1443: case CHAR_VT: 1444: case CHAR_FF: 1445: case CHAR_NEL: 1446: #ifndef EBCDIC 1447: case 0x2028: 1448: case 0x2029: 1449: 
#endif /* Not EBCDIC */ 1450: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; 1451: goto ANYNL01; 1452: 1453: case CHAR_CR: 1454: if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1; 1455: /* Fall through */ 1456: 1457: ANYNL01: 1458: case CHAR_LF: 1459: if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS) 1460: { 1461: active_count--; /* Remove non-match possibility */ 1462: next_active_state--; 1463: } 1464: count++; 1465: ADD_NEW_DATA(-state_offset, count, ncount); 1466: break; 1467: 1468: default: 1469: break; 1470: } 1471: } 1472: break; 1473: 1474: /*-----------------------------------------------------------------*/ 1475: case OP_VSPACE_EXTRA + OP_TYPEPLUS: 1476: case OP_VSPACE_EXTRA + OP_TYPEMINPLUS: 1477: case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS: 1478: count = current_state->count; /* Already matched */ 1479: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1480: if (clen > 0) 1481: { 1482: BOOL OK; 1483: switch (c) 1484: { 1485: VSPACE_CASES: 1486: OK = TRUE; 1487: break; 1488: 1489: default: 1490: OK = FALSE; 1491: break; 1492: } 1493: 1494: if (OK == (d == OP_VSPACE)) 1495: { 1496: if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS) 1497: { 1498: active_count--; /* Remove non-match possibility */ 1499: next_active_state--; 1500: } 1501: count++; 1502: ADD_NEW_DATA(-state_offset, count, 0); 1503: } 1504: } 1505: break; 1506: 1507: /*-----------------------------------------------------------------*/ 1508: case OP_HSPACE_EXTRA + OP_TYPEPLUS: 1509: case OP_HSPACE_EXTRA + OP_TYPEMINPLUS: 1510: case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS: 1511: count = current_state->count; /* Already matched */ 1512: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1513: if (clen > 0) 1514: { 1515: BOOL OK; 1516: switch (c) 1517: { 1518: HSPACE_CASES: 1519: OK = TRUE; 1520: break; 1521: 1522: default: 1523: OK = FALSE; 1524: break; 1525: } 1526: 1527: if (OK == (d == OP_HSPACE)) 1528: { 1529: if (count > 0 && codevalue == 
OP_HSPACE_EXTRA + OP_TYPEPOSPLUS) 1530: { 1531: active_count--; /* Remove non-match possibility */ 1532: next_active_state--; 1533: } 1534: count++; 1535: ADD_NEW_DATA(-state_offset, count, 0); 1536: } 1537: } 1538: break; 1539: 1540: /*-----------------------------------------------------------------*/ 1541: #ifdef SUPPORT_UCP 1542: case OP_PROP_EXTRA + OP_TYPEQUERY: 1543: case OP_PROP_EXTRA + OP_TYPEMINQUERY: 1544: case OP_PROP_EXTRA + OP_TYPEPOSQUERY: 1545: count = 4; 1546: goto QS1; 1547: 1548: case OP_PROP_EXTRA + OP_TYPESTAR: 1549: case OP_PROP_EXTRA + OP_TYPEMINSTAR: 1550: case OP_PROP_EXTRA + OP_TYPEPOSSTAR: 1551: count = 0; 1552: 1553: QS1: 1554: 1555: ADD_ACTIVE(state_offset + 4, 0); 1556: if (clen > 0) 1557: { 1558: BOOL OK; 1559: const pcre_uint32 *cp; 1560: const ucd_record * prop = GET_UCD(c); 1561: switch(code[2]) 1562: { 1563: case PT_ANY: 1564: OK = TRUE; 1565: break; 1566: 1567: case PT_LAMP: 1568: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || 1569: prop->chartype == ucp_Lt; 1570: break; 1571: 1572: case PT_GC: 1573: OK = PRIV(ucp_gentype)[prop->chartype] == code[3]; 1574: break; 1575: 1576: case PT_PC: 1577: OK = prop->chartype == code[3]; 1578: break; 1579: 1580: case PT_SC: 1581: OK = prop->script == code[3]; 1582: break; 1583: 1584: /* These are specials for combination cases. 
*/ 1585: 1586: case PT_ALNUM: 1587: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1588: PRIV(ucp_gentype)[prop->chartype] == ucp_N; 1589: break; 1590: 1591: case PT_SPACE: /* Perl space */ 1592: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || 1593: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; 1594: break; 1595: 1596: case PT_PXSPACE: /* POSIX space */ 1597: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || 1598: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || 1599: c == CHAR_FF || c == CHAR_CR; 1600: break; 1601: 1602: case PT_WORD: 1603: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1604: PRIV(ucp_gentype)[prop->chartype] == ucp_N || 1605: c == CHAR_UNDERSCORE; 1606: break; 1607: 1608: case PT_CLIST: 1609: cp = PRIV(ucd_caseless_sets) + code[3]; 1610: for (;;) 1611: { 1612: if (c < *cp) { OK = FALSE; break; } 1613: if (c == *cp++) { OK = TRUE; break; } 1614: } 1615: break; 1616: 1617: case PT_UCNC: 1618: OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 1619: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 1620: c >= 0xe000; 1621: break; 1622: 1623: /* Should never occur, but keep compilers from grumbling. 
*/ 1624: 1625: default: 1626: OK = codevalue != OP_PROP; 1627: break; 1628: } 1629: 1630: if (OK == (d == OP_PROP)) 1631: { 1632: if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR || 1633: codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY) 1634: { 1635: active_count--; /* Remove non-match possibility */ 1636: next_active_state--; 1637: } 1638: ADD_NEW(state_offset + count, 0); 1639: } 1640: } 1641: break; 1642: 1643: /*-----------------------------------------------------------------*/ 1644: case OP_EXTUNI_EXTRA + OP_TYPEQUERY: 1645: case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY: 1646: case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY: 1647: count = 2; 1648: goto QS2; 1649: 1650: case OP_EXTUNI_EXTRA + OP_TYPESTAR: 1651: case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR: 1652: case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR: 1653: count = 0; 1654: 1655: QS2: 1656: 1657: ADD_ACTIVE(state_offset + 2, 0); 1658: if (clen > 0) 1659: { 1660: int lgb, rgb; 1661: const pcre_uchar *nptr = ptr + clen; 1662: int ncount = 0; 1663: if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR || 1664: codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY) 1665: { 1666: active_count--; /* Remove non-match possibility */ 1667: next_active_state--; 1668: } 1669: lgb = UCD_GRAPHBREAK(c); 1670: while (nptr < end_subject) 1671: { 1672: dlen = 1; 1673: if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } 1674: rgb = UCD_GRAPHBREAK(d); 1675: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 1676: ncount++; 1677: lgb = rgb; 1678: nptr += dlen; 1679: } 1680: ADD_NEW_DATA(-(state_offset + count), 0, ncount); 1681: } 1682: break; 1683: #endif 1684: 1685: /*-----------------------------------------------------------------*/ 1686: case OP_ANYNL_EXTRA + OP_TYPEQUERY: 1687: case OP_ANYNL_EXTRA + OP_TYPEMINQUERY: 1688: case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY: 1689: count = 2; 1690: goto QS3; 1691: 1692: case OP_ANYNL_EXTRA + OP_TYPESTAR: 1693: case OP_ANYNL_EXTRA + OP_TYPEMINSTAR: 1694: case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR: 1695: count = 0; 1696: 1697: 
QS3: 1698: ADD_ACTIVE(state_offset + 2, 0); 1699: if (clen > 0) 1700: { 1701: int ncount = 0; 1702: switch (c) 1703: { 1704: case CHAR_VT: 1705: case CHAR_FF: 1706: case CHAR_NEL: 1707: #ifndef EBCDIC 1708: case 0x2028: 1709: case 0x2029: 1710: #endif /* Not EBCDIC */ 1711: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; 1712: goto ANYNL02; 1713: 1714: case CHAR_CR: 1715: if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1; 1716: /* Fall through */ 1717: 1718: ANYNL02: 1719: case CHAR_LF: 1720: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR || 1721: codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY) 1722: { 1723: active_count--; /* Remove non-match possibility */ 1724: next_active_state--; 1725: } 1726: ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount); 1727: break; 1728: 1729: default: 1730: break; 1731: } 1732: } 1733: break; 1734: 1735: /*-----------------------------------------------------------------*/ 1736: case OP_VSPACE_EXTRA + OP_TYPEQUERY: 1737: case OP_VSPACE_EXTRA + OP_TYPEMINQUERY: 1738: case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY: 1739: count = 2; 1740: goto QS4; 1741: 1742: case OP_VSPACE_EXTRA + OP_TYPESTAR: 1743: case OP_VSPACE_EXTRA + OP_TYPEMINSTAR: 1744: case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR: 1745: count = 0; 1746: 1747: QS4: 1748: ADD_ACTIVE(state_offset + 2, 0); 1749: if (clen > 0) 1750: { 1751: BOOL OK; 1752: switch (c) 1753: { 1754: VSPACE_CASES: 1755: OK = TRUE; 1756: break; 1757: 1758: default: 1759: OK = FALSE; 1760: break; 1761: } 1762: if (OK == (d == OP_VSPACE)) 1763: { 1764: if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR || 1765: codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY) 1766: { 1767: active_count--; /* Remove non-match possibility */ 1768: next_active_state--; 1769: } 1770: ADD_NEW_DATA(-(state_offset + (int)count), 0, 0); 1771: } 1772: } 1773: break; 1774: 1775: /*-----------------------------------------------------------------*/ 1776: case OP_HSPACE_EXTRA + OP_TYPEQUERY: 1777: case OP_HSPACE_EXTRA + 
OP_TYPEMINQUERY: 1778: case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY: 1779: count = 2; 1780: goto QS5; 1781: 1782: case OP_HSPACE_EXTRA + OP_TYPESTAR: 1783: case OP_HSPACE_EXTRA + OP_TYPEMINSTAR: 1784: case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR: 1785: count = 0; 1786: 1787: QS5: 1788: ADD_ACTIVE(state_offset + 2, 0); 1789: if (clen > 0) 1790: { 1791: BOOL OK; 1792: switch (c) 1793: { 1794: HSPACE_CASES: 1795: OK = TRUE; 1796: break; 1797: 1798: default: 1799: OK = FALSE; 1800: break; 1801: } 1802: 1803: if (OK == (d == OP_HSPACE)) 1804: { 1805: if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR || 1806: codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY) 1807: { 1808: active_count--; /* Remove non-match possibility */ 1809: next_active_state--; 1810: } 1811: ADD_NEW_DATA(-(state_offset + (int)count), 0, 0); 1812: } 1813: } 1814: break; 1815: 1816: /*-----------------------------------------------------------------*/ 1817: #ifdef SUPPORT_UCP 1818: case OP_PROP_EXTRA + OP_TYPEEXACT: 1819: case OP_PROP_EXTRA + OP_TYPEUPTO: 1820: case OP_PROP_EXTRA + OP_TYPEMINUPTO: 1821: case OP_PROP_EXTRA + OP_TYPEPOSUPTO: 1822: if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT) 1823: { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); } 1824: count = current_state->count; /* Number already matched */ 1825: if (clen > 0) 1826: { 1827: BOOL OK; 1828: const pcre_uint32 *cp; 1829: const ucd_record * prop = GET_UCD(c); 1830: switch(code[1 + IMM2_SIZE + 1]) 1831: { 1832: case PT_ANY: 1833: OK = TRUE; 1834: break; 1835: 1836: case PT_LAMP: 1837: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || 1838: prop->chartype == ucp_Lt; 1839: break; 1840: 1841: case PT_GC: 1842: OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2]; 1843: break; 1844: 1845: case PT_PC: 1846: OK = prop->chartype == code[1 + IMM2_SIZE + 2]; 1847: break; 1848: 1849: case PT_SC: 1850: OK = prop->script == code[1 + IMM2_SIZE + 2]; 1851: break; 1852: 1853: /* These are specials for combination cases. 
*/ 1854: 1855: case PT_ALNUM: 1856: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1857: PRIV(ucp_gentype)[prop->chartype] == ucp_N; 1858: break; 1859: 1860: case PT_SPACE: /* Perl space */ 1861: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || 1862: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; 1863: break; 1864: 1865: case PT_PXSPACE: /* POSIX space */ 1866: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || 1867: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || 1868: c == CHAR_FF || c == CHAR_CR; 1869: break; 1870: 1871: case PT_WORD: 1872: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1873: PRIV(ucp_gentype)[prop->chartype] == ucp_N || 1874: c == CHAR_UNDERSCORE; 1875: break; 1876: 1877: case PT_CLIST: 1878: cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2]; 1879: for (;;) 1880: { 1881: if (c < *cp) { OK = FALSE; break; } 1882: if (c == *cp++) { OK = TRUE; break; } 1883: } 1884: break; 1885: 1886: case PT_UCNC: 1887: OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 1888: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 1889: c >= 0xe000; 1890: break; 1891: 1892: /* Should never occur, but keep compilers from grumbling. 
*/ 1893: 1894: default: 1895: OK = codevalue != OP_PROP; 1896: break; 1897: } 1898: 1899: if (OK == (d == OP_PROP)) 1900: { 1901: if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO) 1902: { 1903: active_count--; /* Remove non-match possibility */ 1904: next_active_state--; 1905: } 1906: if (++count >= (int)GET2(code, 1)) 1907: { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); } 1908: else 1909: { ADD_NEW(state_offset, count); } 1910: } 1911: } 1912: break; 1913: 1914: /*-----------------------------------------------------------------*/ 1915: case OP_EXTUNI_EXTRA + OP_TYPEEXACT: 1916: case OP_EXTUNI_EXTRA + OP_TYPEUPTO: 1917: case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO: 1918: case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO: 1919: if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT) 1920: { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } 1921: count = current_state->count; /* Number already matched */ 1922: if (clen > 0) 1923: { 1924: int lgb, rgb; 1925: const pcre_uchar *nptr = ptr + clen; 1926: int ncount = 0; 1927: if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO) 1928: { 1929: active_count--; /* Remove non-match possibility */ 1930: next_active_state--; 1931: } 1932: lgb = UCD_GRAPHBREAK(c); 1933: while (nptr < end_subject) 1934: { 1935: dlen = 1; 1936: if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } 1937: rgb = UCD_GRAPHBREAK(d); 1938: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 1939: ncount++; 1940: lgb = rgb; 1941: nptr += dlen; 1942: } 1943: if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0) 1944: reset_could_continue = TRUE; 1945: if (++count >= (int)GET2(code, 1)) 1946: { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); } 1947: else 1948: { ADD_NEW_DATA(-state_offset, count, ncount); } 1949: } 1950: break; 1951: #endif 1952: 1953: /*-----------------------------------------------------------------*/ 1954: case OP_ANYNL_EXTRA + OP_TYPEEXACT: 1955: case OP_ANYNL_EXTRA + OP_TYPEUPTO: 1956: case OP_ANYNL_EXTRA + OP_TYPEMINUPTO: 1957: case 
OP_ANYNL_EXTRA + OP_TYPEPOSUPTO: 1958: if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT) 1959: { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } 1960: count = current_state->count; /* Number already matched */ 1961: if (clen > 0) 1962: { 1963: int ncount = 0; 1964: switch (c) 1965: { 1966: case CHAR_VT: 1967: case CHAR_FF: 1968: case CHAR_NEL: 1969: #ifndef EBCDIC 1970: case 0x2028: 1971: case 0x2029: 1972: #endif /* Not EBCDIC */ 1973: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; 1974: goto ANYNL03; 1975: 1976: case CHAR_CR: 1977: if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1; 1978: /* Fall through */ 1979: 1980: ANYNL03: 1981: case CHAR_LF: 1982: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO) 1983: { 1984: active_count--; /* Remove non-match possibility */ 1985: next_active_state--; 1986: } 1987: if (++count >= (int)GET2(code, 1)) 1988: { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); } 1989: else 1990: { ADD_NEW_DATA(-state_offset, count, ncount); } 1991: break; 1992: 1993: default: 1994: break; 1995: } 1996: } 1997: break; 1998: 1999: /*-----------------------------------------------------------------*/ 2000: case OP_VSPACE_EXTRA + OP_TYPEEXACT: 2001: case OP_VSPACE_EXTRA + OP_TYPEUPTO: 2002: case OP_VSPACE_EXTRA + OP_TYPEMINUPTO: 2003: case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO: 2004: if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT) 2005: { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } 2006: count = current_state->count; /* Number already matched */ 2007: if (clen > 0) 2008: { 2009: BOOL OK; 2010: switch (c) 2011: { 2012: VSPACE_CASES: 2013: OK = TRUE; 2014: break; 2015: 2016: default: 2017: OK = FALSE; 2018: } 2019: 2020: if (OK == (d == OP_VSPACE)) 2021: { 2022: if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO) 2023: { 2024: active_count--; /* Remove non-match possibility */ 2025: next_active_state--; 2026: } 2027: if (++count >= (int)GET2(code, 1)) 2028: { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); } 2029: 
else 2030: { ADD_NEW_DATA(-state_offset, count, 0); } 2031: } 2032: } 2033: break; 2034: 2035: /*-----------------------------------------------------------------*/ 2036: case OP_HSPACE_EXTRA + OP_TYPEEXACT: 2037: case OP_HSPACE_EXTRA + OP_TYPEUPTO: 2038: case OP_HSPACE_EXTRA + OP_TYPEMINUPTO: 2039: case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO: 2040: if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT) 2041: { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } 2042: count = current_state->count; /* Number already matched */ 2043: if (clen > 0) 2044: { 2045: BOOL OK; 2046: switch (c) 2047: { 2048: HSPACE_CASES: 2049: OK = TRUE; 2050: break; 2051: 2052: default: 2053: OK = FALSE; 2054: break; 2055: } 2056: 2057: if (OK == (d == OP_HSPACE)) 2058: { 2059: if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO) 2060: { 2061: active_count--; /* Remove non-match possibility */ 2062: next_active_state--; 2063: } 2064: if (++count >= (int)GET2(code, 1)) 2065: { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); } 2066: else 2067: { ADD_NEW_DATA(-state_offset, count, 0); } 2068: } 2069: } 2070: break; 2071: 2072: /* ========================================================================== */ 2073: /* These opcodes are followed by a character that is usually compared 2074: to the current subject character; it is loaded into d. We still get 2075: here even if there is no subject character, because in some cases zero 2076: repetitions are permitted. 
*/ 2077: 2078: /*-----------------------------------------------------------------*/ 2079: case OP_CHAR: 2080: if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); } 2081: break; 2082: 2083: /*-----------------------------------------------------------------*/ 2084: case OP_CHARI: 2085: if (clen == 0) break; 2086: 2087: #ifdef SUPPORT_UTF 2088: if (utf) 2089: { 2090: if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else 2091: { 2092: unsigned int othercase; 2093: if (c < 128) 2094: othercase = fcc[c]; 2095: else 2096: /* If we have Unicode property support, we can use it to test the 2097: other case of the character. */ 2098: #ifdef SUPPORT_UCP 2099: othercase = UCD_OTHERCASE(c); 2100: #else 2101: othercase = NOTACHAR; 2102: #endif 2103: 2104: if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); } 2105: } 2106: } 2107: else 2108: #endif /* SUPPORT_UTF */ 2109: /* Not UTF mode */ 2110: { 2111: if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d)) 2112: { ADD_NEW(state_offset + 2, 0); } 2113: } 2114: break; 2115: 2116: 2117: #ifdef SUPPORT_UCP 2118: /*-----------------------------------------------------------------*/ 2119: /* This is a tricky one because it can match more than one character. 2120: Find out how many characters to skip, and then set up a negative state 2121: to wait for them to pass before continuing. 
*/ 2122: 2123: case OP_EXTUNI: 2124: if (clen > 0) 2125: { 2126: int lgb, rgb; 2127: const pcre_uchar *nptr = ptr + clen; 2128: int ncount = 0; 2129: lgb = UCD_GRAPHBREAK(c); 2130: while (nptr < end_subject) 2131: { 2132: dlen = 1; 2133: if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } 2134: rgb = UCD_GRAPHBREAK(d); 2135: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 2136: ncount++; 2137: lgb = rgb; 2138: nptr += dlen; 2139: } 2140: if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0) 2141: reset_could_continue = TRUE; 2142: ADD_NEW_DATA(-(state_offset + 1), 0, ncount); 2143: } 2144: break; 2145: #endif 2146: 2147: /*-----------------------------------------------------------------*/ 2148: /* This is tricky, like EXTUNI, because it too can match more than one 2149: character (when CR is followed by LF). In this case, set up a negative 2150: state to wait for one character to pass before continuing. */ 2151: 2152: case OP_ANYNL: 2153: if (clen > 0) switch(c) 2154: { 2155: case CHAR_VT: 2156: case CHAR_FF: 2157: case CHAR_NEL: 2158: #ifndef EBCDIC 2159: case 0x2028: 2160: case 0x2029: 2161: #endif /* Not EBCDIC */ 2162: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; 2163: 2164: case CHAR_LF: 2165: ADD_NEW(state_offset + 1, 0); 2166: break; 2167: 2168: case CHAR_CR: 2169: if (ptr + 1 >= end_subject) 2170: { 2171: ADD_NEW(state_offset + 1, 0); 2172: if ((md->moptions & PCRE_PARTIAL_HARD) != 0) 2173: reset_could_continue = TRUE; 2174: } 2175: else if (RAWUCHARTEST(ptr + 1) == CHAR_LF) 2176: { 2177: ADD_NEW_DATA(-(state_offset + 1), 0, 1); 2178: } 2179: else 2180: { 2181: ADD_NEW(state_offset + 1, 0); 2182: } 2183: break; 2184: } 2185: break; 2186: 2187: /*-----------------------------------------------------------------*/ 2188: case OP_NOT_VSPACE: 2189: if (clen > 0) switch(c) 2190: { 2191: VSPACE_CASES: 2192: break; 2193: 2194: default: 2195: ADD_NEW(state_offset + 1, 0); 2196: break; 2197: } 2198: break; 2199: 2200: 
/*-----------------------------------------------------------------*/ 2201: case OP_VSPACE: 2202: if (clen > 0) switch(c) 2203: { 2204: VSPACE_CASES: 2205: ADD_NEW(state_offset + 1, 0); 2206: break; 2207: 2208: default: 2209: break; 2210: } 2211: break; 2212: 2213: /*-----------------------------------------------------------------*/ 2214: case OP_NOT_HSPACE: 2215: if (clen > 0) switch(c) 2216: { 2217: HSPACE_CASES: 2218: break; 2219: 2220: default: 2221: ADD_NEW(state_offset + 1, 0); 2222: break; 2223: } 2224: break; 2225: 2226: /*-----------------------------------------------------------------*/ 2227: case OP_HSPACE: 2228: if (clen > 0) switch(c) 2229: { 2230: HSPACE_CASES: 2231: ADD_NEW(state_offset + 1, 0); 2232: break; 2233: 2234: default: 2235: break; 2236: } 2237: break; 2238: 2239: /*-----------------------------------------------------------------*/ 2240: /* Match a negated single character casefully. */ 2241: 2242: case OP_NOT: 2243: if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); } 2244: break; 2245: 2246: /*-----------------------------------------------------------------*/ 2247: /* Match a negated single character caselessly. 
*/ 2248: 2249: case OP_NOTI: 2250: if (clen > 0) 2251: { 2252: unsigned int otherd; 2253: #ifdef SUPPORT_UTF 2254: if (utf && d >= 128) 2255: { 2256: #ifdef SUPPORT_UCP 2257: otherd = UCD_OTHERCASE(d); 2258: #endif /* SUPPORT_UCP */ 2259: } 2260: else 2261: #endif /* SUPPORT_UTF */ 2262: otherd = TABLE_GET(d, fcc, d); 2263: if (c != d && c != otherd) 2264: { ADD_NEW(state_offset + dlen + 1, 0); } 2265: } 2266: break; 2267: 2268: /*-----------------------------------------------------------------*/ 2269: case OP_PLUSI: 2270: case OP_MINPLUSI: 2271: case OP_POSPLUSI: 2272: case OP_NOTPLUSI: 2273: case OP_NOTMINPLUSI: 2274: case OP_NOTPOSPLUSI: 2275: caseless = TRUE; 2276: codevalue -= OP_STARI - OP_STAR; 2277: 2278: /* Fall through */ 2279: case OP_PLUS: 2280: case OP_MINPLUS: 2281: case OP_POSPLUS: 2282: case OP_NOTPLUS: 2283: case OP_NOTMINPLUS: 2284: case OP_NOTPOSPLUS: 2285: count = current_state->count; /* Already matched */ 2286: if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); } 2287: if (clen > 0) 2288: { 2289: pcre_uint32 otherd = NOTACHAR; 2290: if (caseless) 2291: { 2292: #ifdef SUPPORT_UTF 2293: if (utf && d >= 128) 2294: { 2295: #ifdef SUPPORT_UCP 2296: otherd = UCD_OTHERCASE(d); 2297: #endif /* SUPPORT_UCP */ 2298: } 2299: else 2300: #endif /* SUPPORT_UTF */ 2301: otherd = TABLE_GET(d, fcc, d); 2302: } 2303: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2304: { 2305: if (count > 0 && 2306: (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS)) 2307: { 2308: active_count--; /* Remove non-match possibility */ 2309: next_active_state--; 2310: } 2311: count++; 2312: ADD_NEW(state_offset, count); 2313: } 2314: } 2315: break; 2316: 2317: /*-----------------------------------------------------------------*/ 2318: case OP_QUERYI: 2319: case OP_MINQUERYI: 2320: case OP_POSQUERYI: 2321: case OP_NOTQUERYI: 2322: case OP_NOTMINQUERYI: 2323: case OP_NOTPOSQUERYI: 2324: caseless = TRUE; 2325: codevalue -= OP_STARI - OP_STAR; 2326: /* Fall through 
*/ 2327: case OP_QUERY: 2328: case OP_MINQUERY: 2329: case OP_POSQUERY: 2330: case OP_NOTQUERY: 2331: case OP_NOTMINQUERY: 2332: case OP_NOTPOSQUERY: 2333: ADD_ACTIVE(state_offset + dlen + 1, 0); 2334: if (clen > 0) 2335: { 2336: pcre_uint32 otherd = NOTACHAR; 2337: if (caseless) 2338: { 2339: #ifdef SUPPORT_UTF 2340: if (utf && d >= 128) 2341: { 2342: #ifdef SUPPORT_UCP 2343: otherd = UCD_OTHERCASE(d); 2344: #endif /* SUPPORT_UCP */ 2345: } 2346: else 2347: #endif /* SUPPORT_UTF */ 2348: otherd = TABLE_GET(d, fcc, d); 2349: } 2350: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2351: { 2352: if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY) 2353: { 2354: active_count--; /* Remove non-match possibility */ 2355: next_active_state--; 2356: } 2357: ADD_NEW(state_offset + dlen + 1, 0); 2358: } 2359: } 2360: break; 2361: 2362: /*-----------------------------------------------------------------*/ 2363: case OP_STARI: 2364: case OP_MINSTARI: 2365: case OP_POSSTARI: 2366: case OP_NOTSTARI: 2367: case OP_NOTMINSTARI: 2368: case OP_NOTPOSSTARI: 2369: caseless = TRUE; 2370: codevalue -= OP_STARI - OP_STAR; 2371: /* Fall through */ 2372: case OP_STAR: 2373: case OP_MINSTAR: 2374: case OP_POSSTAR: 2375: case OP_NOTSTAR: 2376: case OP_NOTMINSTAR: 2377: case OP_NOTPOSSTAR: 2378: ADD_ACTIVE(state_offset + dlen + 1, 0); 2379: if (clen > 0) 2380: { 2381: pcre_uint32 otherd = NOTACHAR; 2382: if (caseless) 2383: { 2384: #ifdef SUPPORT_UTF 2385: if (utf && d >= 128) 2386: { 2387: #ifdef SUPPORT_UCP 2388: otherd = UCD_OTHERCASE(d); 2389: #endif /* SUPPORT_UCP */ 2390: } 2391: else 2392: #endif /* SUPPORT_UTF */ 2393: otherd = TABLE_GET(d, fcc, d); 2394: } 2395: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2396: { 2397: if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR) 2398: { 2399: active_count--; /* Remove non-match possibility */ 2400: next_active_state--; 2401: } 2402: ADD_NEW(state_offset, 0); 2403: } 2404: } 2405: break; 2406: 2407: 
/*-----------------------------------------------------------------*/ 2408: case OP_EXACTI: 2409: case OP_NOTEXACTI: 2410: caseless = TRUE; 2411: codevalue -= OP_STARI - OP_STAR; 2412: /* Fall through */ 2413: case OP_EXACT: 2414: case OP_NOTEXACT: 2415: count = current_state->count; /* Number already matched */ 2416: if (clen > 0) 2417: { 2418: pcre_uint32 otherd = NOTACHAR; 2419: if (caseless) 2420: { 2421: #ifdef SUPPORT_UTF 2422: if (utf && d >= 128) 2423: { 2424: #ifdef SUPPORT_UCP 2425: otherd = UCD_OTHERCASE(d); 2426: #endif /* SUPPORT_UCP */ 2427: } 2428: else 2429: #endif /* SUPPORT_UTF */ 2430: otherd = TABLE_GET(d, fcc, d); 2431: } 2432: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2433: { 2434: if (++count >= (int)GET2(code, 1)) 2435: { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); } 2436: else 2437: { ADD_NEW(state_offset, count); } 2438: } 2439: } 2440: break; 2441: 2442: /*-----------------------------------------------------------------*/ 2443: case OP_UPTOI: 2444: case OP_MINUPTOI: 2445: case OP_POSUPTOI: 2446: case OP_NOTUPTOI: 2447: case OP_NOTMINUPTOI: 2448: case OP_NOTPOSUPTOI: 2449: caseless = TRUE; 2450: codevalue -= OP_STARI - OP_STAR; 2451: /* Fall through */ 2452: case OP_UPTO: 2453: case OP_MINUPTO: 2454: case OP_POSUPTO: 2455: case OP_NOTUPTO: 2456: case OP_NOTMINUPTO: 2457: case OP_NOTPOSUPTO: 2458: ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0); 2459: count = current_state->count; /* Number already matched */ 2460: if (clen > 0) 2461: { 2462: pcre_uint32 otherd = NOTACHAR; 2463: if (caseless) 2464: { 2465: #ifdef SUPPORT_UTF 2466: if (utf && d >= 128) 2467: { 2468: #ifdef SUPPORT_UCP 2469: otherd = UCD_OTHERCASE(d); 2470: #endif /* SUPPORT_UCP */ 2471: } 2472: else 2473: #endif /* SUPPORT_UTF */ 2474: otherd = TABLE_GET(d, fcc, d); 2475: } 2476: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2477: { 2478: if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO) 2479: { 2480: active_count--; /* Remove 
non-match possibility */ 2481: next_active_state--; 2482: } 2483: if (++count >= (int)GET2(code, 1)) 2484: { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); } 2485: else 2486: { ADD_NEW(state_offset, count); } 2487: } 2488: } 2489: break; 2490: 2491: 2492: /* ========================================================================== */ 2493: /* These are the class-handling opcodes */ 2494: 2495: case OP_CLASS: 2496: case OP_NCLASS: 2497: case OP_XCLASS: 2498: { 2499: BOOL isinclass = FALSE; 2500: int next_state_offset; 2501: const pcre_uchar *ecode; 2502: 2503: /* For a simple class, there is always just a 32-byte table, and we 2504: can set isinclass from it. */ 2505: 2506: if (codevalue != OP_XCLASS) 2507: { 2508: ecode = code + 1 + (32 / sizeof(pcre_uchar)); 2509: if (clen > 0) 2510: { 2511: isinclass = (c > 255)? (codevalue == OP_NCLASS) : 2512: ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0); 2513: } 2514: } 2515: 2516: /* An extended class may have a table or a list of single characters, 2517: ranges, or both, and it may be positive or negative. There's a 2518: function that sorts all this out. */ 2519: 2520: else 2521: { 2522: ecode = code + GET(code, 1); 2523: if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf); 2524: } 2525: 2526: /* At this point, isinclass is set for all kinds of class, and ecode 2527: points to the byte after the end of the class. If there is a 2528: quantifier, this is where it will be. 
*/ 2529: 2530: next_state_offset = (int)(ecode - start_code); 2531: 2532: switch (*ecode) 2533: { 2534: case OP_CRSTAR: 2535: case OP_CRMINSTAR: 2536: ADD_ACTIVE(next_state_offset + 1, 0); 2537: if (isinclass) { ADD_NEW(state_offset, 0); } 2538: break; 2539: 2540: case OP_CRPLUS: 2541: case OP_CRMINPLUS: 2542: count = current_state->count; /* Already matched */ 2543: if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); } 2544: if (isinclass) { count++; ADD_NEW(state_offset, count); } 2545: break; 2546: 2547: case OP_CRQUERY: 2548: case OP_CRMINQUERY: 2549: ADD_ACTIVE(next_state_offset + 1, 0); 2550: if (isinclass) { ADD_NEW(next_state_offset + 1, 0); } 2551: break; 2552: 2553: case OP_CRRANGE: 2554: case OP_CRMINRANGE: 2555: count = current_state->count; /* Already matched */ 2556: if (count >= (int)GET2(ecode, 1)) 2557: { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); } 2558: if (isinclass) 2559: { 2560: int max = (int)GET2(ecode, 1 + IMM2_SIZE); 2561: if (++count >= max && max != 0) /* Max 0 => no limit */ 2562: { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); } 2563: else 2564: { ADD_NEW(state_offset, count); } 2565: } 2566: break; 2567: 2568: default: 2569: if (isinclass) { ADD_NEW(next_state_offset, 0); } 2570: break; 2571: } 2572: } 2573: break; 2574: 2575: /* ========================================================================== */ 2576: /* These are the opcodes for fancy brackets of various kinds. We have 2577: to use recursion in order to handle them. The "always failing" assertion 2578: (?!) is optimised to OP_FAIL when compiling, so we have to support that, 2579: though the other "backtracking verbs" are not supported. 
*/ 2580: 2581: case OP_FAIL: 2582: forced_fail++; /* Count FAILs for multiple states */ 2583: break; 2584: 2585: case OP_ASSERT: 2586: case OP_ASSERT_NOT: 2587: case OP_ASSERTBACK: 2588: case OP_ASSERTBACK_NOT: 2589: { 2590: int rc; 2591: int local_offsets[2]; 2592: int local_workspace[1000]; 2593: const pcre_uchar *endasscode = code + GET(code, 1); 2594: 2595: while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); 2596: 2597: rc = internal_dfa_exec( 2598: md, /* static match data */ 2599: code, /* this subexpression's code */ 2600: ptr, /* where we currently are */ 2601: (int)(ptr - start_subject), /* start offset */ 2602: local_offsets, /* offset vector */ 2603: sizeof(local_offsets)/sizeof(int), /* size of same */ 2604: local_workspace, /* workspace vector */ 2605: sizeof(local_workspace)/sizeof(int), /* size of same */ 2606: rlevel); /* function recursion level */ 2607: 2608: if (rc == PCRE_ERROR_DFA_UITEM) return rc; 2609: if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK)) 2610: { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); } 2611: } 2612: break; 2613: 2614: /*-----------------------------------------------------------------*/ 2615: case OP_COND: 2616: case OP_SCOND: 2617: { 2618: int local_offsets[1000]; 2619: int local_workspace[1000]; 2620: int codelink = GET(code, 1); 2621: int condcode; 2622: 2623: /* Because of the way auto-callout works during compile, a callout item 2624: is inserted between OP_COND and an assertion condition. This does not 2625: happen for the other conditions. 
*/ 2626: 2627: if (code[LINK_SIZE+1] == OP_CALLOUT) 2628: { 2629: rrc = 0; 2630: if (PUBL(callout) != NULL) 2631: { 2632: PUBL(callout_block) cb; 2633: cb.version = 1; /* Version 1 of the callout block */ 2634: cb.callout_number = code[LINK_SIZE+2]; 2635: cb.offset_vector = offsets; 2636: #if defined COMPILE_PCRE8 2637: cb.subject = (PCRE_SPTR)start_subject; 2638: #elif defined COMPILE_PCRE16 2639: cb.subject = (PCRE_SPTR16)start_subject; 2640: #elif defined COMPILE_PCRE32 2641: cb.subject = (PCRE_SPTR32)start_subject; 2642: #endif 2643: cb.subject_length = (int)(end_subject - start_subject); 2644: cb.start_match = (int)(current_subject - start_subject); 2645: cb.current_position = (int)(ptr - start_subject); 2646: cb.pattern_position = GET(code, LINK_SIZE + 3); 2647: cb.next_item_length = GET(code, 3 + 2*LINK_SIZE); 2648: cb.capture_top = 1; 2649: cb.capture_last = -1; 2650: cb.callout_data = md->callout_data; 2651: cb.mark = NULL; /* No (*MARK) support */ 2652: if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */ 2653: } 2654: if (rrc > 0) break; /* Fail this thread */ 2655: code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */ 2656: } 2657: 2658: condcode = code[LINK_SIZE+1]; 2659: 2660: /* Back reference conditions are not supported */ 2661: 2662: if (condcode == OP_CREF || condcode == OP_NCREF) 2663: return PCRE_ERROR_DFA_UCOND; 2664: 2665: /* The DEFINE condition is always false */ 2666: 2667: if (condcode == OP_DEF) 2668: { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } 2669: 2670: /* The only supported version of OP_RREF is for the value RREF_ANY, 2671: which means "test if in any recursion". We can't test for specifically 2672: recursed groups. 
*/ 2673: 2674: else if (condcode == OP_RREF || condcode == OP_NRREF) 2675: { 2676: int value = GET2(code, LINK_SIZE + 2); 2677: if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND; 2678: if (md->recursive != NULL) 2679: { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); } 2680: else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } 2681: } 2682: 2683: /* Otherwise, the condition is an assertion */ 2684: 2685: else 2686: { 2687: int rc; 2688: const pcre_uchar *asscode = code + LINK_SIZE + 1; 2689: const pcre_uchar *endasscode = asscode + GET(asscode, 1); 2690: 2691: while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); 2692: 2693: rc = internal_dfa_exec( 2694: md, /* fixed match data */ 2695: asscode, /* this subexpression's code */ 2696: ptr, /* where we currently are */ 2697: (int)(ptr - start_subject), /* start offset */ 2698: local_offsets, /* offset vector */ 2699: sizeof(local_offsets)/sizeof(int), /* size of same */ 2700: local_workspace, /* workspace vector */ 2701: sizeof(local_workspace)/sizeof(int), /* size of same */ 2702: rlevel); /* function recursion level */ 2703: 2704: if (rc == PCRE_ERROR_DFA_UITEM) return rc; 2705: if ((rc >= 0) == 2706: (condcode == OP_ASSERT || condcode == OP_ASSERTBACK)) 2707: { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); } 2708: else 2709: { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } 2710: } 2711: } 2712: break; 2713: 2714: /*-----------------------------------------------------------------*/ 2715: case OP_RECURSE: 2716: { 2717: dfa_recursion_info *ri; 2718: int local_offsets[1000]; 2719: int local_workspace[1000]; 2720: const pcre_uchar *callpat = start_code + GET(code, 1); 2721: int recno = (callpat == md->start_code)? 0 : 2722: GET2(callpat, 1 + LINK_SIZE); 2723: int rc; 2724: 2725: DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP)); 2726: 2727: /* Check for repeating a recursion without advancing the subject 2728: pointer. 
This should catch convoluted mutual recursions. (Some simple 2729: cases are caught at compile time.) */ 2730: 2731: for (ri = md->recursive; ri != NULL; ri = ri->prevrec) 2732: if (recno == ri->group_num && ptr == ri->subject_position) 2733: return PCRE_ERROR_RECURSELOOP; 2734: 2735: /* Remember this recursion and where we started it so as to 2736: catch infinite loops. */ 2737: 2738: new_recursive.group_num = recno; 2739: new_recursive.subject_position = ptr; 2740: new_recursive.prevrec = md->recursive; 2741: md->recursive = &new_recursive; 2742: 2743: rc = internal_dfa_exec( 2744: md, /* fixed match data */ 2745: callpat, /* this subexpression's code */ 2746: ptr, /* where we currently are */ 2747: (int)(ptr - start_subject), /* start offset */ 2748: local_offsets, /* offset vector */ 2749: sizeof(local_offsets)/sizeof(int), /* size of same */ 2750: local_workspace, /* workspace vector */ 2751: sizeof(local_workspace)/sizeof(int), /* size of same */ 2752: rlevel); /* function recursion level */ 2753: 2754: md->recursive = new_recursive.prevrec; /* Done this recursion */ 2755: 2756: DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP, 2757: rc)); 2758: 2759: /* Ran out of internal offsets */ 2760: 2761: if (rc == 0) return PCRE_ERROR_DFA_RECURSE; 2762: 2763: /* For each successful matched substring, set up the next state with a 2764: count of characters to skip before trying it. Note that the count is in 2765: characters, not bytes. 
*/ 2766: 2767: if (rc > 0) 2768: { 2769: for (rc = rc*2 - 2; rc >= 0; rc -= 2) 2770: { 2771: int charcount = local_offsets[rc+1] - local_offsets[rc]; 2772: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32 2773: if (utf) 2774: { 2775: const pcre_uchar *p = start_subject + local_offsets[rc]; 2776: const pcre_uchar *pp = start_subject + local_offsets[rc+1]; 2777: while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; 2778: } 2779: #endif 2780: if (charcount > 0) 2781: { 2782: ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1)); 2783: } 2784: else 2785: { 2786: ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0); 2787: } 2788: } 2789: } 2790: else if (rc != PCRE_ERROR_NOMATCH) return rc; 2791: } 2792: break; 2793: 2794: /*-----------------------------------------------------------------*/ 2795: case OP_BRAPOS: 2796: case OP_SBRAPOS: 2797: case OP_CBRAPOS: 2798: case OP_SCBRAPOS: 2799: case OP_BRAPOSZERO: 2800: { 2801: int charcount, matched_count; 2802: const pcre_uchar *local_ptr = ptr; 2803: BOOL allow_zero; 2804: 2805: if (codevalue == OP_BRAPOSZERO) 2806: { 2807: allow_zero = TRUE; 2808: codevalue = *(++code); /* Codevalue will be one of above BRAs */ 2809: } 2810: else allow_zero = FALSE; 2811: 2812: /* Loop to match the subpattern as many times as possible as if it were 2813: a complete pattern. 
*/ 2814: 2815: for (matched_count = 0;; matched_count++) 2816: { 2817: int local_offsets[2]; 2818: int local_workspace[1000]; 2819: 2820: int rc = internal_dfa_exec( 2821: md, /* fixed match data */ 2822: code, /* this subexpression's code */ 2823: local_ptr, /* where we currently are */ 2824: (int)(ptr - start_subject), /* start offset */ 2825: local_offsets, /* offset vector */ 2826: sizeof(local_offsets)/sizeof(int), /* size of same */ 2827: local_workspace, /* workspace vector */ 2828: sizeof(local_workspace)/sizeof(int), /* size of same */ 2829: rlevel); /* function recursion level */ 2830: 2831: /* Failed to match */ 2832: 2833: if (rc < 0) 2834: { 2835: if (rc != PCRE_ERROR_NOMATCH) return rc; 2836: break; 2837: } 2838: 2839: /* Matched: break the loop if zero characters matched. */ 2840: 2841: charcount = local_offsets[1] - local_offsets[0]; 2842: if (charcount == 0) break; 2843: local_ptr += charcount; /* Advance temporary position ptr */ 2844: } 2845: 2846: /* At this point we have matched the subpattern matched_count 2847: times, and local_ptr is pointing to the character after the end of the 2848: last match. */ 2849: 2850: if (matched_count > 0 || allow_zero) 2851: { 2852: const pcre_uchar *end_subpattern = code; 2853: int next_state_offset; 2854: 2855: do { end_subpattern += GET(end_subpattern, 1); } 2856: while (*end_subpattern == OP_ALT); 2857: next_state_offset = 2858: (int)(end_subpattern - start_code + LINK_SIZE + 1); 2859: 2860: /* Optimization: if there are no more active states, and there 2861: are no new states yet set up, then skip over the subject string 2862: right here, to save looping. Otherwise, set up the new state to swing 2863: into action when the end of the matched substring is reached. 
*/ 2864: 2865: if (i + 1 >= active_count && new_count == 0) 2866: { 2867: ptr = local_ptr; 2868: clen = 0; 2869: ADD_NEW(next_state_offset, 0); 2870: } 2871: else 2872: { 2873: const pcre_uchar *p = ptr; 2874: const pcre_uchar *pp = local_ptr; 2875: charcount = (int)(pp - p); 2876: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32 2877: if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; 2878: #endif 2879: ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1)); 2880: } 2881: } 2882: } 2883: break; 2884: 2885: /*-----------------------------------------------------------------*/ 2886: case OP_ONCE: 2887: case OP_ONCE_NC: 2888: { 2889: int local_offsets[2]; 2890: int local_workspace[1000]; 2891: 2892: int rc = internal_dfa_exec( 2893: md, /* fixed match data */ 2894: code, /* this subexpression's code */ 2895: ptr, /* where we currently are */ 2896: (int)(ptr - start_subject), /* start offset */ 2897: local_offsets, /* offset vector */ 2898: sizeof(local_offsets)/sizeof(int), /* size of same */ 2899: local_workspace, /* workspace vector */ 2900: sizeof(local_workspace)/sizeof(int), /* size of same */ 2901: rlevel); /* function recursion level */ 2902: 2903: if (rc >= 0) 2904: { 2905: const pcre_uchar *end_subpattern = code; 2906: int charcount = local_offsets[1] - local_offsets[0]; 2907: int next_state_offset, repeat_state_offset; 2908: 2909: do { end_subpattern += GET(end_subpattern, 1); } 2910: while (*end_subpattern == OP_ALT); 2911: next_state_offset = 2912: (int)(end_subpattern - start_code + LINK_SIZE + 1); 2913: 2914: /* If the end of this subpattern is KETRMAX or KETRMIN, we must 2915: arrange for the repeat state also to be added to the relevant list. 2916: Calculate the offset, or set -1 for no repeat. */ 2917: 2918: repeat_state_offset = (*end_subpattern == OP_KETRMAX || 2919: *end_subpattern == OP_KETRMIN)? 
2920: (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1; 2921: 2922: /* If we have matched an empty string, add the next state at the 2923: current character pointer. This is important so that the duplicate 2924: checking kicks in, which is what breaks infinite loops that match an 2925: empty string. */ 2926: 2927: if (charcount == 0) 2928: { 2929: ADD_ACTIVE(next_state_offset, 0); 2930: } 2931: 2932: /* Optimization: if there are no more active states, and there 2933: are no new states yet set up, then skip over the subject string 2934: right here, to save looping. Otherwise, set up the new state to swing 2935: into action when the end of the matched substring is reached. */ 2936: 2937: else if (i + 1 >= active_count && new_count == 0) 2938: { 2939: ptr += charcount; 2940: clen = 0; 2941: ADD_NEW(next_state_offset, 0); 2942: 2943: /* If we are adding a repeat state at the new character position, 2944: we must fudge things so that it is the only current state. 2945: Otherwise, it might be a duplicate of one we processed before, and 2946: that would cause it to be skipped. 
*/ 2947: 2948: if (repeat_state_offset >= 0) 2949: { 2950: next_active_state = active_states; 2951: active_count = 0; 2952: i = -1; 2953: ADD_ACTIVE(repeat_state_offset, 0); 2954: } 2955: } 2956: else 2957: { 2958: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32 2959: if (utf) 2960: { 2961: const pcre_uchar *p = start_subject + local_offsets[0]; 2962: const pcre_uchar *pp = start_subject + local_offsets[1]; 2963: while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; 2964: } 2965: #endif 2966: ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1)); 2967: if (repeat_state_offset >= 0) 2968: { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); } 2969: } 2970: } 2971: else if (rc != PCRE_ERROR_NOMATCH) return rc; 2972: } 2973: break; 2974: 2975: 2976: /* ========================================================================== */ 2977: /* Handle callouts */ 2978: 2979: case OP_CALLOUT: 2980: rrc = 0; 2981: if (PUBL(callout) != NULL) 2982: { 2983: PUBL(callout_block) cb; 2984: cb.version = 1; /* Version 1 of the callout block */ 2985: cb.callout_number = code[1]; 2986: cb.offset_vector = offsets; 2987: #if defined COMPILE_PCRE8 2988: cb.subject = (PCRE_SPTR)start_subject; 2989: #elif defined COMPILE_PCRE16 2990: cb.subject = (PCRE_SPTR16)start_subject; 2991: #elif defined COMPILE_PCRE32 2992: cb.subject = (PCRE_SPTR32)start_subject; 2993: #endif 2994: cb.subject_length = (int)(end_subject - start_subject); 2995: cb.start_match = (int)(current_subject - start_subject); 2996: cb.current_position = (int)(ptr - start_subject); 2997: cb.pattern_position = GET(code, 2); 2998: cb.next_item_length = GET(code, 2 + LINK_SIZE); 2999: cb.capture_top = 1; 3000: cb.capture_last = -1; 3001: cb.callout_data = md->callout_data; 3002: cb.mark = NULL; /* No (*MARK) support */ 3003: if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */ 3004: } 3005: if (rrc == 0) 3006: { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); } 3007: break; 3008: 3009: 3010: /* 
========================================================================== */ 3011: default: /* Unsupported opcode */ 3012: return PCRE_ERROR_DFA_UITEM; 3013: } 3014: 3015: NEXT_ACTIVE_STATE: continue; 3016: 3017: } /* End of loop scanning active states */ 3018: 3019: /* We have finished the processing at the current subject character. If no 3020: new states have been set for the next character, we have found all the 3021: matches that we are going to find. If we are at the top level and partial 3022: matching has been requested, check for appropriate conditions. 3023: 3024: The "forced_fail" variable counts the number of (*F) encountered for the 3025: character. If it is equal to the original active_count (saved in 3026: workspace[1]) it means that (*F) was found on every active state. In this 3027: case we don't want to give a partial match. 3028: 3029: The "could_continue" variable is true if a state could have continued but 3030: for the fact that the end of the subject was reached. */ 3031: 3032: if (new_count <= 0) 3033: { 3034: if (rlevel == 1 && /* Top level, and */ 3035: could_continue && /* Some could go on, and */ 3036: forced_fail != workspace[1] && /* Not all forced fail & */ 3037: ( /* either... */ 3038: (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */ 3039: || /* or... */ 3040: ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */ 3041: match_count < 0) /* no matches */ 3042: ) && /* And... */ 3043: ( 3044: partial_newline || /* Either partial NL */ 3045: ( /* or ... 
*/ 3046: ptr >= end_subject && /* End of subject and */ 3047: ptr > md->start_used_ptr) /* Inspected non-empty string */ 3048: ) 3049: ) 3050: match_count = PCRE_ERROR_PARTIAL; 3051: DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" 3052: "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count, 3053: rlevel*2-2, SP)); 3054: break; /* In effect, "return", but see the comment below */ 3055: } 3056: 3057: /* One or more states are active for the next character. */ 3058: 3059: ptr += clen; /* Advance to next subject character */ 3060: } /* Loop to move along the subject string */ 3061: 3062: /* Control gets here from "break" a few lines above. We do it this way because 3063: if we use "return" above, we have compiler trouble. Some compilers warn if 3064: there's nothing here because they think the function doesn't return a value. On 3065: the other hand, if we put a dummy statement here, some more clever compilers 3066: complain that it can't be reached. Sigh. */ 3067: 3068: return match_count; 3069: } 3070: 3071: 3072: 3073: 3074: /************************************************* 3075: * Execute a Regular Expression - DFA engine * 3076: *************************************************/ 3077: 3078: /* This external function applies a compiled re to a subject string using a DFA 3079: engine. This function calls the internal function multiple times if the pattern 3080: is not anchored. 
3081: 3082: Arguments: 3083: argument_re points to the compiled expression 3084: extra_data points to extra data or is NULL 3085: subject points to the subject string 3086: length length of subject string (may contain binary zeros) 3087: start_offset where to start in the subject string 3088: options option bits 3089: offsets vector of match offsets 3090: offsetcount size of same 3091: workspace workspace vector 3092: wscount size of same 3093: 3094: Returns: > 0 => number of match offset pairs placed in offsets 3095: = 0 => offsets overflowed; longest matches are present 3096: -1 => failed to match 3097: < -1 => some kind of unexpected problem 3098: */ 3099: 3100: #if defined COMPILE_PCRE8 3101: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION 3102: pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data, 3103: const char *subject, int length, int start_offset, int options, int *offsets, 3104: int offsetcount, int *workspace, int wscount) 3105: #elif defined COMPILE_PCRE16 3106: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION 3107: pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data, 3108: PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets, 3109: int offsetcount, int *workspace, int wscount) 3110: #elif defined COMPILE_PCRE32 3111: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION 3112: pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data, 3113: PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets, 3114: int offsetcount, int *workspace, int wscount) 3115: #endif 3116: { 3117: REAL_PCRE *re = (REAL_PCRE *)argument_re; 3118: dfa_match_data match_block; 3119: dfa_match_data *md = &match_block; 3120: BOOL utf, anchored, startline, firstline; 3121: const pcre_uchar *current_subject, *end_subject; 3122: const pcre_study_data *study = NULL; 3123: 3124: const pcre_uchar *req_char_ptr; 3125: const pcre_uint8 *start_bits = NULL; 3126: BOOL has_first_char = FALSE; 3127: BOOL has_req_char = FALSE; 
3128: pcre_uchar first_char = 0; 3129: pcre_uchar first_char2 = 0; 3130: pcre_uchar req_char = 0; 3131: pcre_uchar req_char2 = 0; 3132: int newline; 3133: 3134: /* Plausibility checks */ 3135: 3136: if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; 3137: if (re == NULL || subject == NULL || workspace == NULL || 3138: (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; 3139: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; 3140: if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE; 3141: if (length < 0) return PCRE_ERROR_BADLENGTH; 3142: if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET; 3143: 3144: /* Check that the first field in the block is the magic number. If it is not, 3145: return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to 3146: REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which 3147: means that the pattern is likely compiled with different endianness. */ 3148: 3149: if (re->magic_number != MAGIC_NUMBER) 3150: return re->magic_number == REVERSED_MAGIC_NUMBER? 3151: PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC; 3152: if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE; 3153: 3154: /* If restarting after a partial match, do some sanity checks on the contents 3155: of the workspace. 
*/ 3156: 3157: if ((options & PCRE_DFA_RESTART) != 0) 3158: { 3159: if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 || 3160: workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK) 3161: return PCRE_ERROR_DFA_BADRESTART; 3162: } 3163: 3164: /* Set up study, callout, and table data */ 3165: 3166: md->tables = re->tables; 3167: md->callout_data = NULL; 3168: 3169: if (extra_data != NULL) 3170: { 3171: unsigned int flags = extra_data->flags; 3172: if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) 3173: study = (const pcre_study_data *)extra_data->study_data; 3174: if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT; 3175: if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) 3176: return PCRE_ERROR_DFA_UMLIMIT; 3177: if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) 3178: md->callout_data = extra_data->callout_data; 3179: if ((flags & PCRE_EXTRA_TABLES) != 0) 3180: md->tables = extra_data->tables; 3181: } 3182: 3183: /* Set some local values */ 3184: 3185: current_subject = (const pcre_uchar *)subject + start_offset; 3186: end_subject = (const pcre_uchar *)subject + length; 3187: req_char_ptr = current_subject - 1; 3188: 3189: #ifdef SUPPORT_UTF 3190: /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */ 3191: utf = (re->options & PCRE_UTF8) != 0; 3192: #else 3193: utf = FALSE; 3194: #endif 3195: 3196: anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 || 3197: (re->options & PCRE_ANCHORED) != 0; 3198: 3199: /* The remaining fixed data for passing around. */ 3200: 3201: md->start_code = (const pcre_uchar *)argument_re + 3202: re->name_table_offset + re->name_count * re->name_entry_size; 3203: md->start_subject = (const pcre_uchar *)subject; 3204: md->end_subject = end_subject; 3205: md->start_offset = start_offset; 3206: md->moptions = options; 3207: md->poptions = re->options; 3208: 3209: /* If the BSR option is not set at match time, copy what was set 3210: at compile time. 
*/ 3211: 3212: if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0) 3213: { 3214: if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0) 3215: md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE); 3216: #ifdef BSR_ANYCRLF 3217: else md->moptions |= PCRE_BSR_ANYCRLF; 3218: #endif 3219: } 3220: 3221: /* Handle different types of newline. The three bits give eight cases. If 3222: nothing is set at run time, whatever was used at compile time applies. */ 3223: 3224: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) & 3225: PCRE_NEWLINE_BITS) 3226: { 3227: case 0: newline = NEWLINE; break; /* Compile-time default */ 3228: case PCRE_NEWLINE_CR: newline = CHAR_CR; break; 3229: case PCRE_NEWLINE_LF: newline = CHAR_NL; break; 3230: case PCRE_NEWLINE_CR+ 3231: PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break; 3232: case PCRE_NEWLINE_ANY: newline = -1; break; 3233: case PCRE_NEWLINE_ANYCRLF: newline = -2; break; 3234: default: return PCRE_ERROR_BADNEWLINE; 3235: } 3236: 3237: if (newline == -2) 3238: { 3239: md->nltype = NLTYPE_ANYCRLF; 3240: } 3241: else if (newline < 0) 3242: { 3243: md->nltype = NLTYPE_ANY; 3244: } 3245: else 3246: { 3247: md->nltype = NLTYPE_FIXED; 3248: if (newline > 255) 3249: { 3250: md->nllen = 2; 3251: md->nl[0] = (newline >> 8) & 255; 3252: md->nl[1] = newline & 255; 3253: } 3254: else 3255: { 3256: md->nllen = 1; 3257: md->nl[0] = newline; 3258: } 3259: } 3260: 3261: /* Check a UTF-8 string if required. Unfortunately there's no way of passing 3262: back the character offset. 
*/ 3263: 3264: #ifdef SUPPORT_UTF 3265: if (utf && (options & PCRE_NO_UTF8_CHECK) == 0) 3266: { 3267: int erroroffset; 3268: int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset); 3269: if (errorcode != 0) 3270: { 3271: if (offsetcount >= 2) 3272: { 3273: offsets[0] = erroroffset; 3274: offsets[1] = errorcode; 3275: } 3276: #if defined COMPILE_PCRE8 3277: return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ? 3278: PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8; 3279: #elif defined COMPILE_PCRE16 3280: return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ? 3281: PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16; 3282: #elif defined COMPILE_PCRE32 3283: return PCRE_ERROR_BADUTF32; 3284: #endif 3285: } 3286: #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16 3287: if (start_offset > 0 && start_offset < length && 3288: NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset])) 3289: return PCRE_ERROR_BADUTF8_OFFSET; 3290: #endif 3291: } 3292: #endif 3293: 3294: /* If the exec call supplied NULL for tables, use the inbuilt ones. This 3295: is a feature that makes it possible to save compiled regex and re-use them 3296: in other programs later. */ 3297: 3298: if (md->tables == NULL) md->tables = PRIV(default_tables); 3299: 3300: /* The "must be at the start of a line" flags are used in a loop when finding 3301: where to start. */ 3302: 3303: startline = (re->flags & PCRE_STARTLINE) != 0; 3304: firstline = (re->options & PCRE_FIRSTLINE) != 0; 3305: 3306: /* Set up the first character to match, if available. The first_byte value is 3307: never set for an anchored regular expression, but the anchoring may be forced 3308: at run time, so we have to test for anchoring. The first char may be unset for 3309: an unanchored pattern, of course. If there's no first char and the pattern was 3310: studied, there may be a bitmap of possible first characters. 
*/ 3311: 3312: if (!anchored) 3313: { 3314: if ((re->flags & PCRE_FIRSTSET) != 0) 3315: { 3316: has_first_char = TRUE; 3317: first_char = first_char2 = (pcre_uchar)(re->first_char); 3318: if ((re->flags & PCRE_FCH_CASELESS) != 0) 3319: { 3320: first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char); 3321: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) 3322: if (utf && first_char > 127) 3323: first_char2 = UCD_OTHERCASE(first_char); 3324: #endif 3325: } 3326: } 3327: else 3328: { 3329: if (!startline && study != NULL && 3330: (study->flags & PCRE_STUDY_MAPPED) != 0) 3331: start_bits = study->start_bits; 3332: } 3333: } 3334: 3335: /* For anchored or unanchored matches, there may be a "last known required 3336: character" set. */ 3337: 3338: if ((re->flags & PCRE_REQCHSET) != 0) 3339: { 3340: has_req_char = TRUE; 3341: req_char = req_char2 = (pcre_uchar)(re->req_char); 3342: if ((re->flags & PCRE_RCH_CASELESS) != 0) 3343: { 3344: req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char); 3345: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) 3346: if (utf && req_char > 127) 3347: req_char2 = UCD_OTHERCASE(req_char); 3348: #endif 3349: } 3350: } 3351: 3352: /* Call the main matching function, looping for a non-anchored regex after a 3353: failed match. If not restarting, perform certain optimizations at the start of 3354: a match. */ 3355: 3356: for (;;) 3357: { 3358: int rc; 3359: 3360: if ((options & PCRE_DFA_RESTART) == 0) 3361: { 3362: const pcre_uchar *save_end_subject = end_subject; 3363: 3364: /* If firstline is TRUE, the start of the match is constrained to the first 3365: line of a multiline string. Implement this by temporarily adjusting 3366: end_subject so that we stop scanning at a newline. If the match fails at 3367: the newline, later code breaks this loop. 
*/ 3368: 3369: if (firstline) 3370: { 3371: PCRE_PUCHAR t = current_subject; 3372: #ifdef SUPPORT_UTF 3373: if (utf) 3374: { 3375: while (t < md->end_subject && !IS_NEWLINE(t)) 3376: { 3377: t++; 3378: ACROSSCHAR(t < end_subject, *t, t++); 3379: } 3380: } 3381: else 3382: #endif 3383: while (t < md->end_subject && !IS_NEWLINE(t)) t++; 3384: end_subject = t; 3385: } 3386: 3387: /* There are some optimizations that avoid running the match if a known 3388: starting point is not found. However, there is an option that disables 3389: these, for testing and for ensuring that all callouts do actually occur. 3390: The option can be set in the regex by (*NO_START_OPT) or passed in 3391: match-time options. */ 3392: 3393: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0) 3394: { 3395: /* Advance to a known first char. */ 3396: 3397: if (has_first_char) 3398: { 3399: if (first_char != first_char2) 3400: { 3401: pcre_uchar csc; 3402: while (current_subject < end_subject && 3403: (csc = RAWUCHARTEST(current_subject)) != first_char && csc != first_char2) 3404: current_subject++; 3405: } 3406: else 3407: while (current_subject < end_subject && 3408: RAWUCHARTEST(current_subject) != first_char) 3409: current_subject++; 3410: } 3411: 3412: /* Or to just after a linebreak for a multiline match if possible */ 3413: 3414: else if (startline) 3415: { 3416: if (current_subject > md->start_subject + start_offset) 3417: { 3418: #ifdef SUPPORT_UTF 3419: if (utf) 3420: { 3421: while (current_subject < end_subject && 3422: !WAS_NEWLINE(current_subject)) 3423: { 3424: current_subject++; 3425: ACROSSCHAR(current_subject < end_subject, *current_subject, 3426: current_subject++); 3427: } 3428: } 3429: else 3430: #endif 3431: while (current_subject < end_subject && !WAS_NEWLINE(current_subject)) 3432: current_subject++; 3433: 3434: /* If we have just passed a CR and the newline option is ANY or 3435: ANYCRLF, and we are now at a LF, advance the match position by one 3436: more 
character. */ 3437: 3438: if (RAWUCHARTEST(current_subject - 1) == CHAR_CR && 3439: (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && 3440: current_subject < end_subject && 3441: RAWUCHARTEST(current_subject) == CHAR_NL) 3442: current_subject++; 3443: } 3444: } 3445: 3446: /* Or to a non-unique first char after study */ 3447: 3448: else if (start_bits != NULL) 3449: { 3450: while (current_subject < end_subject) 3451: { 3452: register pcre_uint32 c = RAWUCHARTEST(current_subject); 3453: #ifndef COMPILE_PCRE8 3454: if (c > 255) c = 255; 3455: #endif 3456: if ((start_bits[c/8] & (1 << (c&7))) == 0) 3457: { 3458: current_subject++; 3459: #if defined SUPPORT_UTF && defined COMPILE_PCRE8 3460: /* In non 8-bit mode, the iteration will stop for 3461: characters > 255 at the beginning or not stop at all. */ 3462: if (utf) 3463: ACROSSCHAR(current_subject < end_subject, *current_subject, 3464: current_subject++); 3465: #endif 3466: } 3467: else break; 3468: } 3469: } 3470: } 3471: 3472: /* Restore fudged end_subject */ 3473: 3474: end_subject = save_end_subject; 3475: 3476: /* The following two optimizations are disabled for partial matching or if 3477: disabling is explicitly requested (and of course, by the test above, this 3478: code is not obeyed when restarting after a partial match). */ 3479: 3480: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && 3481: (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0) 3482: { 3483: /* If the pattern was studied, a minimum subject length may be set. This 3484: is a lower bound; no actual string of that length may actually match the 3485: pattern. Although the value is, strictly, in characters, we treat it as 3486: bytes to avoid spending too much time in this optimization. 
*/ 3487: 3488: if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 && 3489: (pcre_uint32)(end_subject - current_subject) < study->minlength) 3490: return PCRE_ERROR_NOMATCH; 3491: 3492: /* If req_char is set, we know that that character must appear in the 3493: subject for the match to succeed. If the first character is set, req_char 3494: must be later in the subject; otherwise the test starts at the match 3495: point. This optimization can save a huge amount of work in patterns with 3496: nested unlimited repeats that aren't going to match. Writing separate 3497: code for cased/caseless versions makes it go faster, as does using an 3498: autoincrement and backing off on a match. 3499: 3500: HOWEVER: when the subject string is very, very long, searching to its end 3501: can take a long time, and give bad performance on quite ordinary 3502: patterns. This showed up when somebody was matching /^C/ on a 32-megabyte 3503: string... so we don't do this when the string is sufficiently long. */ 3504: 3505: if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX) 3506: { 3507: register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0); 3508: 3509: /* We don't need to repeat the search if we haven't yet reached the 3510: place we found it at last time. */ 3511: 3512: if (p > req_char_ptr) 3513: { 3514: if (req_char != req_char2) 3515: { 3516: while (p < end_subject) 3517: { 3518: register pcre_uint32 pp = RAWUCHARINCTEST(p); 3519: if (pp == req_char || pp == req_char2) { p--; break; } 3520: } 3521: } 3522: else 3523: { 3524: while (p < end_subject) 3525: { 3526: if (RAWUCHARINCTEST(p) == req_char) { p--; break; } 3527: } 3528: } 3529: 3530: /* If we can't find the required character, break the matching loop, 3531: which will cause a return or PCRE_ERROR_NOMATCH. 
*/ 3532: 3533: if (p >= end_subject) break; 3534: 3535: /* If we have found the required character, save the point where we 3536: found it, so that we don't search again next time round the loop if 3537: the start hasn't passed this character yet. */ 3538: 3539: req_char_ptr = p; 3540: } 3541: } 3542: } 3543: } /* End of optimizations that are done when not restarting */ 3544: 3545: /* OK, now we can do the business */ 3546: 3547: md->start_used_ptr = current_subject; 3548: md->recursive = NULL; 3549: 3550: rc = internal_dfa_exec( 3551: md, /* fixed match data */ 3552: md->start_code, /* this subexpression's code */ 3553: current_subject, /* where we currently are */ 3554: start_offset, /* start offset in subject */ 3555: offsets, /* offset vector */ 3556: offsetcount, /* size of same */ 3557: workspace, /* workspace vector */ 3558: wscount, /* size of same */ 3559: 0); /* function recurse level */ 3560: 3561: /* Anything other than "no match" means we are done, always; otherwise, carry 3562: on only if not anchored. */ 3563: 3564: if (rc != PCRE_ERROR_NOMATCH || anchored) 3565: { 3566: if (rc == PCRE_ERROR_PARTIAL && offsetcount >= 2) 3567: { 3568: offsets[0] = (int)(md->start_used_ptr - (PCRE_PUCHAR)subject); 3569: offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject); 3570: if (offsetcount > 2) 3571: offsets[2] = (int)(current_subject - (PCRE_PUCHAR)subject); 3572: } 3573: return rc; 3574: } 3575: 3576: /* Advance to the next subject character unless we are at the end of a line 3577: and firstline is set. 
*/ 3578: 3579: if (firstline && IS_NEWLINE(current_subject)) break; 3580: current_subject++; 3581: #ifdef SUPPORT_UTF 3582: if (utf) 3583: { 3584: ACROSSCHAR(current_subject < end_subject, *current_subject, 3585: current_subject++); 3586: } 3587: #endif 3588: if (current_subject > end_subject) break; 3589: 3590: /* If we have just passed a CR and we are now at a LF, and the pattern does 3591: not contain any explicit matches for \r or \n, and the newline option is CRLF 3592: or ANY or ANYCRLF, advance the match position by one more character. */ 3593: 3594: if (RAWUCHARTEST(current_subject - 1) == CHAR_CR && 3595: current_subject < end_subject && 3596: RAWUCHARTEST(current_subject) == CHAR_NL && 3597: (re->flags & PCRE_HASCRORLF) == 0 && 3598: (md->nltype == NLTYPE_ANY || 3599: md->nltype == NLTYPE_ANYCRLF || 3600: md->nllen == 2)) 3601: current_subject++; 3602: 3603: } /* "Bumpalong" loop */ 3604: 3605: return PCRE_ERROR_NOMATCH; 3606: } 3607: 3608: /* End of pcre_dfa_exec.c */