embedaddon/pcre/pcre_study.c - view

File: [ELWIX - Embedded LightWeight unIX -] / embedaddon / pcre / pcre_study.c
Revision 1.1.1.4 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Mon Jul 22 08:25:55 2013 UTC (10 years, 11 months ago) by misho
Branches: pcre, MAIN
CVS tags: v8_33, HEAD

8.33

1: /************************************************* 2: * Perl-Compatible Regular Expressions * 3: *************************************************/ 4: 5: /* PCRE is a library of functions to support regular expressions whose syntax 6: and semantics are as close as possible to those of the Perl 5 language. 7: 8: Written by Philip Hazel 9: Copyright (c) 1997-2012 University of Cambridge 10: 11: ----------------------------------------------------------------------------- 12: Redistribution and use in source and binary forms, with or without 13: modification, are permitted provided that the following conditions are met: 14: 15: * Redistributions of source code must retain the above copyright notice, 16: this list of conditions and the following disclaimer. 17: 18: * Redistributions in binary form must reproduce the above copyright 19: notice, this list of conditions and the following disclaimer in the 20: documentation and/or other materials provided with the distribution. 21: 22: * Neither the name of the University of Cambridge nor the names of its 23: contributors may be used to endorse or promote products derived from 24: this software without specific prior written permission. 25: 26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36: POSSIBILITY OF SUCH DAMAGE. 37: ----------------------------------------------------------------------------- 38: */ 39: 40: 41: /* This module contains the external function pcre_study(), along with local 42: supporting functions. */ 43: 44: 45: #ifdef HAVE_CONFIG_H 46: #include "config.h" 47: #endif 48: 49: #include "pcre_internal.h" 50: 51: #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7)) 52: 53: /* Returns from set_start_bits() */ 54: 55: enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN }; 56: 57: 58: 59: /************************************************* 60: * Find the minimum subject length for a group * 61: *************************************************/ 62: 63: /* Scan a parenthesized group and compute the minimum length of subject that 64: is needed to match it. This is a lower bound; it does not mean there is a 65: string of that length that matches. In UTF8 mode, the result is in characters 66: rather than bytes. 67: 68: Arguments: 69: code pointer to start of group (the bracket) 70: startcode pointer to start of the whole pattern 71: options the compiling options 72: int RECURSE depth 73: 74: Returns: the minimum length 75: -1 if \C in UTF-8 mode or (*ACCEPT) was encountered 76: -2 internal error (missing capturing bracket) 77: -3 internal error (opcode not listed) 78: */ 79: 80: static int 81: find_minlength(const pcre_uchar *code, const pcre_uchar *startcode, int options, 82: int recurse_depth) 83: { 84: int length = -1; 85: /* PCRE_UTF16 has the same value as PCRE_UTF8. */ 86: BOOL utf = (options & PCRE_UTF8) != 0; 87: BOOL had_recurse = FALSE; 88: register int branchlength = 0; 89: register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE; 90: 91: if (*code == OP_CBRA || *code == OP_SCBRA || 92: *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE; 93: 94: /* Scan along the opcodes for this branch. If we get to the end of the 95: branch, check the length against that of the other branches. */ 96: 97: for (;;) 98: { 99: int d, min; 100: pcre_uchar *cs, *ce; 101: register pcre_uchar op = *cc; 102: 103: switch (op) 104: { 105: case OP_COND: 106: case OP_SCOND: 107: 108: /* If there is only one branch in a condition, the implied branch has zero 109: length, so we don't add anything. This covers the DEFINE "condition" 110: automatically. */ 111: 112: cs = cc + GET(cc, 1); 113: if (*cs != OP_ALT) 114: { 115: cc = cs + 1 + LINK_SIZE; 116: break; 117: } 118: 119: /* Otherwise we can fall through and treat it the same as any other 120: subpattern. */ 121: 122: case OP_CBRA: 123: case OP_SCBRA: 124: case OP_BRA: 125: case OP_SBRA: 126: case OP_CBRAPOS: 127: case OP_SCBRAPOS: 128: case OP_BRAPOS: 129: case OP_SBRAPOS: 130: case OP_ONCE: 131: case OP_ONCE_NC: 132: d = find_minlength(cc, startcode, options, recurse_depth); 133: if (d < 0) return d; 134: branchlength += d; 135: do cc += GET(cc, 1); while (*cc == OP_ALT); 136: cc += 1 + LINK_SIZE; 137: break; 138: 139: /* ACCEPT makes things far too complicated; we have to give up. */ 140: 141: case OP_ACCEPT: 142: case OP_ASSERT_ACCEPT: 143: return -1; 144: 145: /* Reached end of a branch; if it's a ket it is the end of a nested 146: call. If it's ALT it is an alternation in a nested call. If it is END it's 147: the end of the outer call. All can be handled by the same code. If an 148: ACCEPT was previously encountered, use the length that was in force at that 149: time, and pass back the shortest ACCEPT length. */ 150: 151: case OP_ALT: 152: case OP_KET: 153: case OP_KETRMAX: 154: case OP_KETRMIN: 155: case OP_KETRPOS: 156: case OP_END: 157: if (length < 0 || (!had_recurse && branchlength < length)) 158: length = branchlength; 159: if (op != OP_ALT) return length; 160: cc += 1 + LINK_SIZE; 161: branchlength = 0; 162: had_recurse = FALSE; 163: break; 164: 165: /* Skip over assertive subpatterns */ 166: 167: case OP_ASSERT: 168: case OP_ASSERT_NOT: 169: case OP_ASSERTBACK: 170: case OP_ASSERTBACK_NOT: 171: do cc += GET(cc, 1); while (*cc == OP_ALT); 172: /* Fall through */ 173: 174: /* Skip over things that don't match chars */ 175: 176: case OP_REVERSE: 177: case OP_CREF: 178: case OP_NCREF: 179: case OP_RREF: 180: case OP_NRREF: 181: case OP_DEF: 182: case OP_CALLOUT: 183: case OP_SOD: 184: case OP_SOM: 185: case OP_EOD: 186: case OP_EODN: 187: case OP_CIRC: 188: case OP_CIRCM: 189: case OP_DOLL: 190: case OP_DOLLM: 191: case OP_NOT_WORD_BOUNDARY: 192: case OP_WORD_BOUNDARY: 193: cc += PRIV(OP_lengths)[*cc]; 194: break; 195: 196: /* Skip over a subpattern that has a {0} or {0,x} quantifier */ 197: 198: case OP_BRAZERO: 199: case OP_BRAMINZERO: 200: case OP_BRAPOSZERO: 201: case OP_SKIPZERO: 202: cc += PRIV(OP_lengths)[*cc]; 203: do cc += GET(cc, 1); while (*cc == OP_ALT); 204: cc += 1 + LINK_SIZE; 205: break; 206: 207: /* Handle literal characters and + repetitions */ 208: 209: case OP_CHAR: 210: case OP_CHARI: 211: case OP_NOT: 212: case OP_NOTI: 213: case OP_PLUS: 214: case OP_PLUSI: 215: case OP_MINPLUS: 216: case OP_MINPLUSI: 217: case OP_POSPLUS: 218: case OP_POSPLUSI: 219: case OP_NOTPLUS: 220: case OP_NOTPLUSI: 221: case OP_NOTMINPLUS: 222: case OP_NOTMINPLUSI: 223: case OP_NOTPOSPLUS: 224: case OP_NOTPOSPLUSI: 225: branchlength++; 226: cc += 2; 227: #ifdef SUPPORT_UTF 228: if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); 229: #endif 230: break; 231: 232: case OP_TYPEPLUS: 233: case OP_TYPEMINPLUS: 234: case OP_TYPEPOSPLUS: 235: branchlength++; 236: cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2; 237: break; 238: 239: /* Handle exact repetitions. The count is already in characters, but we 240: need to skip over a multibyte character in UTF8 mode. */ 241: 242: case OP_EXACT: 243: case OP_EXACTI: 244: case OP_NOTEXACT: 245: case OP_NOTEXACTI: 246: branchlength += GET2(cc,1); 247: cc += 2 + IMM2_SIZE; 248: #ifdef SUPPORT_UTF 249: if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); 250: #endif 251: break; 252: 253: case OP_TYPEEXACT: 254: branchlength += GET2(cc,1); 255: cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP 256: || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0); 257: break; 258: 259: /* Handle single-char non-literal matchers */ 260: 261: case OP_PROP: 262: case OP_NOTPROP: 263: cc += 2; 264: /* Fall through */ 265: 266: case OP_NOT_DIGIT: 267: case OP_DIGIT: 268: case OP_NOT_WHITESPACE: 269: case OP_WHITESPACE: 270: case OP_NOT_WORDCHAR: 271: case OP_WORDCHAR: 272: case OP_ANY: 273: case OP_ALLANY: 274: case OP_EXTUNI: 275: case OP_HSPACE: 276: case OP_NOT_HSPACE: 277: case OP_VSPACE: 278: case OP_NOT_VSPACE: 279: branchlength++; 280: cc++; 281: break; 282: 283: /* "Any newline" might match two characters, but it also might match just 284: one. */ 285: 286: case OP_ANYNL: 287: branchlength += 1; 288: cc++; 289: break; 290: 291: /* The single-byte matcher means we can't proceed in UTF-8 mode. (In 292: non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever 293: appear, but leave the code, just in case.) */ 294: 295: case OP_ANYBYTE: 296: #ifdef SUPPORT_UTF 297: if (utf) return -1; 298: #endif 299: branchlength++; 300: cc++; 301: break; 302: 303: /* For repeated character types, we have to test for \p and \P, which have 304: an extra two bytes of parameters. */ 305: 306: case OP_TYPESTAR: 307: case OP_TYPEMINSTAR: 308: case OP_TYPEQUERY: 309: case OP_TYPEMINQUERY: 310: case OP_TYPEPOSSTAR: 311: case OP_TYPEPOSQUERY: 312: if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2; 313: cc += PRIV(OP_lengths)[op]; 314: break; 315: 316: case OP_TYPEUPTO: 317: case OP_TYPEMINUPTO: 318: case OP_TYPEPOSUPTO: 319: if (cc[1 + IMM2_SIZE] == OP_PROP 320: || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2; 321: cc += PRIV(OP_lengths)[op]; 322: break; 323: 324: /* Check a class for variable quantification */ 325: 326: case OP_CLASS: 327: case OP_NCLASS: 328: #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32 329: case OP_XCLASS: 330: /* The original code caused an unsigned overflow in 64 bit systems, 331: so now we use a conditional statement. */ 332: if (op == OP_XCLASS) 333: cc += GET(cc, 1); 334: else 335: cc += PRIV(OP_lengths)[OP_CLASS]; 336: #else 337: cc += PRIV(OP_lengths)[OP_CLASS]; 338: #endif 339: 340: switch (*cc) 341: { 342: case OP_CRPLUS: 343: case OP_CRMINPLUS: 344: branchlength++; 345: /* Fall through */ 346: 347: case OP_CRSTAR: 348: case OP_CRMINSTAR: 349: case OP_CRQUERY: 350: case OP_CRMINQUERY: 351: cc++; 352: break; 353: 354: case OP_CRRANGE: 355: case OP_CRMINRANGE: 356: branchlength += GET2(cc,1); 357: cc += 1 + 2 * IMM2_SIZE; 358: break; 359: 360: default: 361: branchlength++; 362: break; 363: } 364: break; 365: 366: /* Backreferences and subroutine calls are treated in the same way: we find 367: the minimum length for the subpattern. A recursion, however, causes an 368: a flag to be set that causes the length of this branch to be ignored. The 369: logic is that a recursion can only make sense if there is another 370: alternation that stops the recursing. That will provide the minimum length 371: (when no recursion happens). A backreference within the group that it is 372: referencing behaves in the same way. 373: 374: If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket 375: matches an empty string (by default it causes a matching failure), so in 376: that case we must set the minimum length to zero. */ 377: 378: case OP_REF: 379: case OP_REFI: 380: if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) 381: { 382: ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1)); 383: if (cs == NULL) return -2; 384: do ce += GET(ce, 1); while (*ce == OP_ALT); 385: if (cc > cs && cc < ce) 386: { 387: d = 0; 388: had_recurse = TRUE; 389: } 390: else 391: { 392: d = find_minlength(cs, startcode, options, recurse_depth); 393: } 394: } 395: else d = 0; 396: cc += 1 + IMM2_SIZE; 397: 398: /* Handle repeated back references */ 399: 400: switch (*cc) 401: { 402: case OP_CRSTAR: 403: case OP_CRMINSTAR: 404: case OP_CRQUERY: 405: case OP_CRMINQUERY: 406: min = 0; 407: cc++; 408: break; 409: 410: case OP_CRPLUS: 411: case OP_CRMINPLUS: 412: min = 1; 413: cc++; 414: break; 415: 416: case OP_CRRANGE: 417: case OP_CRMINRANGE: 418: min = GET2(cc, 1); 419: cc += 1 + 2 * IMM2_SIZE; 420: break; 421: 422: default: 423: min = 1; 424: break; 425: } 426: 427: branchlength += min * d; 428: break; 429: 430: /* We can easily detect direct recursion, but not mutual recursion. This is 431: caught by a recursion depth count. */ 432: 433: case OP_RECURSE: 434: cs = ce = (pcre_uchar *)startcode + GET(cc, 1); 435: do ce += GET(ce, 1); while (*ce == OP_ALT); 436: if ((cc > cs && cc < ce) || recurse_depth > 10) 437: had_recurse = TRUE; 438: else 439: { 440: branchlength += find_minlength(cs, startcode, options, recurse_depth + 1); 441: } 442: cc += 1 + LINK_SIZE; 443: break; 444: 445: /* Anything else does not or need not match a character. We can get the 446: item's length from the table, but for those that can match zero occurrences 447: of a character, we must take special action for UTF-8 characters. As it 448: happens, the "NOT" versions of these opcodes are used at present only for 449: ASCII characters, so they could be omitted from this list. However, in 450: future that may change, so we include them here so as not to leave a 451: gotcha for a future maintainer. */ 452: 453: case OP_UPTO: 454: case OP_UPTOI: 455: case OP_NOTUPTO: 456: case OP_NOTUPTOI: 457: case OP_MINUPTO: 458: case OP_MINUPTOI: 459: case OP_NOTMINUPTO: 460: case OP_NOTMINUPTOI: 461: case OP_POSUPTO: 462: case OP_POSUPTOI: 463: case OP_NOTPOSUPTO: 464: case OP_NOTPOSUPTOI: 465: 466: case OP_STAR: 467: case OP_STARI: 468: case OP_NOTSTAR: 469: case OP_NOTSTARI: 470: case OP_MINSTAR: 471: case OP_MINSTARI: 472: case OP_NOTMINSTAR: 473: case OP_NOTMINSTARI: 474: case OP_POSSTAR: 475: case OP_POSSTARI: 476: case OP_NOTPOSSTAR: 477: case OP_NOTPOSSTARI: 478: 479: case OP_QUERY: 480: case OP_QUERYI: 481: case OP_NOTQUERY: 482: case OP_NOTQUERYI: 483: case OP_MINQUERY: 484: case OP_MINQUERYI: 485: case OP_NOTMINQUERY: 486: case OP_NOTMINQUERYI: 487: case OP_POSQUERY: 488: case OP_POSQUERYI: 489: case OP_NOTPOSQUERY: 490: case OP_NOTPOSQUERYI: 491: 492: cc += PRIV(OP_lengths)[op]; 493: #ifdef SUPPORT_UTF 494: if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); 495: #endif 496: break; 497: 498: /* Skip these, but we need to add in the name length. */ 499: 500: case OP_MARK: 501: case OP_PRUNE_ARG: 502: case OP_SKIP_ARG: 503: case OP_THEN_ARG: 504: cc += PRIV(OP_lengths)[op] + cc[1]; 505: break; 506: 507: /* The remaining opcodes are just skipped over. */ 508: 509: case OP_CLOSE: 510: case OP_COMMIT: 511: case OP_FAIL: 512: case OP_PRUNE: 513: case OP_SET_SOM: 514: case OP_SKIP: 515: case OP_THEN: 516: cc += PRIV(OP_lengths)[op]; 517: break; 518: 519: /* This should not occur: we list all opcodes explicitly so that when 520: new ones get added they are properly considered. */ 521: 522: default: 523: return -3; 524: } 525: } 526: /* Control never gets here */ 527: } 528: 529: 530: 531: /************************************************* 532: * Set a bit and maybe its alternate case * 533: *************************************************/ 534: 535: /* Given a character, set its first byte's bit in the table, and also the 536: corresponding bit for the other version of a letter if we are caseless. In 537: UTF-8 mode, for characters greater than 127, we can only do the caseless thing 538: when Unicode property support is available. 539: 540: Arguments: 541: start_bits points to the bit map 542: p points to the character 543: caseless the caseless flag 544: cd the block with char table pointers 545: utf TRUE for UTF-8 / UTF-16 / UTF-32 mode 546: 547: Returns: pointer after the character 548: */ 549: 550: static const pcre_uchar * 551: set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless, 552: compile_data *cd, BOOL utf) 553: { 554: pcre_uint32 c = *p; 555: 556: #ifdef COMPILE_PCRE8 557: SET_BIT(c); 558: 559: #ifdef SUPPORT_UTF 560: if (utf && c > 127) 561: { 562: GETCHARINC(c, p); 563: #ifdef SUPPORT_UCP 564: if (caseless) 565: { 566: pcre_uchar buff[6]; 567: c = UCD_OTHERCASE(c); 568: (void)PRIV(ord2utf)(c, buff); 569: SET_BIT(buff[0]); 570: } 571: #endif /* Not SUPPORT_UCP */ 572: return p; 573: } 574: #else /* Not SUPPORT_UTF */ 575: (void)(utf); /* Stops warning for unused parameter */ 576: #endif /* SUPPORT_UTF */ 577: 578: /* Not UTF-8 mode, or character is less than 127. */ 579: 580: if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]); 581: return p + 1; 582: #endif /* COMPILE_PCRE8 */ 583: 584: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 585: if (c > 0xff) 586: { 587: c = 0xff; 588: caseless = FALSE; 589: } 590: SET_BIT(c); 591: 592: #ifdef SUPPORT_UTF 593: if (utf && c > 127) 594: { 595: GETCHARINC(c, p); 596: #ifdef SUPPORT_UCP 597: if (caseless) 598: { 599: c = UCD_OTHERCASE(c); 600: if (c > 0xff) 601: c = 0xff; 602: SET_BIT(c); 603: } 604: #endif /* SUPPORT_UCP */ 605: return p; 606: } 607: #else /* Not SUPPORT_UTF */ 608: (void)(utf); /* Stops warning for unused parameter */ 609: #endif /* SUPPORT_UTF */ 610: 611: if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]); 612: return p + 1; 613: #endif 614: } 615: 616: 617: 618: /************************************************* 619: * Set bits for a positive character type * 620: *************************************************/ 621: 622: /* This function sets starting bits for a character type. In UTF-8 mode, we can 623: only do a direct setting for bytes less than 128, as otherwise there can be 624: confusion with bytes in the middle of UTF-8 characters. In a "traditional" 625: environment, the tables will only recognize ASCII characters anyway, but in at 626: least one Windows environment, some higher bytes bits were set in the tables. 627: So we deal with that case by considering the UTF-8 encoding. 628: 629: Arguments: 630: start_bits the starting bitmap 631: cbit type the type of character wanted 632: table_limit 32 for non-UTF-8; 16 for UTF-8 633: cd the block with char table pointers 634: 635: Returns: nothing 636: */ 637: 638: static void 639: set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit, 640: compile_data *cd) 641: { 642: register pcre_uint32 c; 643: for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type]; 644: #if defined SUPPORT_UTF && defined COMPILE_PCRE8 645: if (table_limit == 32) return; 646: for (c = 128; c < 256; c++) 647: { 648: if ((cd->cbits[c/8] & (1 << (c&7))) != 0) 649: { 650: pcre_uchar buff[6]; 651: (void)PRIV(ord2utf)(c, buff); 652: SET_BIT(buff[0]); 653: } 654: } 655: #endif 656: } 657: 658: 659: /************************************************* 660: * Set bits for a negative character type * 661: *************************************************/ 662: 663: /* This function sets starting bits for a negative character type such as \D. 664: In UTF-8 mode, we can only do a direct setting for bytes less than 128, as 665: otherwise there can be confusion with bytes in the middle of UTF-8 characters. 666: Unlike in the positive case, where we can set appropriate starting bits for 667: specific high-valued UTF-8 characters, in this case we have to set the bits for 668: all high-valued characters. The lowest is 0xc2, but we overkill by starting at 669: 0xc0 (192) for simplicity. 670: 671: Arguments: 672: start_bits the starting bitmap 673: cbit type the type of character wanted 674: table_limit 32 for non-UTF-8; 16 for UTF-8 675: cd the block with char table pointers 676: 677: Returns: nothing 678: */ 679: 680: static void 681: set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit, 682: compile_data *cd) 683: { 684: register pcre_uint32 c; 685: for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type]; 686: #if defined SUPPORT_UTF && defined COMPILE_PCRE8 687: if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff; 688: #endif 689: } 690: 691: 692: 693: /************************************************* 694: * Create bitmap of starting bytes * 695: *************************************************/ 696: 697: /* This function scans a compiled unanchored expression recursively and 698: attempts to build a bitmap of the set of possible starting bytes. As time goes 699: by, we may be able to get more clever at doing this. The SSB_CONTINUE return is 700: useful for parenthesized groups in patterns such as (a*)b where the group 701: provides some optional starting bytes but scanning must continue at the outer 702: level to find at least one mandatory byte. At the outermost level, this 703: function fails unless the result is SSB_DONE. 704: 705: Arguments: 706: code points to an expression 707: start_bits points to a 32-byte table, initialized to 0 708: utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode 709: cd the block with char table pointers 710: 711: Returns: SSB_FAIL => Failed to find any starting bytes 712: SSB_DONE => Found mandatory starting bytes 713: SSB_CONTINUE => Found optional starting bytes 714: SSB_UNKNOWN => Hit an unrecognized opcode 715: */ 716: 717: static int 718: set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf, 719: compile_data *cd) 720: { 721: register pcre_uint32 c; 722: int yield = SSB_DONE; 723: #if defined SUPPORT_UTF && defined COMPILE_PCRE8 724: int table_limit = utf? 16:32; 725: #else 726: int table_limit = 32; 727: #endif 728: 729: #if 0 730: /* ========================================================================= */ 731: /* The following comment and code was inserted in January 1999. In May 2006, 732: when it was observed to cause compiler warnings about unused values, I took it 733: out again. If anybody is still using OS/2, they will have to put it back 734: manually. */ 735: 736: /* This next statement and the later reference to dummy are here in order to 737: trick the optimizer of the IBM C compiler for OS/2 into generating correct 738: code. Apparently IBM isn't going to fix the problem, and we would rather not 739: disable optimization (in this module it actually makes a big difference, and 740: the pcre module can use all the optimization it can get). */ 741: 742: volatile int dummy; 743: /* ========================================================================= */ 744: #endif 745: 746: do 747: { 748: BOOL try_next = TRUE; 749: const pcre_uchar *tcode = code + 1 + LINK_SIZE; 750: 751: if (*code == OP_CBRA || *code == OP_SCBRA || 752: *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE; 753: 754: while (try_next) /* Loop for items in this branch */ 755: { 756: int rc; 757: 758: switch(*tcode) 759: { 760: /* If we reach something we don't understand, it means a new opcode has 761: been created that hasn't been added to this code. Hopefully this problem 762: will be discovered during testing. */ 763: 764: default: 765: return SSB_UNKNOWN; 766: 767: /* Fail for a valid opcode that implies no starting bits. */ 768: 769: case OP_ACCEPT: 770: case OP_ASSERT_ACCEPT: 771: case OP_ALLANY: 772: case OP_ANY: 773: case OP_ANYBYTE: 774: case OP_CIRC: 775: case OP_CIRCM: 776: case OP_CLOSE: 777: case OP_COMMIT: 778: case OP_COND: 779: case OP_CREF: 780: case OP_DEF: 781: case OP_DOLL: 782: case OP_DOLLM: 783: case OP_END: 784: case OP_EOD: 785: case OP_EODN: 786: case OP_EXTUNI: 787: case OP_FAIL: 788: case OP_MARK: 789: case OP_NCREF: 790: case OP_NOT: 791: case OP_NOTEXACT: 792: case OP_NOTEXACTI: 793: case OP_NOTI: 794: case OP_NOTMINPLUS: 795: case OP_NOTMINPLUSI: 796: case OP_NOTMINQUERY: 797: case OP_NOTMINQUERYI: 798: case OP_NOTMINSTAR: 799: case OP_NOTMINSTARI: 800: case OP_NOTMINUPTO: 801: case OP_NOTMINUPTOI: 802: case OP_NOTPLUS: 803: case OP_NOTPLUSI: 804: case OP_NOTPOSPLUS: 805: case OP_NOTPOSPLUSI: 806: case OP_NOTPOSQUERY: 807: case OP_NOTPOSQUERYI: 808: case OP_NOTPOSSTAR: 809: case OP_NOTPOSSTARI: 810: case OP_NOTPOSUPTO: 811: case OP_NOTPOSUPTOI: 812: case OP_NOTPROP: 813: case OP_NOTQUERY: 814: case OP_NOTQUERYI: 815: case OP_NOTSTAR: 816: case OP_NOTSTARI: 817: case OP_NOTUPTO: 818: case OP_NOTUPTOI: 819: case OP_NOT_HSPACE: 820: case OP_NOT_VSPACE: 821: case OP_NRREF: 822: case OP_PROP: 823: case OP_PRUNE: 824: case OP_PRUNE_ARG: 825: case OP_RECURSE: 826: case OP_REF: 827: case OP_REFI: 828: case OP_REVERSE: 829: case OP_RREF: 830: case OP_SCOND: 831: case OP_SET_SOM: 832: case OP_SKIP: 833: case OP_SKIP_ARG: 834: case OP_SOD: 835: case OP_SOM: 836: case OP_THEN: 837: case OP_THEN_ARG: 838: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 839: case OP_XCLASS: 840: #endif 841: return SSB_FAIL; 842: 843: /* We can ignore word boundary tests. */ 844: 845: case OP_WORD_BOUNDARY: 846: case OP_NOT_WORD_BOUNDARY: 847: tcode++; 848: break; 849: 850: /* If we hit a bracket or a positive lookahead assertion, recurse to set 851: bits from within the subpattern. If it can't find anything, we have to 852: give up. If it finds some mandatory character(s), we are done for this 853: branch. Otherwise, carry on scanning after the subpattern. */ 854: 855: case OP_BRA: 856: case OP_SBRA: 857: case OP_CBRA: 858: case OP_SCBRA: 859: case OP_BRAPOS: 860: case OP_SBRAPOS: 861: case OP_CBRAPOS: 862: case OP_SCBRAPOS: 863: case OP_ONCE: 864: case OP_ONCE_NC: 865: case OP_ASSERT: 866: rc = set_start_bits(tcode, start_bits, utf, cd); 867: if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; 868: if (rc == SSB_DONE) try_next = FALSE; else 869: { 870: do tcode += GET(tcode, 1); while (*tcode == OP_ALT); 871: tcode += 1 + LINK_SIZE; 872: } 873: break; 874: 875: /* If we hit ALT or KET, it means we haven't found anything mandatory in 876: this branch, though we might have found something optional. For ALT, we 877: continue with the next alternative, but we have to arrange that the final 878: result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET, 879: return SSB_CONTINUE: if this is the top level, that indicates failure, 880: but after a nested subpattern, it causes scanning to continue. */ 881: 882: case OP_ALT: 883: yield = SSB_CONTINUE; 884: try_next = FALSE; 885: break; 886: 887: case OP_KET: 888: case OP_KETRMAX: 889: case OP_KETRMIN: 890: case OP_KETRPOS: 891: return SSB_CONTINUE; 892: 893: /* Skip over callout */ 894: 895: case OP_CALLOUT: 896: tcode += 2 + 2*LINK_SIZE; 897: break; 898: 899: /* Skip over lookbehind and negative lookahead assertions */ 900: 901: case OP_ASSERT_NOT: 902: case OP_ASSERTBACK: 903: case OP_ASSERTBACK_NOT: 904: do tcode += GET(tcode, 1); while (*tcode == OP_ALT); 905: tcode += 1 + LINK_SIZE; 906: break; 907: 908: /* BRAZERO does the bracket, but carries on. */ 909: 910: case OP_BRAZERO: 911: case OP_BRAMINZERO: 912: case OP_BRAPOSZERO: 913: rc = set_start_bits(++tcode, start_bits, utf, cd); 914: if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; 915: /* ========================================================================= 916: See the comment at the head of this function concerning the next line, 917: which was an old fudge for the benefit of OS/2. 918: dummy = 1; 919: ========================================================================= */ 920: do tcode += GET(tcode,1); while (*tcode == OP_ALT); 921: tcode += 1 + LINK_SIZE; 922: break; 923: 924: /* SKIPZERO skips the bracket. */ 925: 926: case OP_SKIPZERO: 927: tcode++; 928: do tcode += GET(tcode,1); while (*tcode == OP_ALT); 929: tcode += 1 + LINK_SIZE; 930: break; 931: 932: /* Single-char * or ? sets the bit and tries the next item */ 933: 934: case OP_STAR: 935: case OP_MINSTAR: 936: case OP_POSSTAR: 937: case OP_QUERY: 938: case OP_MINQUERY: 939: case OP_POSQUERY: 940: tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf); 941: break; 942: 943: case OP_STARI: 944: case OP_MINSTARI: 945: case OP_POSSTARI: 946: case OP_QUERYI: 947: case OP_MINQUERYI: 948: case OP_POSQUERYI: 949: tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf); 950: break; 951: 952: /* Single-char upto sets the bit and tries the next */ 953: 954: case OP_UPTO: 955: case OP_MINUPTO: 956: case OP_POSUPTO: 957: tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf); 958: break; 959: 960: case OP_UPTOI: 961: case OP_MINUPTOI: 962: case OP_POSUPTOI: 963: tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf); 964: break; 965: 966: /* At least one single char sets the bit and stops */ 967: 968: case OP_EXACT: 969: tcode += IMM2_SIZE; 970: /* Fall through */ 971: case OP_CHAR: 972: case OP_PLUS: 973: case OP_MINPLUS: 974: case OP_POSPLUS: 975: (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf); 976: try_next = FALSE; 977: break; 978: 979: case OP_EXACTI: 980: tcode += IMM2_SIZE; 981: /* Fall through */ 982: case OP_CHARI: 983: case OP_PLUSI: 984: case OP_MINPLUSI: 985: case OP_POSPLUSI: 986: (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf); 987: try_next = FALSE; 988: break; 989: 990: /* Special spacing and line-terminating items. These recognize specific 991: lists of characters. The difference between VSPACE and ANYNL is that the 992: latter can match the two-character CRLF sequence, but that is not 993: relevant for finding the first character, so their code here is 994: identical. */ 995: 996: case OP_HSPACE: 997: SET_BIT(CHAR_HT); 998: SET_BIT(CHAR_SPACE); 999: #ifdef SUPPORT_UTF 1000: if (utf) 1001: { 1002: #ifdef COMPILE_PCRE8 1003: SET_BIT(0xC2); /* For U+00A0 */ 1004: SET_BIT(0xE1); /* For U+1680, U+180E */ 1005: SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ 1006: SET_BIT(0xE3); /* For U+3000 */ 1007: #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32 1008: SET_BIT(0xA0); 1009: SET_BIT(0xFF); /* For characters > 255 */ 1010: #endif /* COMPILE_PCRE[8|16|32] */ 1011: } 1012: else 1013: #endif /* SUPPORT_UTF */ 1014: { 1015: #ifndef EBCDIC 1016: SET_BIT(0xA0); 1017: #endif /* Not EBCDIC */ 1018: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 1019: SET_BIT(0xFF); /* For characters > 255 */ 1020: #endif /* COMPILE_PCRE[16|32] */ 1021: } 1022: try_next = FALSE; 1023: break; 1024: 1025: case OP_ANYNL: 1026: case OP_VSPACE: 1027: SET_BIT(CHAR_LF); 1028: SET_BIT(CHAR_VT); 1029: SET_BIT(CHAR_FF); 1030: SET_BIT(CHAR_CR); 1031: #ifdef SUPPORT_UTF 1032: if (utf) 1033: { 1034: #ifdef COMPILE_PCRE8 1035: SET_BIT(0xC2); /* For U+0085 */ 1036: SET_BIT(0xE2); /* For U+2028, U+2029 */ 1037: #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32 1038: SET_BIT(CHAR_NEL); 1039: SET_BIT(0xFF); /* For characters > 255 */ 1040: #endif /* COMPILE_PCRE[8|16|32] */ 1041: } 1042: else 1043: #endif /* SUPPORT_UTF */ 1044: { 1045: SET_BIT(CHAR_NEL); 1046: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 1047: SET_BIT(0xFF); /* For characters > 255 */ 1048: #endif 1049: } 1050: try_next = FALSE; 1051: break; 1052: 1053: /* Single character types set the bits and stop. Note that if PCRE_UCP 1054: is set, we do not see these op codes because \d etc are converted to 1055: properties. Therefore, these apply in the case when only characters less 1056: than 256 are recognized to match the types. */ 1057: 1058: case OP_NOT_DIGIT: 1059: set_nottype_bits(start_bits, cbit_digit, table_limit, cd); 1060: try_next = FALSE; 1061: break; 1062: 1063: case OP_DIGIT: 1064: set_type_bits(start_bits, cbit_digit, table_limit, cd); 1065: try_next = FALSE; 1066: break; 1067: 1068: /* The cbit_space table has vertical tab as whitespace; we have to 1069: ensure it is set as not whitespace. Luckily, the code value is the same 1070: (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate bit. */ 1071: 1072: case OP_NOT_WHITESPACE: 1073: set_nottype_bits(start_bits, cbit_space, table_limit, cd); 1074: start_bits[1] |= 0x08; 1075: try_next = FALSE; 1076: break; 1077: 1078: /* The cbit_space table has vertical tab as whitespace; we have to not 1079: set it from the table. Luckily, the code value is the same (0x0b) in 1080: ASCII and EBCDIC, so we can just adjust the appropriate bit. */ 1081: 1082: case OP_WHITESPACE: 1083: c = start_bits[1]; /* Save in case it was already set */ 1084: set_type_bits(start_bits, cbit_space, table_limit, cd); 1085: start_bits[1] = (start_bits[1] & ~0x08) | c; 1086: try_next = FALSE; 1087: break; 1088: 1089: case OP_NOT_WORDCHAR: 1090: set_nottype_bits(start_bits, cbit_word, table_limit, cd); 1091: try_next = FALSE; 1092: break; 1093: 1094: case OP_WORDCHAR: 1095: set_type_bits(start_bits, cbit_word, table_limit, cd); 1096: try_next = FALSE; 1097: break; 1098: 1099: /* One or more character type fudges the pointer and restarts, knowing 1100: it will hit a single character type and stop there. */ 1101: 1102: case OP_TYPEPLUS: 1103: case OP_TYPEMINPLUS: 1104: case OP_TYPEPOSPLUS: 1105: tcode++; 1106: break; 1107: 1108: case OP_TYPEEXACT: 1109: tcode += 1 + IMM2_SIZE; 1110: break; 1111: 1112: /* Zero or more repeats of character types set the bits and then 1113: try again. */ 1114: 1115: case OP_TYPEUPTO: 1116: case OP_TYPEMINUPTO: 1117: case OP_TYPEPOSUPTO: 1118: tcode += IMM2_SIZE; /* Fall through */ 1119: 1120: case OP_TYPESTAR: 1121: case OP_TYPEMINSTAR: 1122: case OP_TYPEPOSSTAR: 1123: case OP_TYPEQUERY: 1124: case OP_TYPEMINQUERY: 1125: case OP_TYPEPOSQUERY: 1126: switch(tcode[1]) 1127: { 1128: default: 1129: case OP_ANY: 1130: case OP_ALLANY: 1131: return SSB_FAIL; 1132: 1133: case OP_HSPACE: 1134: SET_BIT(CHAR_HT); 1135: SET_BIT(CHAR_SPACE); 1136: #ifdef SUPPORT_UTF 1137: if (utf) 1138: { 1139: #ifdef COMPILE_PCRE8 1140: SET_BIT(0xC2); /* For U+00A0 */ 1141: SET_BIT(0xE1); /* For U+1680, U+180E */ 1142: SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ 1143: SET_BIT(0xE3); /* For U+3000 */ 1144: #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32 1145: SET_BIT(0xA0); 1146: SET_BIT(0xFF); /* For characters > 255 */ 1147: #endif /* COMPILE_PCRE[8|16|32] */ 1148: } 1149: else 1150: #endif /* SUPPORT_UTF */ 1151: #ifndef EBCDIC 1152: SET_BIT(0xA0); 1153: #endif /* Not EBCDIC */ 1154: break; 1155: 1156: case OP_ANYNL: 1157: case OP_VSPACE: 1158: SET_BIT(CHAR_LF); 1159: SET_BIT(CHAR_VT); 1160: SET_BIT(CHAR_FF); 1161: SET_BIT(CHAR_CR); 1162: #ifdef SUPPORT_UTF 1163: if (utf) 1164: { 1165: #ifdef COMPILE_PCRE8 1166: SET_BIT(0xC2); /* For U+0085 */ 1167: SET_BIT(0xE2); /* For U+2028, U+2029 */ 1168: #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32 1169: SET_BIT(CHAR_NEL); 1170: SET_BIT(0xFF); /* For characters > 255 */ 1171: #endif /* COMPILE_PCRE16 */ 1172: } 1173: else 1174: #endif /* SUPPORT_UTF */ 1175: SET_BIT(CHAR_NEL); 1176: break; 1177: 1178: case OP_NOT_DIGIT: 1179: set_nottype_bits(start_bits, cbit_digit, table_limit, cd); 1180: break; 1181: 1182: case OP_DIGIT: 1183: set_type_bits(start_bits, cbit_digit, table_limit, cd); 1184: break; 1185: 1186: /* The cbit_space table has vertical tab as whitespace; we have to 1187: ensure it gets set as not whitespace. Luckily, the code value is the 1188: same (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate 1189: bit. */ 1190: 1191: case OP_NOT_WHITESPACE: 1192: set_nottype_bits(start_bits, cbit_space, table_limit, cd); 1193: start_bits[1] |= 0x08; 1194: break; 1195: 1196: /* The cbit_space table has vertical tab as whitespace; we have to 1197: avoid setting it. Luckily, the code value is the same (0x0b) in ASCII 1198: and EBCDIC, so we can just adjust the appropriate bit. */ 1199: 1200: case OP_WHITESPACE: 1201: c = start_bits[1]; /* Save in case it was already set */ 1202: set_type_bits(start_bits, cbit_space, table_limit, cd); 1203: start_bits[1] = (start_bits[1] & ~0x08) | c; 1204: break; 1205: 1206: case OP_NOT_WORDCHAR: 1207: set_nottype_bits(start_bits, cbit_word, table_limit, cd); 1208: break; 1209: 1210: case OP_WORDCHAR: 1211: set_type_bits(start_bits, cbit_word, table_limit, cd); 1212: break; 1213: } 1214: 1215: tcode += 2; 1216: break; 1217: 1218: /* Character class where all the information is in a bit map: set the 1219: bits and either carry on or not, according to the repeat count. If it was 1220: a negative class, and we are operating with UTF-8 characters, any byte 1221: with a value >= 0xc4 is a potentially valid starter because it starts a 1222: character with a value > 255. */ 1223: 1224: case OP_NCLASS: 1225: #if defined SUPPORT_UTF && defined COMPILE_PCRE8 1226: if (utf) 1227: { 1228: start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */ 1229: memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */ 1230: } 1231: #endif 1232: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 1233: SET_BIT(0xFF); /* For characters > 255 */ 1234: #endif 1235: /* Fall through */ 1236: 1237: case OP_CLASS: 1238: { 1239: pcre_uint8 *map; 1240: tcode++; 1241: map = (pcre_uint8 *)tcode; 1242: 1243: /* In UTF-8 mode, the bits in a bit map correspond to character 1244: values, not to byte values. However, the bit map we are constructing is 1245: for byte values. So we have to do a conversion for characters whose 1246: value is > 127. In fact, there are only two possible starting bytes for 1247: characters in the range 128 - 255. */ 1248: 1249: #if defined SUPPORT_UTF && defined COMPILE_PCRE8 1250: if (utf) 1251: { 1252: for (c = 0; c < 16; c++) start_bits[c] |= map[c]; 1253: for (c = 128; c < 256; c++) 1254: { 1255: if ((map[c/8] && (1 << (c&7))) != 0) 1256: { 1257: int d = (c >> 6) | 0xc0; /* Set bit for this starter */ 1258: start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */ 1259: c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */ 1260: } 1261: } 1262: } 1263: else 1264: #endif 1265: { 1266: /* In non-UTF-8 mode, the two bit maps are completely compatible. */ 1267: for (c = 0; c < 32; c++) start_bits[c] |= map[c]; 1268: } 1269: 1270: /* Advance past the bit map, and act on what follows. For a zero 1271: minimum repeat, continue; otherwise stop processing. */ 1272: 1273: tcode += 32 / sizeof(pcre_uchar); 1274: switch (*tcode) 1275: { 1276: case OP_CRSTAR: 1277: case OP_CRMINSTAR: 1278: case OP_CRQUERY: 1279: case OP_CRMINQUERY: 1280: tcode++; 1281: break; 1282: 1283: case OP_CRRANGE: 1284: case OP_CRMINRANGE: 1285: if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE; 1286: else try_next = FALSE; 1287: break; 1288: 1289: default: 1290: try_next = FALSE; 1291: break; 1292: } 1293: } 1294: break; /* End of bitmap class handling */ 1295: 1296: } /* End of switch */ 1297: } /* End of try_next loop */ 1298: 1299: code += GET(code, 1); /* Advance to next branch */ 1300: } 1301: while (*code == OP_ALT); 1302: return yield; 1303: } 1304: 1305: 1306: 1307: 1308: 1309: /************************************************* 1310: * Study a compiled expression * 1311: *************************************************/ 1312: 1313: /* This function is handed a compiled expression that it must study to produce 1314: information that will speed up the matching. It returns a pcre[16]_extra block 1315: which then gets handed back to pcre_exec(). 1316: 1317: Arguments: 1318: re points to the compiled expression 1319: options contains option bits 1320: errorptr points to where to place error messages; 1321: set NULL unless error 1322: 1323: Returns: pointer to a pcre[16]_extra block, with study_data filled in and 1324: the appropriate flags set; 1325: NULL on error or if no optimization possible 1326: */ 1327: 1328: #if defined COMPILE_PCRE8 1329: PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION 1330: pcre_study(const pcre *external_re, int options, const char **errorptr) 1331: #elif defined COMPILE_PCRE16 1332: PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION 1333: pcre16_study(const pcre16 *external_re, int options, const char **errorptr) 1334: #elif defined COMPILE_PCRE32 1335: PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION 1336: pcre32_study(const pcre32 *external_re, int options, const char **errorptr) 1337: #endif 1338: { 1339: int min; 1340: BOOL bits_set = FALSE; 1341: pcre_uint8 start_bits[32]; 1342: PUBL(extra) *extra = NULL; 1343: pcre_study_data *study; 1344: const pcre_uint8 *tables; 1345: pcre_uchar *code; 1346: compile_data compile_block; 1347: const REAL_PCRE *re = (const REAL_PCRE *)external_re; 1348: 1349: *errorptr = NULL; 1350: 1351: if (re == NULL || re->magic_number != MAGIC_NUMBER) 1352: { 1353: *errorptr = "argument is not a compiled regular expression"; 1354: return NULL; 1355: } 1356: 1357: if ((re->flags & PCRE_MODE) == 0) 1358: { 1359: #if defined COMPILE_PCRE8 1360: *errorptr = "argument not compiled in 8 bit mode"; 1361: #elif defined COMPILE_PCRE16 1362: *errorptr = "argument not compiled in 16 bit mode"; 1363: #elif defined COMPILE_PCRE32 1364: *errorptr = "argument not compiled in 32 bit mode"; 1365: #endif 1366: return NULL; 1367: } 1368: 1369: if ((options & ~PUBLIC_STUDY_OPTIONS) != 0) 1370: { 1371: *errorptr = "unknown or incorrect option bit(s) set"; 1372: return NULL; 1373: } 1374: 1375: code = (pcre_uchar *)re + re->name_table_offset + 1376: (re->name_count * re->name_entry_size); 1377: 1378: /* For an anchored pattern, or an unanchored pattern that has a first char, or 1379: a multiline pattern that matches only at "line starts", there is no point in 1380: seeking a list of starting bytes. */ 1381: 1382: if ((re->options & PCRE_ANCHORED) == 0 && 1383: (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0) 1384: { 1385: int rc; 1386: 1387: /* Set the character tables in the block that is passed around */ 1388: 1389: tables = re->tables; 1390: 1391: #if defined COMPILE_PCRE8 1392: if (tables == NULL) 1393: (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, 1394: (void *)(&tables)); 1395: #elif defined COMPILE_PCRE16 1396: if (tables == NULL) 1397: (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, 1398: (void *)(&tables)); 1399: #elif defined COMPILE_PCRE32 1400: if (tables == NULL) 1401: (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, 1402: (void *)(&tables)); 1403: #endif 1404: 1405: compile_block.lcc = tables + lcc_offset; 1406: compile_block.fcc = tables + fcc_offset; 1407: compile_block.cbits = tables + cbits_offset; 1408: compile_block.ctypes = tables + ctypes_offset; 1409: 1410: /* See if we can find a fixed set of initial characters for the pattern. */ 1411: 1412: memset(start_bits, 0, 32 * sizeof(pcre_uint8)); 1413: rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0, 1414: &compile_block); 1415: bits_set = rc == SSB_DONE; 1416: if (rc == SSB_UNKNOWN) 1417: { 1418: *errorptr = "internal error: opcode not recognized"; 1419: return NULL; 1420: } 1421: } 1422: 1423: /* Find the minimum length of subject string. */ 1424: 1425: switch(min = find_minlength(code, code, re->options, 0)) 1426: { 1427: case -2: *errorptr = "internal error: missing capturing bracket"; return NULL; 1428: case -3: *errorptr = "internal error: opcode not recognized"; return NULL; 1429: default: break; 1430: } 1431: 1432: /* If a set of starting bytes has been identified, or if the minimum length is 1433: greater than zero, or if JIT optimization has been requested, or if 1434: PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a 1435: pcre_study_data block. The study data is put in the latter, which is pointed to 1436: by the former, which may also get additional data set later by the calling 1437: program. At the moment, the size of pcre_study_data is fixed. We nevertheless 1438: save it in a field for returning via the pcre_fullinfo() function so that if it 1439: becomes variable in the future, we don't have to change that code. */ 1440: 1441: if (bits_set || min > 0 || (options & ( 1442: #ifdef SUPPORT_JIT 1443: PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE | 1444: PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE | 1445: #endif 1446: PCRE_STUDY_EXTRA_NEEDED)) != 0) 1447: { 1448: extra = (PUBL(extra) *)(PUBL(malloc)) 1449: (sizeof(PUBL(extra)) + sizeof(pcre_study_data)); 1450: if (extra == NULL) 1451: { 1452: *errorptr = "failed to get memory"; 1453: return NULL; 1454: } 1455: 1456: study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra))); 1457: extra->flags = PCRE_EXTRA_STUDY_DATA; 1458: extra->study_data = study; 1459: 1460: study->size = sizeof(pcre_study_data); 1461: study->flags = 0; 1462: 1463: /* Set the start bits always, to avoid unset memory errors if the 1464: study data is written to a file, but set the flag only if any of the bits 1465: are set, to save time looking when none are. */ 1466: 1467: if (bits_set) 1468: { 1469: study->flags |= PCRE_STUDY_MAPPED; 1470: memcpy(study->start_bits, start_bits, sizeof(start_bits)); 1471: } 1472: else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8)); 1473: 1474: #ifdef PCRE_DEBUG 1475: if (bits_set) 1476: { 1477: pcre_uint8 *ptr = start_bits; 1478: int i; 1479: 1480: printf("Start bits:\n"); 1481: for (i = 0; i < 32; i++) 1482: printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n"); 1483: } 1484: #endif 1485: 1486: /* Always set the minlength value in the block, because the JIT compiler 1487: makes use of it. However, don't set the bit unless the length is greater than 1488: zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time 1489: checking the zero case. */ 1490: 1491: if (min > 0) 1492: { 1493: study->flags |= PCRE_STUDY_MINLEN; 1494: study->minlength = min; 1495: } 1496: else study->minlength = 0; 1497: 1498: /* If JIT support was compiled and requested, attempt the JIT compilation. 1499: If no starting bytes were found, and the minimum length is zero, and JIT 1500: compilation fails, abandon the extra block and return NULL, unless 1501: PCRE_STUDY_EXTRA_NEEDED is set. */ 1502: 1503: #ifdef SUPPORT_JIT 1504: extra->executable_jit = NULL; 1505: if ((options & PCRE_STUDY_JIT_COMPILE) != 0) 1506: PRIV(jit_compile)(re, extra, JIT_COMPILE); 1507: if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0) 1508: PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE); 1509: if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0) 1510: PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE); 1511: 1512: if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 && 1513: (options & PCRE_STUDY_EXTRA_NEEDED) == 0) 1514: { 1515: #if defined COMPILE_PCRE8 1516: pcre_free_study(extra); 1517: #elif defined COMPILE_PCRE16 1518: pcre16_free_study(extra); 1519: #elif defined COMPILE_PCRE32 1520: pcre32_free_study(extra); 1521: #endif 1522: extra = NULL; 1523: } 1524: #endif 1525: } 1526: 1527: return extra; 1528: } 1529: 1530: 1531: /************************************************* 1532: * Free the study data * 1533: *************************************************/ 1534: 1535: /* This function frees the memory that was obtained by pcre_study(). 1536: 1537: Argument: a pointer to the pcre[16]_extra block 1538: Returns: nothing 1539: */ 1540: 1541: #if defined COMPILE_PCRE8 1542: PCRE_EXP_DEFN void 1543: pcre_free_study(pcre_extra *extra) 1544: #elif defined COMPILE_PCRE16 1545: PCRE_EXP_DEFN void 1546: pcre16_free_study(pcre16_extra *extra) 1547: #elif defined COMPILE_PCRE32 1548: PCRE_EXP_DEFN void 1549: pcre32_free_study(pcre32_extra *extra) 1550: #endif 1551: { 1552: if (extra == NULL) 1553: return; 1554: #ifdef SUPPORT_JIT 1555: if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 && 1556: extra->executable_jit != NULL) 1557: PRIV(jit_free)(extra->executable_jit); 1558: #endif 1559: PUBL(free)(extra); 1560: } 1561: 1562: /* End of pcre_study.c */