embedaddon/pcre/pcre_study.c - view

File: [ELWIX - Embedded LightWeight unIX -] / embedaddon / pcre / pcre_study.c
Revision 1.1.1.5 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Sun Jun 15 19:46:03 2014 UTC (10 years ago) by misho
Branches: pcre, MAIN
CVS tags: v8_34, HEAD

pcre 8.34

1: /************************************************* 2: * Perl-Compatible Regular Expressions * 3: *************************************************/ 4: 5: /* PCRE is a library of functions to support regular expressions whose syntax 6: and semantics are as close as possible to those of the Perl 5 language. 7: 8: Written by Philip Hazel 9: Copyright (c) 1997-2012 University of Cambridge 10: 11: ----------------------------------------------------------------------------- 12: Redistribution and use in source and binary forms, with or without 13: modification, are permitted provided that the following conditions are met: 14: 15: * Redistributions of source code must retain the above copyright notice, 16: this list of conditions and the following disclaimer. 17: 18: * Redistributions in binary form must reproduce the above copyright 19: notice, this list of conditions and the following disclaimer in the 20: documentation and/or other materials provided with the distribution. 21: 22: * Neither the name of the University of Cambridge nor the names of its 23: contributors may be used to endorse or promote products derived from 24: this software without specific prior written permission. 25: 26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36: POSSIBILITY OF SUCH DAMAGE. 37: ----------------------------------------------------------------------------- 38: */ 39: 40: 41: /* This module contains the external function pcre_study(), along with local 42: supporting functions. */ 43: 44: 45: #ifdef HAVE_CONFIG_H 46: #include "config.h" 47: #endif 48: 49: #include "pcre_internal.h" 50: 51: #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7)) 52: 53: /* Returns from set_start_bits() */ 54: 55: enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN }; 56: 57: 58: 59: /************************************************* 60: * Find the minimum subject length for a group * 61: *************************************************/ 62: 63: /* Scan a parenthesized group and compute the minimum length of subject that 64: is needed to match it. This is a lower bound; it does not mean there is a 65: string of that length that matches. In UTF8 mode, the result is in characters 66: rather than bytes. 67: 68: Arguments: 69: re compiled pattern block 70: code pointer to start of group (the bracket) 71: startcode pointer to start of the whole pattern's code 72: options the compiling options 73: int RECURSE depth 74: 75: Returns: the minimum length 76: -1 if \C in UTF-8 mode or (*ACCEPT) was encountered 77: -2 internal error (missing capturing bracket) 78: -3 internal error (opcode not listed) 79: */ 80: 81: static int 82: find_minlength(const REAL_PCRE *re, const pcre_uchar *code, 83: const pcre_uchar *startcode, int options, int recurse_depth) 84: { 85: int length = -1; 86: /* PCRE_UTF16 has the same value as PCRE_UTF8. */ 87: BOOL utf = (options & PCRE_UTF8) != 0; 88: BOOL had_recurse = FALSE; 89: register int branchlength = 0; 90: register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE; 91: 92: if (*code == OP_CBRA || *code == OP_SCBRA || 93: *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE; 94: 95: /* Scan along the opcodes for this branch. If we get to the end of the 96: branch, check the length against that of the other branches. */ 97: 98: for (;;) 99: { 100: int d, min; 101: pcre_uchar *cs, *ce; 102: register pcre_uchar op = *cc; 103: 104: switch (op) 105: { 106: case OP_COND: 107: case OP_SCOND: 108: 109: /* If there is only one branch in a condition, the implied branch has zero 110: length, so we don't add anything. This covers the DEFINE "condition" 111: automatically. */ 112: 113: cs = cc + GET(cc, 1); 114: if (*cs != OP_ALT) 115: { 116: cc = cs + 1 + LINK_SIZE; 117: break; 118: } 119: 120: /* Otherwise we can fall through and treat it the same as any other 121: subpattern. */ 122: 123: case OP_CBRA: 124: case OP_SCBRA: 125: case OP_BRA: 126: case OP_SBRA: 127: case OP_CBRAPOS: 128: case OP_SCBRAPOS: 129: case OP_BRAPOS: 130: case OP_SBRAPOS: 131: case OP_ONCE: 132: case OP_ONCE_NC: 133: d = find_minlength(re, cc, startcode, options, recurse_depth); 134: if (d < 0) return d; 135: branchlength += d; 136: do cc += GET(cc, 1); while (*cc == OP_ALT); 137: cc += 1 + LINK_SIZE; 138: break; 139: 140: /* ACCEPT makes things far too complicated; we have to give up. */ 141: 142: case OP_ACCEPT: 143: case OP_ASSERT_ACCEPT: 144: return -1; 145: 146: /* Reached end of a branch; if it's a ket it is the end of a nested 147: call. If it's ALT it is an alternation in a nested call. If it is END it's 148: the end of the outer call. All can be handled by the same code. If an 149: ACCEPT was previously encountered, use the length that was in force at that 150: time, and pass back the shortest ACCEPT length. */ 151: 152: case OP_ALT: 153: case OP_KET: 154: case OP_KETRMAX: 155: case OP_KETRMIN: 156: case OP_KETRPOS: 157: case OP_END: 158: if (length < 0 || (!had_recurse && branchlength < length)) 159: length = branchlength; 160: if (op != OP_ALT) return length; 161: cc += 1 + LINK_SIZE; 162: branchlength = 0; 163: had_recurse = FALSE; 164: break; 165: 166: /* Skip over assertive subpatterns */ 167: 168: case OP_ASSERT: 169: case OP_ASSERT_NOT: 170: case OP_ASSERTBACK: 171: case OP_ASSERTBACK_NOT: 172: do cc += GET(cc, 1); while (*cc == OP_ALT); 173: /* Fall through */ 174: 175: /* Skip over things that don't match chars */ 176: 177: case OP_REVERSE: 178: case OP_CREF: 179: case OP_DNCREF: 180: case OP_RREF: 181: case OP_DNRREF: 182: case OP_DEF: 183: case OP_CALLOUT: 184: case OP_SOD: 185: case OP_SOM: 186: case OP_EOD: 187: case OP_EODN: 188: case OP_CIRC: 189: case OP_CIRCM: 190: case OP_DOLL: 191: case OP_DOLLM: 192: case OP_NOT_WORD_BOUNDARY: 193: case OP_WORD_BOUNDARY: 194: cc += PRIV(OP_lengths)[*cc]; 195: break; 196: 197: /* Skip over a subpattern that has a {0} or {0,x} quantifier */ 198: 199: case OP_BRAZERO: 200: case OP_BRAMINZERO: 201: case OP_BRAPOSZERO: 202: case OP_SKIPZERO: 203: cc += PRIV(OP_lengths)[*cc]; 204: do cc += GET(cc, 1); while (*cc == OP_ALT); 205: cc += 1 + LINK_SIZE; 206: break; 207: 208: /* Handle literal characters and + repetitions */ 209: 210: case OP_CHAR: 211: case OP_CHARI: 212: case OP_NOT: 213: case OP_NOTI: 214: case OP_PLUS: 215: case OP_PLUSI: 216: case OP_MINPLUS: 217: case OP_MINPLUSI: 218: case OP_POSPLUS: 219: case OP_POSPLUSI: 220: case OP_NOTPLUS: 221: case OP_NOTPLUSI: 222: case OP_NOTMINPLUS: 223: case OP_NOTMINPLUSI: 224: case OP_NOTPOSPLUS: 225: case OP_NOTPOSPLUSI: 226: branchlength++; 227: cc += 2; 228: #ifdef SUPPORT_UTF 229: if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); 230: #endif 231: break; 232: 233: case OP_TYPEPLUS: 234: case OP_TYPEMINPLUS: 235: case OP_TYPEPOSPLUS: 236: branchlength++; 237: cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2; 238: break; 239: 240: /* Handle exact repetitions. The count is already in characters, but we 241: need to skip over a multibyte character in UTF8 mode. */ 242: 243: case OP_EXACT: 244: case OP_EXACTI: 245: case OP_NOTEXACT: 246: case OP_NOTEXACTI: 247: branchlength += GET2(cc,1); 248: cc += 2 + IMM2_SIZE; 249: #ifdef SUPPORT_UTF 250: if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); 251: #endif 252: break; 253: 254: case OP_TYPEEXACT: 255: branchlength += GET2(cc,1); 256: cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP 257: || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0); 258: break; 259: 260: /* Handle single-char non-literal matchers */ 261: 262: case OP_PROP: 263: case OP_NOTPROP: 264: cc += 2; 265: /* Fall through */ 266: 267: case OP_NOT_DIGIT: 268: case OP_DIGIT: 269: case OP_NOT_WHITESPACE: 270: case OP_WHITESPACE: 271: case OP_NOT_WORDCHAR: 272: case OP_WORDCHAR: 273: case OP_ANY: 274: case OP_ALLANY: 275: case OP_EXTUNI: 276: case OP_HSPACE: 277: case OP_NOT_HSPACE: 278: case OP_VSPACE: 279: case OP_NOT_VSPACE: 280: branchlength++; 281: cc++; 282: break; 283: 284: /* "Any newline" might match two characters, but it also might match just 285: one. */ 286: 287: case OP_ANYNL: 288: branchlength += 1; 289: cc++; 290: break; 291: 292: /* The single-byte matcher means we can't proceed in UTF-8 mode. (In 293: non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever 294: appear, but leave the code, just in case.) */ 295: 296: case OP_ANYBYTE: 297: #ifdef SUPPORT_UTF 298: if (utf) return -1; 299: #endif 300: branchlength++; 301: cc++; 302: break; 303: 304: /* For repeated character types, we have to test for \p and \P, which have 305: an extra two bytes of parameters. */ 306: 307: case OP_TYPESTAR: 308: case OP_TYPEMINSTAR: 309: case OP_TYPEQUERY: 310: case OP_TYPEMINQUERY: 311: case OP_TYPEPOSSTAR: 312: case OP_TYPEPOSQUERY: 313: if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2; 314: cc += PRIV(OP_lengths)[op]; 315: break; 316: 317: case OP_TYPEUPTO: 318: case OP_TYPEMINUPTO: 319: case OP_TYPEPOSUPTO: 320: if (cc[1 + IMM2_SIZE] == OP_PROP 321: || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2; 322: cc += PRIV(OP_lengths)[op]; 323: break; 324: 325: /* Check a class for variable quantification */ 326: 327: case OP_CLASS: 328: case OP_NCLASS: 329: #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32 330: case OP_XCLASS: 331: /* The original code caused an unsigned overflow in 64 bit systems, 332: so now we use a conditional statement. */ 333: if (op == OP_XCLASS) 334: cc += GET(cc, 1); 335: else 336: cc += PRIV(OP_lengths)[OP_CLASS]; 337: #else 338: cc += PRIV(OP_lengths)[OP_CLASS]; 339: #endif 340: 341: switch (*cc) 342: { 343: case OP_CRPLUS: 344: case OP_CRMINPLUS: 345: case OP_CRPOSPLUS: 346: branchlength++; 347: /* Fall through */ 348: 349: case OP_CRSTAR: 350: case OP_CRMINSTAR: 351: case OP_CRQUERY: 352: case OP_CRMINQUERY: 353: case OP_CRPOSSTAR: 354: case OP_CRPOSQUERY: 355: cc++; 356: break; 357: 358: case OP_CRRANGE: 359: case OP_CRMINRANGE: 360: case OP_CRPOSRANGE: 361: branchlength += GET2(cc,1); 362: cc += 1 + 2 * IMM2_SIZE; 363: break; 364: 365: default: 366: branchlength++; 367: break; 368: } 369: break; 370: 371: /* Backreferences and subroutine calls are treated in the same way: we find 372: the minimum length for the subpattern. A recursion, however, causes an 373: a flag to be set that causes the length of this branch to be ignored. The 374: logic is that a recursion can only make sense if there is another 375: alternation that stops the recursing. That will provide the minimum length 376: (when no recursion happens). A backreference within the group that it is 377: referencing behaves in the same way. 378: 379: If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket 380: matches an empty string (by default it causes a matching failure), so in 381: that case we must set the minimum length to zero. */ 382: 383: case OP_DNREF: /* Duplicate named pattern back reference */ 384: case OP_DNREFI: 385: if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) 386: { 387: int count = GET2(cc, 1+IMM2_SIZE); 388: pcre_uchar *slot = (pcre_uchar *)re + 389: re->name_table_offset + GET2(cc, 1) * re->name_entry_size; 390: d = INT_MAX; 391: while (count-- > 0) 392: { 393: ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0)); 394: if (cs == NULL) return -2; 395: do ce += GET(ce, 1); while (*ce == OP_ALT); 396: if (cc > cs && cc < ce) 397: { 398: d = 0; 399: had_recurse = TRUE; 400: break; 401: } 402: else 403: { 404: int dd = find_minlength(re, cs, startcode, options, recurse_depth); 405: if (dd < d) d = dd; 406: } 407: slot += re->name_entry_size; 408: } 409: } 410: else d = 0; 411: cc += 1 + 2*IMM2_SIZE; 412: goto REPEAT_BACK_REFERENCE; 413: 414: case OP_REF: /* Single back reference */ 415: case OP_REFI: 416: if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) 417: { 418: ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1)); 419: if (cs == NULL) return -2; 420: do ce += GET(ce, 1); while (*ce == OP_ALT); 421: if (cc > cs && cc < ce) 422: { 423: d = 0; 424: had_recurse = TRUE; 425: } 426: else 427: { 428: d = find_minlength(re, cs, startcode, options, recurse_depth); 429: } 430: } 431: else d = 0; 432: cc += 1 + IMM2_SIZE; 433: 434: /* Handle repeated back references */ 435: 436: REPEAT_BACK_REFERENCE: 437: switch (*cc) 438: { 439: case OP_CRSTAR: 440: case OP_CRMINSTAR: 441: case OP_CRQUERY: 442: case OP_CRMINQUERY: 443: case OP_CRPOSSTAR: 444: case OP_CRPOSQUERY: 445: min = 0; 446: cc++; 447: break; 448: 449: case OP_CRPLUS: 450: case OP_CRMINPLUS: 451: case OP_CRPOSPLUS: 452: min = 1; 453: cc++; 454: break; 455: 456: case OP_CRRANGE: 457: case OP_CRMINRANGE: 458: case OP_CRPOSRANGE: 459: min = GET2(cc, 1); 460: cc += 1 + 2 * IMM2_SIZE; 461: break; 462: 463: default: 464: min = 1; 465: break; 466: } 467: 468: branchlength += min * d; 469: break; 470: 471: /* We can easily detect direct recursion, but not mutual recursion. This is 472: caught by a recursion depth count. */ 473: 474: case OP_RECURSE: 475: cs = ce = (pcre_uchar *)startcode + GET(cc, 1); 476: do ce += GET(ce, 1); while (*ce == OP_ALT); 477: if ((cc > cs && cc < ce) || recurse_depth > 10) 478: had_recurse = TRUE; 479: else 480: { 481: branchlength += find_minlength(re, cs, startcode, options, 482: recurse_depth + 1); 483: } 484: cc += 1 + LINK_SIZE; 485: break; 486: 487: /* Anything else does not or need not match a character. We can get the 488: item's length from the table, but for those that can match zero occurrences 489: of a character, we must take special action for UTF-8 characters. As it 490: happens, the "NOT" versions of these opcodes are used at present only for 491: ASCII characters, so they could be omitted from this list. However, in 492: future that may change, so we include them here so as not to leave a 493: gotcha for a future maintainer. */ 494: 495: case OP_UPTO: 496: case OP_UPTOI: 497: case OP_NOTUPTO: 498: case OP_NOTUPTOI: 499: case OP_MINUPTO: 500: case OP_MINUPTOI: 501: case OP_NOTMINUPTO: 502: case OP_NOTMINUPTOI: 503: case OP_POSUPTO: 504: case OP_POSUPTOI: 505: case OP_NOTPOSUPTO: 506: case OP_NOTPOSUPTOI: 507: 508: case OP_STAR: 509: case OP_STARI: 510: case OP_NOTSTAR: 511: case OP_NOTSTARI: 512: case OP_MINSTAR: 513: case OP_MINSTARI: 514: case OP_NOTMINSTAR: 515: case OP_NOTMINSTARI: 516: case OP_POSSTAR: 517: case OP_POSSTARI: 518: case OP_NOTPOSSTAR: 519: case OP_NOTPOSSTARI: 520: 521: case OP_QUERY: 522: case OP_QUERYI: 523: case OP_NOTQUERY: 524: case OP_NOTQUERYI: 525: case OP_MINQUERY: 526: case OP_MINQUERYI: 527: case OP_NOTMINQUERY: 528: case OP_NOTMINQUERYI: 529: case OP_POSQUERY: 530: case OP_POSQUERYI: 531: case OP_NOTPOSQUERY: 532: case OP_NOTPOSQUERYI: 533: 534: cc += PRIV(OP_lengths)[op]; 535: #ifdef SUPPORT_UTF 536: if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); 537: #endif 538: break; 539: 540: /* Skip these, but we need to add in the name length. */ 541: 542: case OP_MARK: 543: case OP_PRUNE_ARG: 544: case OP_SKIP_ARG: 545: case OP_THEN_ARG: 546: cc += PRIV(OP_lengths)[op] + cc[1]; 547: break; 548: 549: /* The remaining opcodes are just skipped over. */ 550: 551: case OP_CLOSE: 552: case OP_COMMIT: 553: case OP_FAIL: 554: case OP_PRUNE: 555: case OP_SET_SOM: 556: case OP_SKIP: 557: case OP_THEN: 558: cc += PRIV(OP_lengths)[op]; 559: break; 560: 561: /* This should not occur: we list all opcodes explicitly so that when 562: new ones get added they are properly considered. */ 563: 564: default: 565: return -3; 566: } 567: } 568: /* Control never gets here */ 569: } 570: 571: 572: 573: /************************************************* 574: * Set a bit and maybe its alternate case * 575: *************************************************/ 576: 577: /* Given a character, set its first byte's bit in the table, and also the 578: corresponding bit for the other version of a letter if we are caseless. In 579: UTF-8 mode, for characters greater than 127, we can only do the caseless thing 580: when Unicode property support is available. 581: 582: Arguments: 583: start_bits points to the bit map 584: p points to the character 585: caseless the caseless flag 586: cd the block with char table pointers 587: utf TRUE for UTF-8 / UTF-16 / UTF-32 mode 588: 589: Returns: pointer after the character 590: */ 591: 592: static const pcre_uchar * 593: set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless, 594: compile_data *cd, BOOL utf) 595: { 596: pcre_uint32 c = *p; 597: 598: #ifdef COMPILE_PCRE8 599: SET_BIT(c); 600: 601: #ifdef SUPPORT_UTF 602: if (utf && c > 127) 603: { 604: GETCHARINC(c, p); 605: #ifdef SUPPORT_UCP 606: if (caseless) 607: { 608: pcre_uchar buff[6]; 609: c = UCD_OTHERCASE(c); 610: (void)PRIV(ord2utf)(c, buff); 611: SET_BIT(buff[0]); 612: } 613: #endif /* Not SUPPORT_UCP */ 614: return p; 615: } 616: #else /* Not SUPPORT_UTF */ 617: (void)(utf); /* Stops warning for unused parameter */ 618: #endif /* SUPPORT_UTF */ 619: 620: /* Not UTF-8 mode, or character is less than 127. */ 621: 622: if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]); 623: return p + 1; 624: #endif /* COMPILE_PCRE8 */ 625: 626: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 627: if (c > 0xff) 628: { 629: c = 0xff; 630: caseless = FALSE; 631: } 632: SET_BIT(c); 633: 634: #ifdef SUPPORT_UTF 635: if (utf && c > 127) 636: { 637: GETCHARINC(c, p); 638: #ifdef SUPPORT_UCP 639: if (caseless) 640: { 641: c = UCD_OTHERCASE(c); 642: if (c > 0xff) 643: c = 0xff; 644: SET_BIT(c); 645: } 646: #endif /* SUPPORT_UCP */ 647: return p; 648: } 649: #else /* Not SUPPORT_UTF */ 650: (void)(utf); /* Stops warning for unused parameter */ 651: #endif /* SUPPORT_UTF */ 652: 653: if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]); 654: return p + 1; 655: #endif 656: } 657: 658: 659: 660: /************************************************* 661: * Set bits for a positive character type * 662: *************************************************/ 663: 664: /* This function sets starting bits for a character type. In UTF-8 mode, we can 665: only do a direct setting for bytes less than 128, as otherwise there can be 666: confusion with bytes in the middle of UTF-8 characters. In a "traditional" 667: environment, the tables will only recognize ASCII characters anyway, but in at 668: least one Windows environment, some higher bytes bits were set in the tables. 669: So we deal with that case by considering the UTF-8 encoding. 670: 671: Arguments: 672: start_bits the starting bitmap 673: cbit type the type of character wanted 674: table_limit 32 for non-UTF-8; 16 for UTF-8 675: cd the block with char table pointers 676: 677: Returns: nothing 678: */ 679: 680: static void 681: set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit, 682: compile_data *cd) 683: { 684: register pcre_uint32 c; 685: for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type]; 686: #if defined SUPPORT_UTF && defined COMPILE_PCRE8 687: if (table_limit == 32) return; 688: for (c = 128; c < 256; c++) 689: { 690: if ((cd->cbits[c/8] & (1 << (c&7))) != 0) 691: { 692: pcre_uchar buff[6]; 693: (void)PRIV(ord2utf)(c, buff); 694: SET_BIT(buff[0]); 695: } 696: } 697: #endif 698: } 699: 700: 701: /************************************************* 702: * Set bits for a negative character type * 703: *************************************************/ 704: 705: /* This function sets starting bits for a negative character type such as \D. 706: In UTF-8 mode, we can only do a direct setting for bytes less than 128, as 707: otherwise there can be confusion with bytes in the middle of UTF-8 characters. 708: Unlike in the positive case, where we can set appropriate starting bits for 709: specific high-valued UTF-8 characters, in this case we have to set the bits for 710: all high-valued characters. The lowest is 0xc2, but we overkill by starting at 711: 0xc0 (192) for simplicity. 712: 713: Arguments: 714: start_bits the starting bitmap 715: cbit type the type of character wanted 716: table_limit 32 for non-UTF-8; 16 for UTF-8 717: cd the block with char table pointers 718: 719: Returns: nothing 720: */ 721: 722: static void 723: set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit, 724: compile_data *cd) 725: { 726: register pcre_uint32 c; 727: for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type]; 728: #if defined SUPPORT_UTF && defined COMPILE_PCRE8 729: if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff; 730: #endif 731: } 732: 733: 734: 735: /************************************************* 736: * Create bitmap of starting bytes * 737: *************************************************/ 738: 739: /* This function scans a compiled unanchored expression recursively and 740: attempts to build a bitmap of the set of possible starting bytes. As time goes 741: by, we may be able to get more clever at doing this. The SSB_CONTINUE return is 742: useful for parenthesized groups in patterns such as (a*)b where the group 743: provides some optional starting bytes but scanning must continue at the outer 744: level to find at least one mandatory byte. At the outermost level, this 745: function fails unless the result is SSB_DONE. 746: 747: Arguments: 748: code points to an expression 749: start_bits points to a 32-byte table, initialized to 0 750: utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode 751: cd the block with char table pointers 752: 753: Returns: SSB_FAIL => Failed to find any starting bytes 754: SSB_DONE => Found mandatory starting bytes 755: SSB_CONTINUE => Found optional starting bytes 756: SSB_UNKNOWN => Hit an unrecognized opcode 757: */ 758: 759: static int 760: set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf, 761: compile_data *cd) 762: { 763: register pcre_uint32 c; 764: int yield = SSB_DONE; 765: #if defined SUPPORT_UTF && defined COMPILE_PCRE8 766: int table_limit = utf? 16:32; 767: #else 768: int table_limit = 32; 769: #endif 770: 771: #if 0 772: /* ========================================================================= */ 773: /* The following comment and code was inserted in January 1999. In May 2006, 774: when it was observed to cause compiler warnings about unused values, I took it 775: out again. If anybody is still using OS/2, they will have to put it back 776: manually. */ 777: 778: /* This next statement and the later reference to dummy are here in order to 779: trick the optimizer of the IBM C compiler for OS/2 into generating correct 780: code. Apparently IBM isn't going to fix the problem, and we would rather not 781: disable optimization (in this module it actually makes a big difference, and 782: the pcre module can use all the optimization it can get). */ 783: 784: volatile int dummy; 785: /* ========================================================================= */ 786: #endif 787: 788: do 789: { 790: BOOL try_next = TRUE; 791: const pcre_uchar *tcode = code + 1 + LINK_SIZE; 792: 793: if (*code == OP_CBRA || *code == OP_SCBRA || 794: *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE; 795: 796: while (try_next) /* Loop for items in this branch */ 797: { 798: int rc; 799: 800: switch(*tcode) 801: { 802: /* If we reach something we don't understand, it means a new opcode has 803: been created that hasn't been added to this code. Hopefully this problem 804: will be discovered during testing. */ 805: 806: default: 807: return SSB_UNKNOWN; 808: 809: /* Fail for a valid opcode that implies no starting bits. */ 810: 811: case OP_ACCEPT: 812: case OP_ASSERT_ACCEPT: 813: case OP_ALLANY: 814: case OP_ANY: 815: case OP_ANYBYTE: 816: case OP_CIRC: 817: case OP_CIRCM: 818: case OP_CLOSE: 819: case OP_COMMIT: 820: case OP_COND: 821: case OP_CREF: 822: case OP_DEF: 823: case OP_DNCREF: 824: case OP_DNREF: 825: case OP_DNREFI: 826: case OP_DNRREF: 827: case OP_DOLL: 828: case OP_DOLLM: 829: case OP_END: 830: case OP_EOD: 831: case OP_EODN: 832: case OP_EXTUNI: 833: case OP_FAIL: 834: case OP_MARK: 835: case OP_NOT: 836: case OP_NOTEXACT: 837: case OP_NOTEXACTI: 838: case OP_NOTI: 839: case OP_NOTMINPLUS: 840: case OP_NOTMINPLUSI: 841: case OP_NOTMINQUERY: 842: case OP_NOTMINQUERYI: 843: case OP_NOTMINSTAR: 844: case OP_NOTMINSTARI: 845: case OP_NOTMINUPTO: 846: case OP_NOTMINUPTOI: 847: case OP_NOTPLUS: 848: case OP_NOTPLUSI: 849: case OP_NOTPOSPLUS: 850: case OP_NOTPOSPLUSI: 851: case OP_NOTPOSQUERY: 852: case OP_NOTPOSQUERYI: 853: case OP_NOTPOSSTAR: 854: case OP_NOTPOSSTARI: 855: case OP_NOTPOSUPTO: 856: case OP_NOTPOSUPTOI: 857: case OP_NOTPROP: 858: case OP_NOTQUERY: 859: case OP_NOTQUERYI: 860: case OP_NOTSTAR: 861: case OP_NOTSTARI: 862: case OP_NOTUPTO: 863: case OP_NOTUPTOI: 864: case OP_NOT_HSPACE: 865: case OP_NOT_VSPACE: 866: case OP_PROP: 867: case OP_PRUNE: 868: case OP_PRUNE_ARG: 869: case OP_RECURSE: 870: case OP_REF: 871: case OP_REFI: 872: case OP_REVERSE: 873: case OP_RREF: 874: case OP_SCOND: 875: case OP_SET_SOM: 876: case OP_SKIP: 877: case OP_SKIP_ARG: 878: case OP_SOD: 879: case OP_SOM: 880: case OP_THEN: 881: case OP_THEN_ARG: 882: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 883: case OP_XCLASS: 884: #endif 885: return SSB_FAIL; 886: 887: /* We can ignore word boundary tests. */ 888: 889: case OP_WORD_BOUNDARY: 890: case OP_NOT_WORD_BOUNDARY: 891: tcode++; 892: break; 893: 894: /* If we hit a bracket or a positive lookahead assertion, recurse to set 895: bits from within the subpattern. If it can't find anything, we have to 896: give up. If it finds some mandatory character(s), we are done for this 897: branch. Otherwise, carry on scanning after the subpattern. */ 898: 899: case OP_BRA: 900: case OP_SBRA: 901: case OP_CBRA: 902: case OP_SCBRA: 903: case OP_BRAPOS: 904: case OP_SBRAPOS: 905: case OP_CBRAPOS: 906: case OP_SCBRAPOS: 907: case OP_ONCE: 908: case OP_ONCE_NC: 909: case OP_ASSERT: 910: rc = set_start_bits(tcode, start_bits, utf, cd); 911: if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; 912: if (rc == SSB_DONE) try_next = FALSE; else 913: { 914: do tcode += GET(tcode, 1); while (*tcode == OP_ALT); 915: tcode += 1 + LINK_SIZE; 916: } 917: break; 918: 919: /* If we hit ALT or KET, it means we haven't found anything mandatory in 920: this branch, though we might have found something optional. For ALT, we 921: continue with the next alternative, but we have to arrange that the final 922: result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET, 923: return SSB_CONTINUE: if this is the top level, that indicates failure, 924: but after a nested subpattern, it causes scanning to continue. */ 925: 926: case OP_ALT: 927: yield = SSB_CONTINUE; 928: try_next = FALSE; 929: break; 930: 931: case OP_KET: 932: case OP_KETRMAX: 933: case OP_KETRMIN: 934: case OP_KETRPOS: 935: return SSB_CONTINUE; 936: 937: /* Skip over callout */ 938: 939: case OP_CALLOUT: 940: tcode += 2 + 2*LINK_SIZE; 941: break; 942: 943: /* Skip over lookbehind and negative lookahead assertions */ 944: 945: case OP_ASSERT_NOT: 946: case OP_ASSERTBACK: 947: case OP_ASSERTBACK_NOT: 948: do tcode += GET(tcode, 1); while (*tcode == OP_ALT); 949: tcode += 1 + LINK_SIZE; 950: break; 951: 952: /* BRAZERO does the bracket, but carries on. */ 953: 954: case OP_BRAZERO: 955: case OP_BRAMINZERO: 956: case OP_BRAPOSZERO: 957: rc = set_start_bits(++tcode, start_bits, utf, cd); 958: if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; 959: /* ========================================================================= 960: See the comment at the head of this function concerning the next line, 961: which was an old fudge for the benefit of OS/2. 962: dummy = 1; 963: ========================================================================= */ 964: do tcode += GET(tcode,1); while (*tcode == OP_ALT); 965: tcode += 1 + LINK_SIZE; 966: break; 967: 968: /* SKIPZERO skips the bracket. */ 969: 970: case OP_SKIPZERO: 971: tcode++; 972: do tcode += GET(tcode,1); while (*tcode == OP_ALT); 973: tcode += 1 + LINK_SIZE; 974: break; 975: 976: /* Single-char * or ? sets the bit and tries the next item */ 977: 978: case OP_STAR: 979: case OP_MINSTAR: 980: case OP_POSSTAR: 981: case OP_QUERY: 982: case OP_MINQUERY: 983: case OP_POSQUERY: 984: tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf); 985: break; 986: 987: case OP_STARI: 988: case OP_MINSTARI: 989: case OP_POSSTARI: 990: case OP_QUERYI: 991: case OP_MINQUERYI: 992: case OP_POSQUERYI: 993: tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf); 994: break; 995: 996: /* Single-char upto sets the bit and tries the next */ 997: 998: case OP_UPTO: 999: case OP_MINUPTO: 1000: case OP_POSUPTO: 1001: tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf); 1002: break; 1003: 1004: case OP_UPTOI: 1005: case OP_MINUPTOI: 1006: case OP_POSUPTOI: 1007: tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf); 1008: break; 1009: 1010: /* At least one single char sets the bit and stops */ 1011: 1012: case OP_EXACT: 1013: tcode += IMM2_SIZE; 1014: /* Fall through */ 1015: case OP_CHAR: 1016: case OP_PLUS: 1017: case OP_MINPLUS: 1018: case OP_POSPLUS: 1019: (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf); 1020: try_next = FALSE; 1021: break; 1022: 1023: case OP_EXACTI: 1024: tcode += IMM2_SIZE; 1025: /* Fall through */ 1026: case OP_CHARI: 1027: case OP_PLUSI: 1028: case OP_MINPLUSI: 1029: case OP_POSPLUSI: 1030: (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf); 1031: try_next = FALSE; 1032: break; 1033: 1034: /* Special spacing and line-terminating items. These recognize specific 1035: lists of characters. The difference between VSPACE and ANYNL is that the 1036: latter can match the two-character CRLF sequence, but that is not 1037: relevant for finding the first character, so their code here is 1038: identical. */ 1039: 1040: case OP_HSPACE: 1041: SET_BIT(CHAR_HT); 1042: SET_BIT(CHAR_SPACE); 1043: #ifdef SUPPORT_UTF 1044: if (utf) 1045: { 1046: #ifdef COMPILE_PCRE8 1047: SET_BIT(0xC2); /* For U+00A0 */ 1048: SET_BIT(0xE1); /* For U+1680, U+180E */ 1049: SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ 1050: SET_BIT(0xE3); /* For U+3000 */ 1051: #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32 1052: SET_BIT(0xA0); 1053: SET_BIT(0xFF); /* For characters > 255 */ 1054: #endif /* COMPILE_PCRE[8|16|32] */ 1055: } 1056: else 1057: #endif /* SUPPORT_UTF */ 1058: { 1059: #ifndef EBCDIC 1060: SET_BIT(0xA0); 1061: #endif /* Not EBCDIC */ 1062: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 1063: SET_BIT(0xFF); /* For characters > 255 */ 1064: #endif /* COMPILE_PCRE[16|32] */ 1065: } 1066: try_next = FALSE; 1067: break; 1068: 1069: case OP_ANYNL: 1070: case OP_VSPACE: 1071: SET_BIT(CHAR_LF); 1072: SET_BIT(CHAR_VT); 1073: SET_BIT(CHAR_FF); 1074: SET_BIT(CHAR_CR); 1075: #ifdef SUPPORT_UTF 1076: if (utf) 1077: { 1078: #ifdef COMPILE_PCRE8 1079: SET_BIT(0xC2); /* For U+0085 */ 1080: SET_BIT(0xE2); /* For U+2028, U+2029 */ 1081: #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32 1082: SET_BIT(CHAR_NEL); 1083: SET_BIT(0xFF); /* For characters > 255 */ 1084: #endif /* COMPILE_PCRE[8|16|32] */ 1085: } 1086: else 1087: #endif /* SUPPORT_UTF */ 1088: { 1089: SET_BIT(CHAR_NEL); 1090: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 1091: SET_BIT(0xFF); /* For characters > 255 */ 1092: #endif 1093: } 1094: try_next = FALSE; 1095: break; 1096: 1097: /* Single character types set the bits and stop. Note that if PCRE_UCP 1098: is set, we do not see these op codes because \d etc are converted to 1099: properties. Therefore, these apply in the case when only characters less 1100: than 256 are recognized to match the types. */ 1101: 1102: case OP_NOT_DIGIT: 1103: set_nottype_bits(start_bits, cbit_digit, table_limit, cd); 1104: try_next = FALSE; 1105: break; 1106: 1107: case OP_DIGIT: 1108: set_type_bits(start_bits, cbit_digit, table_limit, cd); 1109: try_next = FALSE; 1110: break; 1111: 1112: /* The cbit_space table has vertical tab as whitespace; we have to 1113: ensure it is set as not whitespace. Luckily, the code value is the same 1114: (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate bit. */ 1115: 1116: case OP_NOT_WHITESPACE: 1117: set_nottype_bits(start_bits, cbit_space, table_limit, cd); 1118: start_bits[1] |= 0x08; 1119: try_next = FALSE; 1120: break; 1121: 1122: /* The cbit_space table has vertical tab as whitespace; we have to not 1123: set it from the table. Luckily, the code value is the same (0x0b) in 1124: ASCII and EBCDIC, so we can just adjust the appropriate bit. */ 1125: 1126: case OP_WHITESPACE: 1127: c = start_bits[1]; /* Save in case it was already set */ 1128: set_type_bits(start_bits, cbit_space, table_limit, cd); 1129: start_bits[1] = (start_bits[1] & ~0x08) | c; 1130: try_next = FALSE; 1131: break; 1132: 1133: case OP_NOT_WORDCHAR: 1134: set_nottype_bits(start_bits, cbit_word, table_limit, cd); 1135: try_next = FALSE; 1136: break; 1137: 1138: case OP_WORDCHAR: 1139: set_type_bits(start_bits, cbit_word, table_limit, cd); 1140: try_next = FALSE; 1141: break; 1142: 1143: /* One or more character type fudges the pointer and restarts, knowing 1144: it will hit a single character type and stop there. */ 1145: 1146: case OP_TYPEPLUS: 1147: case OP_TYPEMINPLUS: 1148: case OP_TYPEPOSPLUS: 1149: tcode++; 1150: break; 1151: 1152: case OP_TYPEEXACT: 1153: tcode += 1 + IMM2_SIZE; 1154: break; 1155: 1156: /* Zero or more repeats of character types set the bits and then 1157: try again. */ 1158: 1159: case OP_TYPEUPTO: 1160: case OP_TYPEMINUPTO: 1161: case OP_TYPEPOSUPTO: 1162: tcode += IMM2_SIZE; /* Fall through */ 1163: 1164: case OP_TYPESTAR: 1165: case OP_TYPEMINSTAR: 1166: case OP_TYPEPOSSTAR: 1167: case OP_TYPEQUERY: 1168: case OP_TYPEMINQUERY: 1169: case OP_TYPEPOSQUERY: 1170: switch(tcode[1]) 1171: { 1172: default: 1173: case OP_ANY: 1174: case OP_ALLANY: 1175: return SSB_FAIL; 1176: 1177: case OP_HSPACE: 1178: SET_BIT(CHAR_HT); 1179: SET_BIT(CHAR_SPACE); 1180: #ifdef SUPPORT_UTF 1181: if (utf) 1182: { 1183: #ifdef COMPILE_PCRE8 1184: SET_BIT(0xC2); /* For U+00A0 */ 1185: SET_BIT(0xE1); /* For U+1680, U+180E */ 1186: SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ 1187: SET_BIT(0xE3); /* For U+3000 */ 1188: #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32 1189: SET_BIT(0xA0); 1190: SET_BIT(0xFF); /* For characters > 255 */ 1191: #endif /* COMPILE_PCRE[8|16|32] */ 1192: } 1193: else 1194: #endif /* SUPPORT_UTF */ 1195: #ifndef EBCDIC 1196: SET_BIT(0xA0); 1197: #endif /* Not EBCDIC */ 1198: break; 1199: 1200: case OP_ANYNL: 1201: case OP_VSPACE: 1202: SET_BIT(CHAR_LF); 1203: SET_BIT(CHAR_VT); 1204: SET_BIT(CHAR_FF); 1205: SET_BIT(CHAR_CR); 1206: #ifdef SUPPORT_UTF 1207: if (utf) 1208: { 1209: #ifdef COMPILE_PCRE8 1210: SET_BIT(0xC2); /* For U+0085 */ 1211: SET_BIT(0xE2); /* For U+2028, U+2029 */ 1212: #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32 1213: SET_BIT(CHAR_NEL); 1214: SET_BIT(0xFF); /* For characters > 255 */ 1215: #endif /* COMPILE_PCRE16 */ 1216: } 1217: else 1218: #endif /* SUPPORT_UTF */ 1219: SET_BIT(CHAR_NEL); 1220: break; 1221: 1222: case OP_NOT_DIGIT: 1223: set_nottype_bits(start_bits, cbit_digit, table_limit, cd); 1224: break; 1225: 1226: case OP_DIGIT: 1227: set_type_bits(start_bits, cbit_digit, table_limit, cd); 1228: break; 1229: 1230: /* The cbit_space table has vertical tab as whitespace; we no longer 1231: have to play fancy tricks because Perl added VT to its whitespace at 1232: release 5.18. PCRE added it at release 8.34. */ 1233: 1234: case OP_NOT_WHITESPACE: 1235: set_nottype_bits(start_bits, cbit_space, table_limit, cd); 1236: break; 1237: 1238: case OP_WHITESPACE: 1239: set_type_bits(start_bits, cbit_space, table_limit, cd); 1240: break; 1241: 1242: case OP_NOT_WORDCHAR: 1243: set_nottype_bits(start_bits, cbit_word, table_limit, cd); 1244: break; 1245: 1246: case OP_WORDCHAR: 1247: set_type_bits(start_bits, cbit_word, table_limit, cd); 1248: break; 1249: } 1250: 1251: tcode += 2; 1252: break; 1253: 1254: /* Character class where all the information is in a bit map: set the 1255: bits and either carry on or not, according to the repeat count. If it was 1256: a negative class, and we are operating with UTF-8 characters, any byte 1257: with a value >= 0xc4 is a potentially valid starter because it starts a 1258: character with a value > 255. */ 1259: 1260: case OP_NCLASS: 1261: #if defined SUPPORT_UTF && defined COMPILE_PCRE8 1262: if (utf) 1263: { 1264: start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */ 1265: memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */ 1266: } 1267: #endif 1268: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 1269: SET_BIT(0xFF); /* For characters > 255 */ 1270: #endif 1271: /* Fall through */ 1272: 1273: case OP_CLASS: 1274: { 1275: pcre_uint8 *map; 1276: tcode++; 1277: map = (pcre_uint8 *)tcode; 1278: 1279: /* In UTF-8 mode, the bits in a bit map correspond to character 1280: values, not to byte values. However, the bit map we are constructing is 1281: for byte values. So we have to do a conversion for characters whose 1282: value is > 127. In fact, there are only two possible starting bytes for 1283: characters in the range 128 - 255. */ 1284: 1285: #if defined SUPPORT_UTF && defined COMPILE_PCRE8 1286: if (utf) 1287: { 1288: for (c = 0; c < 16; c++) start_bits[c] |= map[c]; 1289: for (c = 128; c < 256; c++) 1290: { 1291: if ((map[c/8] && (1 << (c&7))) != 0) 1292: { 1293: int d = (c >> 6) | 0xc0; /* Set bit for this starter */ 1294: start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */ 1295: c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */ 1296: } 1297: } 1298: } 1299: else 1300: #endif 1301: { 1302: /* In non-UTF-8 mode, the two bit maps are completely compatible. */ 1303: for (c = 0; c < 32; c++) start_bits[c] |= map[c]; 1304: } 1305: 1306: /* Advance past the bit map, and act on what follows. For a zero 1307: minimum repeat, continue; otherwise stop processing. */ 1308: 1309: tcode += 32 / sizeof(pcre_uchar); 1310: switch (*tcode) 1311: { 1312: case OP_CRSTAR: 1313: case OP_CRMINSTAR: 1314: case OP_CRQUERY: 1315: case OP_CRMINQUERY: 1316: case OP_CRPOSSTAR: 1317: case OP_CRPOSQUERY: 1318: tcode++; 1319: break; 1320: 1321: case OP_CRRANGE: 1322: case OP_CRMINRANGE: 1323: case OP_CRPOSRANGE: 1324: if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE; 1325: else try_next = FALSE; 1326: break; 1327: 1328: default: 1329: try_next = FALSE; 1330: break; 1331: } 1332: } 1333: break; /* End of bitmap class handling */ 1334: 1335: } /* End of switch */ 1336: } /* End of try_next loop */ 1337: 1338: code += GET(code, 1); /* Advance to next branch */ 1339: } 1340: while (*code == OP_ALT); 1341: return yield; 1342: } 1343: 1344: 1345: 1346: 1347: 1348: /************************************************* 1349: * Study a compiled expression * 1350: *************************************************/ 1351: 1352: /* This function is handed a compiled expression that it must study to produce 1353: information that will speed up the matching. It returns a pcre[16]_extra block 1354: which then gets handed back to pcre_exec(). 1355: 1356: Arguments: 1357: re points to the compiled expression 1358: options contains option bits 1359: errorptr points to where to place error messages; 1360: set NULL unless error 1361: 1362: Returns: pointer to a pcre[16]_extra block, with study_data filled in and 1363: the appropriate flags set; 1364: NULL on error or if no optimization possible 1365: */ 1366: 1367: #if defined COMPILE_PCRE8 1368: PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION 1369: pcre_study(const pcre *external_re, int options, const char **errorptr) 1370: #elif defined COMPILE_PCRE16 1371: PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION 1372: pcre16_study(const pcre16 *external_re, int options, const char **errorptr) 1373: #elif defined COMPILE_PCRE32 1374: PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION 1375: pcre32_study(const pcre32 *external_re, int options, const char **errorptr) 1376: #endif 1377: { 1378: int min; 1379: BOOL bits_set = FALSE; 1380: pcre_uint8 start_bits[32]; 1381: PUBL(extra) *extra = NULL; 1382: pcre_study_data *study; 1383: const pcre_uint8 *tables; 1384: pcre_uchar *code; 1385: compile_data compile_block; 1386: const REAL_PCRE *re = (const REAL_PCRE *)external_re; 1387: 1388: 1389: *errorptr = NULL; 1390: 1391: if (re == NULL || re->magic_number != MAGIC_NUMBER) 1392: { 1393: *errorptr = "argument is not a compiled regular expression"; 1394: return NULL; 1395: } 1396: 1397: if ((re->flags & PCRE_MODE) == 0) 1398: { 1399: #if defined COMPILE_PCRE8 1400: *errorptr = "argument not compiled in 8 bit mode"; 1401: #elif defined COMPILE_PCRE16 1402: *errorptr = "argument not compiled in 16 bit mode"; 1403: #elif defined COMPILE_PCRE32 1404: *errorptr = "argument not compiled in 32 bit mode"; 1405: #endif 1406: return NULL; 1407: } 1408: 1409: if ((options & ~PUBLIC_STUDY_OPTIONS) != 0) 1410: { 1411: *errorptr = "unknown or incorrect option bit(s) set"; 1412: return NULL; 1413: } 1414: 1415: code = (pcre_uchar *)re + re->name_table_offset + 1416: (re->name_count * re->name_entry_size); 1417: 1418: /* For an anchored pattern, or an unanchored pattern that has a first char, or 1419: a multiline pattern that matches only at "line starts", there is no point in 1420: seeking a list of starting bytes. */ 1421: 1422: if ((re->options & PCRE_ANCHORED) == 0 && 1423: (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0) 1424: { 1425: int rc; 1426: 1427: /* Set the character tables in the block that is passed around */ 1428: 1429: tables = re->tables; 1430: 1431: #if defined COMPILE_PCRE8 1432: if (tables == NULL) 1433: (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, 1434: (void *)(&tables)); 1435: #elif defined COMPILE_PCRE16 1436: if (tables == NULL) 1437: (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, 1438: (void *)(&tables)); 1439: #elif defined COMPILE_PCRE32 1440: if (tables == NULL) 1441: (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, 1442: (void *)(&tables)); 1443: #endif 1444: 1445: compile_block.lcc = tables + lcc_offset; 1446: compile_block.fcc = tables + fcc_offset; 1447: compile_block.cbits = tables + cbits_offset; 1448: compile_block.ctypes = tables + ctypes_offset; 1449: 1450: /* See if we can find a fixed set of initial characters for the pattern. */ 1451: 1452: memset(start_bits, 0, 32 * sizeof(pcre_uint8)); 1453: rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0, 1454: &compile_block); 1455: bits_set = rc == SSB_DONE; 1456: if (rc == SSB_UNKNOWN) 1457: { 1458: *errorptr = "internal error: opcode not recognized"; 1459: return NULL; 1460: } 1461: } 1462: 1463: /* Find the minimum length of subject string. */ 1464: 1465: switch(min = find_minlength(re, code, code, re->options, 0)) 1466: { 1467: case -2: *errorptr = "internal error: missing capturing bracket"; return NULL; 1468: case -3: *errorptr = "internal error: opcode not recognized"; return NULL; 1469: default: break; 1470: } 1471: 1472: /* If a set of starting bytes has been identified, or if the minimum length is 1473: greater than zero, or if JIT optimization has been requested, or if 1474: PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a 1475: pcre_study_data block. The study data is put in the latter, which is pointed to 1476: by the former, which may also get additional data set later by the calling 1477: program. At the moment, the size of pcre_study_data is fixed. We nevertheless 1478: save it in a field for returning via the pcre_fullinfo() function so that if it 1479: becomes variable in the future, we don't have to change that code. */ 1480: 1481: if (bits_set || min > 0 || (options & ( 1482: #ifdef SUPPORT_JIT 1483: PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE | 1484: PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE | 1485: #endif 1486: PCRE_STUDY_EXTRA_NEEDED)) != 0) 1487: { 1488: extra = (PUBL(extra) *)(PUBL(malloc)) 1489: (sizeof(PUBL(extra)) + sizeof(pcre_study_data)); 1490: if (extra == NULL) 1491: { 1492: *errorptr = "failed to get memory"; 1493: return NULL; 1494: } 1495: 1496: study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra))); 1497: extra->flags = PCRE_EXTRA_STUDY_DATA; 1498: extra->study_data = study; 1499: 1500: study->size = sizeof(pcre_study_data); 1501: study->flags = 0; 1502: 1503: /* Set the start bits always, to avoid unset memory errors if the 1504: study data is written to a file, but set the flag only if any of the bits 1505: are set, to save time looking when none are. */ 1506: 1507: if (bits_set) 1508: { 1509: study->flags |= PCRE_STUDY_MAPPED; 1510: memcpy(study->start_bits, start_bits, sizeof(start_bits)); 1511: } 1512: else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8)); 1513: 1514: #ifdef PCRE_DEBUG 1515: if (bits_set) 1516: { 1517: pcre_uint8 *ptr = start_bits; 1518: int i; 1519: 1520: printf("Start bits:\n"); 1521: for (i = 0; i < 32; i++) 1522: printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n"); 1523: } 1524: #endif 1525: 1526: /* Always set the minlength value in the block, because the JIT compiler 1527: makes use of it. However, don't set the bit unless the length is greater than 1528: zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time 1529: checking the zero case. */ 1530: 1531: if (min > 0) 1532: { 1533: study->flags |= PCRE_STUDY_MINLEN; 1534: study->minlength = min; 1535: } 1536: else study->minlength = 0; 1537: 1538: /* If JIT support was compiled and requested, attempt the JIT compilation. 1539: If no starting bytes were found, and the minimum length is zero, and JIT 1540: compilation fails, abandon the extra block and return NULL, unless 1541: PCRE_STUDY_EXTRA_NEEDED is set. */ 1542: 1543: #ifdef SUPPORT_JIT 1544: extra->executable_jit = NULL; 1545: if ((options & PCRE_STUDY_JIT_COMPILE) != 0) 1546: PRIV(jit_compile)(re, extra, JIT_COMPILE); 1547: if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0) 1548: PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE); 1549: if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0) 1550: PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE); 1551: 1552: if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 && 1553: (options & PCRE_STUDY_EXTRA_NEEDED) == 0) 1554: { 1555: #if defined COMPILE_PCRE8 1556: pcre_free_study(extra); 1557: #elif defined COMPILE_PCRE16 1558: pcre16_free_study(extra); 1559: #elif defined COMPILE_PCRE32 1560: pcre32_free_study(extra); 1561: #endif 1562: extra = NULL; 1563: } 1564: #endif 1565: } 1566: 1567: return extra; 1568: } 1569: 1570: 1571: /************************************************* 1572: * Free the study data * 1573: *************************************************/ 1574: 1575: /* This function frees the memory that was obtained by pcre_study(). 1576: 1577: Argument: a pointer to the pcre[16]_extra block 1578: Returns: nothing 1579: */ 1580: 1581: #if defined COMPILE_PCRE8 1582: PCRE_EXP_DEFN void 1583: pcre_free_study(pcre_extra *extra) 1584: #elif defined COMPILE_PCRE16 1585: PCRE_EXP_DEFN void 1586: pcre16_free_study(pcre16_extra *extra) 1587: #elif defined COMPILE_PCRE32 1588: PCRE_EXP_DEFN void 1589: pcre32_free_study(pcre32_extra *extra) 1590: #endif 1591: { 1592: if (extra == NULL) 1593: return; 1594: #ifdef SUPPORT_JIT 1595: if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 && 1596: extra->executable_jit != NULL) 1597: PRIV(jit_free)(extra->executable_jit); 1598: #endif 1599: PUBL(free)(extra); 1600: } 1601: 1602: /* End of pcre_study.c */