Annotation of embedaddon/pcre/pcre_study.c, revision 1.1
1.1 ! misho 1: /*************************************************
! 2: * Perl-Compatible Regular Expressions *
! 3: *************************************************/
! 4:
! 5: /* PCRE is a library of functions to support regular expressions whose syntax
! 6: and semantics are as close as possible to those of the Perl 5 language.
! 7:
! 8: Written by Philip Hazel
! 9: Copyright (c) 1997-2010 University of Cambridge
! 10:
! 11: -----------------------------------------------------------------------------
! 12: Redistribution and use in source and binary forms, with or without
! 13: modification, are permitted provided that the following conditions are met:
! 14:
! 15: * Redistributions of source code must retain the above copyright notice,
! 16: this list of conditions and the following disclaimer.
! 17:
! 18: * Redistributions in binary form must reproduce the above copyright
! 19: notice, this list of conditions and the following disclaimer in the
! 20: documentation and/or other materials provided with the distribution.
! 21:
! 22: * Neither the name of the University of Cambridge nor the names of its
! 23: contributors may be used to endorse or promote products derived from
! 24: this software without specific prior written permission.
! 25:
! 26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
! 27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
! 28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
! 29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
! 30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! 31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! 32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! 33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! 34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! 35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! 36: POSSIBILITY OF SUCH DAMAGE.
! 37: -----------------------------------------------------------------------------
! 38: */
! 39:
! 40:
! 41: /* This module contains the external function pcre_study(), along with local
! 42: supporting functions. */
! 43:
! 44:
! 45: #ifdef HAVE_CONFIG_H
! 46: #include "config.h"
! 47: #endif
! 48:
! 49: #include "pcre_internal.h"
! 50:
/* Set the bit for byte value c in the bitmap named start_bits, which must be
in scope at the point of use. The argument and the whole expansion are
parenthesized so that an expression argument such as SET_BIT(x + 1) selects the
byte and the bit from the complete value. Note that c is evaluated twice, so it
must not have side effects. */

#define SET_BIT(c) (start_bits[(c)/8] |= (1 << ((c)&7)))

/* Returns from set_start_bits() */

enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
! 56:
! 57:
! 58:
! 59: /*************************************************
! 60: * Find the minimum subject length for a group *
! 61: *************************************************/
! 62:
! 63: /* Scan a parenthesized group and compute the minimum length of subject that
! 64: is needed to match it. This is a lower bound; it does not mean there is a
! 65: string of that length that matches. In UTF8 mode, the result is in characters
! 66: rather than bytes.
! 67:
! 68: Arguments:
! 69: code pointer to start of group (the bracket)
! 70: startcode pointer to start of the whole pattern
! 71: options the compiling options
! 72: int RECURSE depth
! 73:
! 74: Returns: the minimum length
! 75: -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
! 76: -2 internal error (missing capturing bracket)
! 77: -3 internal error (opcode not listed)
! 78: */
! 79:
! 80: static int
! 81: find_minlength(const uschar *code, const uschar *startcode, int options,
! 82: int recurse_depth)
! 83: {
! 84: int length = -1;
! 85: BOOL utf8 = (options & PCRE_UTF8) != 0;
! 86: BOOL had_recurse = FALSE;
! 87: register int branchlength = 0;
! 88: register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
! 89:
! 90: if (*code == OP_CBRA || *code == OP_SCBRA ||
! 91: *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += 2;
! 92:
! 93: /* Scan along the opcodes for this branch. If we get to the end of the
! 94: branch, check the length against that of the other branches. */
! 95:
! 96: for (;;)
! 97: {
! 98: int d, min;
! 99: uschar *cs, *ce;
! 100: register int op = *cc;
! 101:
! 102: switch (op)
! 103: {
! 104: case OP_COND:
! 105: case OP_SCOND:
! 106:
! 107: /* If there is only one branch in a condition, the implied branch has zero
! 108: length, so we don't add anything. This covers the DEFINE "condition"
! 109: automatically. */
! 110:
! 111: cs = cc + GET(cc, 1);
! 112: if (*cs != OP_ALT)
! 113: {
! 114: cc = cs + 1 + LINK_SIZE;
! 115: break;
! 116: }
! 117:
! 118: /* Otherwise we can fall through and treat it the same as any other
! 119: subpattern. */
! 120:
! 121: case OP_CBRA:
! 122: case OP_SCBRA:
! 123: case OP_BRA:
! 124: case OP_SBRA:
! 125: case OP_CBRAPOS:
! 126: case OP_SCBRAPOS:
! 127: case OP_BRAPOS:
! 128: case OP_SBRAPOS:
! 129: case OP_ONCE:
! 130: case OP_ONCE_NC:
! 131: d = find_minlength(cc, startcode, options, recurse_depth);
! 132: if (d < 0) return d;
! 133: branchlength += d;
! 134: do cc += GET(cc, 1); while (*cc == OP_ALT);
! 135: cc += 1 + LINK_SIZE;
! 136: break;
! 137:
! 138: /* ACCEPT makes things far too complicated; we have to give up. */
! 139:
! 140: case OP_ACCEPT:
! 141: case OP_ASSERT_ACCEPT:
! 142: return -1;
! 143:
! 144: /* Reached end of a branch; if it's a ket it is the end of a nested
! 145: call. If it's ALT it is an alternation in a nested call. If it is END it's
! 146: the end of the outer call. All can be handled by the same code. If an
! 147: ACCEPT was previously encountered, use the length that was in force at that
! 148: time, and pass back the shortest ACCEPT length. */
! 149:
! 150: case OP_ALT:
! 151: case OP_KET:
! 152: case OP_KETRMAX:
! 153: case OP_KETRMIN:
! 154: case OP_KETRPOS:
! 155: case OP_END:
! 156: if (length < 0 || (!had_recurse && branchlength < length))
! 157: length = branchlength;
! 158: if (op != OP_ALT) return length;
! 159: cc += 1 + LINK_SIZE;
! 160: branchlength = 0;
! 161: had_recurse = FALSE;
! 162: break;
! 163:
! 164: /* Skip over assertive subpatterns */
! 165:
! 166: case OP_ASSERT:
! 167: case OP_ASSERT_NOT:
! 168: case OP_ASSERTBACK:
! 169: case OP_ASSERTBACK_NOT:
! 170: do cc += GET(cc, 1); while (*cc == OP_ALT);
! 171: /* Fall through */
! 172:
! 173: /* Skip over things that don't match chars */
! 174:
! 175: case OP_REVERSE:
! 176: case OP_CREF:
! 177: case OP_NCREF:
! 178: case OP_RREF:
! 179: case OP_NRREF:
! 180: case OP_DEF:
! 181: case OP_CALLOUT:
! 182: case OP_SOD:
! 183: case OP_SOM:
! 184: case OP_EOD:
! 185: case OP_EODN:
! 186: case OP_CIRC:
! 187: case OP_CIRCM:
! 188: case OP_DOLL:
! 189: case OP_DOLLM:
! 190: case OP_NOT_WORD_BOUNDARY:
! 191: case OP_WORD_BOUNDARY:
! 192: cc += _pcre_OP_lengths[*cc];
! 193: break;
! 194:
! 195: /* Skip over a subpattern that has a {0} or {0,x} quantifier */
! 196:
! 197: case OP_BRAZERO:
! 198: case OP_BRAMINZERO:
! 199: case OP_BRAPOSZERO:
! 200: case OP_SKIPZERO:
! 201: cc += _pcre_OP_lengths[*cc];
! 202: do cc += GET(cc, 1); while (*cc == OP_ALT);
! 203: cc += 1 + LINK_SIZE;
! 204: break;
! 205:
! 206: /* Handle literal characters and + repetitions */
! 207:
! 208: case OP_CHAR:
! 209: case OP_CHARI:
! 210: case OP_NOT:
! 211: case OP_NOTI:
! 212: case OP_PLUS:
! 213: case OP_PLUSI:
! 214: case OP_MINPLUS:
! 215: case OP_MINPLUSI:
! 216: case OP_POSPLUS:
! 217: case OP_POSPLUSI:
! 218: case OP_NOTPLUS:
! 219: case OP_NOTPLUSI:
! 220: case OP_NOTMINPLUS:
! 221: case OP_NOTMINPLUSI:
! 222: case OP_NOTPOSPLUS:
! 223: case OP_NOTPOSPLUSI:
! 224: branchlength++;
! 225: cc += 2;
! 226: #ifdef SUPPORT_UTF8
! 227: if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
! 228: #endif
! 229: break;
! 230:
! 231: case OP_TYPEPLUS:
! 232: case OP_TYPEMINPLUS:
! 233: case OP_TYPEPOSPLUS:
! 234: branchlength++;
! 235: cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
! 236: break;
! 237:
! 238: /* Handle exact repetitions. The count is already in characters, but we
! 239: need to skip over a multibyte character in UTF8 mode. */
! 240:
! 241: case OP_EXACT:
! 242: case OP_EXACTI:
! 243: case OP_NOTEXACT:
! 244: case OP_NOTEXACTI:
! 245: branchlength += GET2(cc,1);
! 246: cc += 4;
! 247: #ifdef SUPPORT_UTF8
! 248: if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
! 249: #endif
! 250: break;
! 251:
! 252: case OP_TYPEEXACT:
! 253: branchlength += GET2(cc,1);
! 254: cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
! 255: break;
! 256:
! 257: /* Handle single-char non-literal matchers */
! 258:
! 259: case OP_PROP:
! 260: case OP_NOTPROP:
! 261: cc += 2;
! 262: /* Fall through */
! 263:
! 264: case OP_NOT_DIGIT:
! 265: case OP_DIGIT:
! 266: case OP_NOT_WHITESPACE:
! 267: case OP_WHITESPACE:
! 268: case OP_NOT_WORDCHAR:
! 269: case OP_WORDCHAR:
! 270: case OP_ANY:
! 271: case OP_ALLANY:
! 272: case OP_EXTUNI:
! 273: case OP_HSPACE:
! 274: case OP_NOT_HSPACE:
! 275: case OP_VSPACE:
! 276: case OP_NOT_VSPACE:
! 277: branchlength++;
! 278: cc++;
! 279: break;
! 280:
! 281: /* "Any newline" might match two characters, but it also might match just
! 282: one. */
! 283:
! 284: case OP_ANYNL:
! 285: branchlength += 1;
! 286: cc++;
! 287: break;
! 288:
! 289: /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
! 290: non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
! 291: appear, but leave the code, just in case.) */
! 292:
! 293: case OP_ANYBYTE:
! 294: #ifdef SUPPORT_UTF8
! 295: if (utf8) return -1;
! 296: #endif
! 297: branchlength++;
! 298: cc++;
! 299: break;
! 300:
! 301: /* For repeated character types, we have to test for \p and \P, which have
! 302: an extra two bytes of parameters. */
! 303:
! 304: case OP_TYPESTAR:
! 305: case OP_TYPEMINSTAR:
! 306: case OP_TYPEQUERY:
! 307: case OP_TYPEMINQUERY:
! 308: case OP_TYPEPOSSTAR:
! 309: case OP_TYPEPOSQUERY:
! 310: if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
! 311: cc += _pcre_OP_lengths[op];
! 312: break;
! 313:
! 314: case OP_TYPEUPTO:
! 315: case OP_TYPEMINUPTO:
! 316: case OP_TYPEPOSUPTO:
! 317: if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
! 318: cc += _pcre_OP_lengths[op];
! 319: break;
! 320:
! 321: /* Check a class for variable quantification */
! 322:
! 323: #ifdef SUPPORT_UTF8
! 324: case OP_XCLASS:
! 325: cc += GET(cc, 1) - 33;
! 326: /* Fall through */
! 327: #endif
! 328:
! 329: case OP_CLASS:
! 330: case OP_NCLASS:
! 331: cc += 33;
! 332:
! 333: switch (*cc)
! 334: {
! 335: case OP_CRPLUS:
! 336: case OP_CRMINPLUS:
! 337: branchlength++;
! 338: /* Fall through */
! 339:
! 340: case OP_CRSTAR:
! 341: case OP_CRMINSTAR:
! 342: case OP_CRQUERY:
! 343: case OP_CRMINQUERY:
! 344: cc++;
! 345: break;
! 346:
! 347: case OP_CRRANGE:
! 348: case OP_CRMINRANGE:
! 349: branchlength += GET2(cc,1);
! 350: cc += 5;
! 351: break;
! 352:
! 353: default:
! 354: branchlength++;
! 355: break;
! 356: }
! 357: break;
! 358:
! 359: /* Backreferences and subroutine calls are treated in the same way: we find
! 360: the minimum length for the subpattern. A recursion, however, causes an
! 361: a flag to be set that causes the length of this branch to be ignored. The
! 362: logic is that a recursion can only make sense if there is another
! 363: alternation that stops the recursing. That will provide the minimum length
! 364: (when no recursion happens). A backreference within the group that it is
! 365: referencing behaves in the same way.
! 366:
! 367: If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
! 368: matches an empty string (by default it causes a matching failure), so in
! 369: that case we must set the minimum length to zero. */
! 370:
! 371: case OP_REF:
! 372: case OP_REFI:
! 373: if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
! 374: {
! 375: ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
! 376: if (cs == NULL) return -2;
! 377: do ce += GET(ce, 1); while (*ce == OP_ALT);
! 378: if (cc > cs && cc < ce)
! 379: {
! 380: d = 0;
! 381: had_recurse = TRUE;
! 382: }
! 383: else
! 384: {
! 385: d = find_minlength(cs, startcode, options, recurse_depth);
! 386: }
! 387: }
! 388: else d = 0;
! 389: cc += 3;
! 390:
! 391: /* Handle repeated back references */
! 392:
! 393: switch (*cc)
! 394: {
! 395: case OP_CRSTAR:
! 396: case OP_CRMINSTAR:
! 397: case OP_CRQUERY:
! 398: case OP_CRMINQUERY:
! 399: min = 0;
! 400: cc++;
! 401: break;
! 402:
! 403: case OP_CRPLUS:
! 404: case OP_CRMINPLUS:
! 405: min = 1;
! 406: cc++;
! 407: break;
! 408:
! 409: case OP_CRRANGE:
! 410: case OP_CRMINRANGE:
! 411: min = GET2(cc, 1);
! 412: cc += 5;
! 413: break;
! 414:
! 415: default:
! 416: min = 1;
! 417: break;
! 418: }
! 419:
! 420: branchlength += min * d;
! 421: break;
! 422:
! 423: /* We can easily detect direct recursion, but not mutual recursion. This is
! 424: caught by a recursion depth count. */
! 425:
! 426: case OP_RECURSE:
! 427: cs = ce = (uschar *)startcode + GET(cc, 1);
! 428: do ce += GET(ce, 1); while (*ce == OP_ALT);
! 429: if ((cc > cs && cc < ce) || recurse_depth > 10)
! 430: had_recurse = TRUE;
! 431: else
! 432: {
! 433: branchlength += find_minlength(cs, startcode, options, recurse_depth + 1);
! 434: }
! 435: cc += 1 + LINK_SIZE;
! 436: break;
! 437:
! 438: /* Anything else does not or need not match a character. We can get the
! 439: item's length from the table, but for those that can match zero occurrences
! 440: of a character, we must take special action for UTF-8 characters. As it
! 441: happens, the "NOT" versions of these opcodes are used at present only for
! 442: ASCII characters, so they could be omitted from this list. However, in
! 443: future that may change, so we include them here so as not to leave a
! 444: gotcha for a future maintainer. */
! 445:
! 446: case OP_UPTO:
! 447: case OP_UPTOI:
! 448: case OP_NOTUPTO:
! 449: case OP_NOTUPTOI:
! 450: case OP_MINUPTO:
! 451: case OP_MINUPTOI:
! 452: case OP_NOTMINUPTO:
! 453: case OP_NOTMINUPTOI:
! 454: case OP_POSUPTO:
! 455: case OP_POSUPTOI:
! 456: case OP_NOTPOSUPTO:
! 457: case OP_NOTPOSUPTOI:
! 458:
! 459: case OP_STAR:
! 460: case OP_STARI:
! 461: case OP_NOTSTAR:
! 462: case OP_NOTSTARI:
! 463: case OP_MINSTAR:
! 464: case OP_MINSTARI:
! 465: case OP_NOTMINSTAR:
! 466: case OP_NOTMINSTARI:
! 467: case OP_POSSTAR:
! 468: case OP_POSSTARI:
! 469: case OP_NOTPOSSTAR:
! 470: case OP_NOTPOSSTARI:
! 471:
! 472: case OP_QUERY:
! 473: case OP_QUERYI:
! 474: case OP_NOTQUERY:
! 475: case OP_NOTQUERYI:
! 476: case OP_MINQUERY:
! 477: case OP_MINQUERYI:
! 478: case OP_NOTMINQUERY:
! 479: case OP_NOTMINQUERYI:
! 480: case OP_POSQUERY:
! 481: case OP_POSQUERYI:
! 482: case OP_NOTPOSQUERY:
! 483: case OP_NOTPOSQUERYI:
! 484:
! 485: cc += _pcre_OP_lengths[op];
! 486: #ifdef SUPPORT_UTF8
! 487: if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
! 488: #endif
! 489: break;
! 490:
! 491: /* Skip these, but we need to add in the name length. */
! 492:
! 493: case OP_MARK:
! 494: case OP_PRUNE_ARG:
! 495: case OP_SKIP_ARG:
! 496: case OP_THEN_ARG:
! 497: cc += _pcre_OP_lengths[op] + cc[1];
! 498: break;
! 499:
! 500: /* The remaining opcodes are just skipped over. */
! 501:
! 502: case OP_CLOSE:
! 503: case OP_COMMIT:
! 504: case OP_FAIL:
! 505: case OP_PRUNE:
! 506: case OP_SET_SOM:
! 507: case OP_SKIP:
! 508: case OP_THEN:
! 509: cc += _pcre_OP_lengths[op];
! 510: break;
! 511:
! 512: /* This should not occur: we list all opcodes explicitly so that when
! 513: new ones get added they are properly considered. */
! 514:
! 515: default:
! 516: return -3;
! 517: }
! 518: }
! 519: /* Control never gets here */
! 520: }
! 521:
! 522:
! 523:
! 524: /*************************************************
! 525: * Set a bit and maybe its alternate case *
! 526: *************************************************/
! 527:
! 528: /* Given a character, set its first byte's bit in the table, and also the
! 529: corresponding bit for the other version of a letter if we are caseless. In
! 530: UTF-8 mode, for characters greater than 127, we can only do the caseless thing
! 531: when Unicode property support is available.
! 532:
! 533: Arguments:
! 534: start_bits points to the bit map
! 535: p points to the character
! 536: caseless the caseless flag
! 537: cd the block with char table pointers
! 538: utf8 TRUE for UTF-8 mode
! 539:
! 540: Returns: pointer after the character
! 541: */
! 542:
! 543: static const uschar *
! 544: set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
! 545: compile_data *cd, BOOL utf8)
! 546: {
! 547: unsigned int c = *p;
! 548:
! 549: SET_BIT(c);
! 550:
! 551: #ifdef SUPPORT_UTF8
! 552: if (utf8 && c > 127)
! 553: {
! 554: GETCHARINC(c, p);
! 555: #ifdef SUPPORT_UCP
! 556: if (caseless)
! 557: {
! 558: uschar buff[8];
! 559: c = UCD_OTHERCASE(c);
! 560: (void)_pcre_ord2utf8(c, buff);
! 561: SET_BIT(buff[0]);
! 562: }
! 563: #endif
! 564: return p;
! 565: }
! 566: #endif
! 567:
! 568: /* Not UTF-8 mode, or character is less than 127. */
! 569:
! 570: if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
! 571: return p + 1;
! 572: }
! 573:
! 574:
! 575:
! 576: /*************************************************
! 577: * Set bits for a positive character type *
! 578: *************************************************/
! 579:
! 580: /* This function sets starting bits for a character type. In UTF-8 mode, we can
! 581: only do a direct setting for bytes less than 128, as otherwise there can be
! 582: confusion with bytes in the middle of UTF-8 characters. In a "traditional"
! 583: environment, the tables will only recognize ASCII characters anyway, but in at
! 584: least one Windows environment, some higher bytes bits were set in the tables.
! 585: So we deal with that case by considering the UTF-8 encoding.
! 586:
! 587: Arguments:
! 588: start_bits the starting bitmap
! 589: cbit type the type of character wanted
! 590: table_limit 32 for non-UTF-8; 16 for UTF-8
! 591: cd the block with char table pointers
! 592:
! 593: Returns: nothing
! 594: */
! 595:
! 596: static void
! 597: set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
! 598: compile_data *cd)
! 599: {
! 600: register int c;
! 601: for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
! 602: if (table_limit == 32) return;
! 603: for (c = 128; c < 256; c++)
! 604: {
! 605: if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
! 606: {
! 607: uschar buff[8];
! 608: (void)_pcre_ord2utf8(c, buff);
! 609: SET_BIT(buff[0]);
! 610: }
! 611: }
! 612: }
! 613:
! 614:
! 615: /*************************************************
! 616: * Set bits for a negative character type *
! 617: *************************************************/
! 618:
! 619: /* This function sets starting bits for a negative character type such as \D.
! 620: In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
! 621: otherwise there can be confusion with bytes in the middle of UTF-8 characters.
! 622: Unlike in the positive case, where we can set appropriate starting bits for
! 623: specific high-valued UTF-8 characters, in this case we have to set the bits for
! 624: all high-valued characters. The lowest is 0xc2, but we overkill by starting at
! 625: 0xc0 (192) for simplicity.
! 626:
! 627: Arguments:
! 628: start_bits the starting bitmap
! 629: cbit type the type of character wanted
! 630: table_limit 32 for non-UTF-8; 16 for UTF-8
! 631: cd the block with char table pointers
! 632:
! 633: Returns: nothing
! 634: */
! 635:
! 636: static void
! 637: set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
! 638: compile_data *cd)
! 639: {
! 640: register int c;
! 641: for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
! 642: if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
! 643: }
! 644:
! 645:
! 646:
! 647: /*************************************************
! 648: * Create bitmap of starting bytes *
! 649: *************************************************/
! 650:
! 651: /* This function scans a compiled unanchored expression recursively and
! 652: attempts to build a bitmap of the set of possible starting bytes. As time goes
! 653: by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
! 654: useful for parenthesized groups in patterns such as (a*)b where the group
! 655: provides some optional starting bytes but scanning must continue at the outer
! 656: level to find at least one mandatory byte. At the outermost level, this
! 657: function fails unless the result is SSB_DONE.
! 658:
! 659: Arguments:
! 660: code points to an expression
! 661: start_bits points to a 32-byte table, initialized to 0
! 662: utf8 TRUE if in UTF-8 mode
! 663: cd the block with char table pointers
! 664:
! 665: Returns: SSB_FAIL => Failed to find any starting bytes
! 666: SSB_DONE => Found mandatory starting bytes
! 667: SSB_CONTINUE => Found optional starting bytes
! 668: SSB_UNKNOWN => Hit an unrecognized opcode
! 669: */
! 670:
! 671: static int
! 672: set_start_bits(const uschar *code, uschar *start_bits, BOOL utf8,
! 673: compile_data *cd)
! 674: {
! 675: register int c;
! 676: int yield = SSB_DONE;
! 677: int table_limit = utf8? 16:32;
! 678:
! 679: #if 0
! 680: /* ========================================================================= */
! 681: /* The following comment and code was inserted in January 1999. In May 2006,
! 682: when it was observed to cause compiler warnings about unused values, I took it
! 683: out again. If anybody is still using OS/2, they will have to put it back
! 684: manually. */
! 685:
! 686: /* This next statement and the later reference to dummy are here in order to
! 687: trick the optimizer of the IBM C compiler for OS/2 into generating correct
! 688: code. Apparently IBM isn't going to fix the problem, and we would rather not
! 689: disable optimization (in this module it actually makes a big difference, and
! 690: the pcre module can use all the optimization it can get). */
! 691:
! 692: volatile int dummy;
! 693: /* ========================================================================= */
! 694: #endif
! 695:
! 696: do
! 697: {
! 698: BOOL try_next = TRUE;
! 699: const uschar *tcode = code + 1 + LINK_SIZE;
! 700:
! 701: if (*code == OP_CBRA || *code == OP_SCBRA ||
! 702: *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += 2;
! 703:
! 704: while (try_next) /* Loop for items in this branch */
! 705: {
! 706: int rc;
! 707:
! 708: switch(*tcode)
! 709: {
! 710: /* If we reach something we don't understand, it means a new opcode has
! 711: been created that hasn't been added to this code. Hopefully this problem
! 712: will be discovered during testing. */
! 713:
! 714: default:
! 715: return SSB_UNKNOWN;
! 716:
! 717: /* Fail for a valid opcode that implies no starting bits. */
! 718:
! 719: case OP_ACCEPT:
! 720: case OP_ASSERT_ACCEPT:
! 721: case OP_ALLANY:
! 722: case OP_ANY:
! 723: case OP_ANYBYTE:
! 724: case OP_CIRC:
! 725: case OP_CIRCM:
! 726: case OP_CLOSE:
! 727: case OP_COMMIT:
! 728: case OP_COND:
! 729: case OP_CREF:
! 730: case OP_DEF:
! 731: case OP_DOLL:
! 732: case OP_DOLLM:
! 733: case OP_END:
! 734: case OP_EOD:
! 735: case OP_EODN:
! 736: case OP_EXTUNI:
! 737: case OP_FAIL:
! 738: case OP_MARK:
! 739: case OP_NCREF:
! 740: case OP_NOT:
! 741: case OP_NOTEXACT:
! 742: case OP_NOTEXACTI:
! 743: case OP_NOTI:
! 744: case OP_NOTMINPLUS:
! 745: case OP_NOTMINPLUSI:
! 746: case OP_NOTMINQUERY:
! 747: case OP_NOTMINQUERYI:
! 748: case OP_NOTMINSTAR:
! 749: case OP_NOTMINSTARI:
! 750: case OP_NOTMINUPTO:
! 751: case OP_NOTMINUPTOI:
! 752: case OP_NOTPLUS:
! 753: case OP_NOTPLUSI:
! 754: case OP_NOTPOSPLUS:
! 755: case OP_NOTPOSPLUSI:
! 756: case OP_NOTPOSQUERY:
! 757: case OP_NOTPOSQUERYI:
! 758: case OP_NOTPOSSTAR:
! 759: case OP_NOTPOSSTARI:
! 760: case OP_NOTPOSUPTO:
! 761: case OP_NOTPOSUPTOI:
! 762: case OP_NOTPROP:
! 763: case OP_NOTQUERY:
! 764: case OP_NOTQUERYI:
! 765: case OP_NOTSTAR:
! 766: case OP_NOTSTARI:
! 767: case OP_NOTUPTO:
! 768: case OP_NOTUPTOI:
! 769: case OP_NOT_HSPACE:
! 770: case OP_NOT_VSPACE:
! 771: case OP_NRREF:
! 772: case OP_PROP:
! 773: case OP_PRUNE:
! 774: case OP_PRUNE_ARG:
! 775: case OP_RECURSE:
! 776: case OP_REF:
! 777: case OP_REFI:
! 778: case OP_REVERSE:
! 779: case OP_RREF:
! 780: case OP_SCOND:
! 781: case OP_SET_SOM:
! 782: case OP_SKIP:
! 783: case OP_SKIP_ARG:
! 784: case OP_SOD:
! 785: case OP_SOM:
! 786: case OP_THEN:
! 787: case OP_THEN_ARG:
! 788: case OP_XCLASS:
! 789: return SSB_FAIL;
! 790:
! 791: /* We can ignore word boundary tests. */
! 792:
! 793: case OP_WORD_BOUNDARY:
! 794: case OP_NOT_WORD_BOUNDARY:
! 795: tcode++;
! 796: break;
! 797:
! 798: /* If we hit a bracket or a positive lookahead assertion, recurse to set
! 799: bits from within the subpattern. If it can't find anything, we have to
! 800: give up. If it finds some mandatory character(s), we are done for this
! 801: branch. Otherwise, carry on scanning after the subpattern. */
! 802:
! 803: case OP_BRA:
! 804: case OP_SBRA:
! 805: case OP_CBRA:
! 806: case OP_SCBRA:
! 807: case OP_BRAPOS:
! 808: case OP_SBRAPOS:
! 809: case OP_CBRAPOS:
! 810: case OP_SCBRAPOS:
! 811: case OP_ONCE:
! 812: case OP_ONCE_NC:
! 813: case OP_ASSERT:
! 814: rc = set_start_bits(tcode, start_bits, utf8, cd);
! 815: if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
! 816: if (rc == SSB_DONE) try_next = FALSE; else
! 817: {
! 818: do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
! 819: tcode += 1 + LINK_SIZE;
! 820: }
! 821: break;
! 822:
! 823: /* If we hit ALT or KET, it means we haven't found anything mandatory in
! 824: this branch, though we might have found something optional. For ALT, we
! 825: continue with the next alternative, but we have to arrange that the final
! 826: result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
! 827: return SSB_CONTINUE: if this is the top level, that indicates failure,
! 828: but after a nested subpattern, it causes scanning to continue. */
! 829:
! 830: case OP_ALT:
! 831: yield = SSB_CONTINUE;
! 832: try_next = FALSE;
! 833: break;
! 834:
! 835: case OP_KET:
! 836: case OP_KETRMAX:
! 837: case OP_KETRMIN:
! 838: case OP_KETRPOS:
! 839: return SSB_CONTINUE;
! 840:
! 841: /* Skip over callout */
! 842:
! 843: case OP_CALLOUT:
! 844: tcode += 2 + 2*LINK_SIZE;
! 845: break;
! 846:
! 847: /* Skip over lookbehind and negative lookahead assertions */
! 848:
! 849: case OP_ASSERT_NOT:
! 850: case OP_ASSERTBACK:
! 851: case OP_ASSERTBACK_NOT:
! 852: do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
! 853: tcode += 1 + LINK_SIZE;
! 854: break;
! 855:
! 856: /* BRAZERO does the bracket, but carries on. */
! 857:
! 858: case OP_BRAZERO:
! 859: case OP_BRAMINZERO:
! 860: case OP_BRAPOSZERO:
! 861: rc = set_start_bits(++tcode, start_bits, utf8, cd);
! 862: if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
! 863: /* =========================================================================
! 864: See the comment at the head of this function concerning the next line,
! 865: which was an old fudge for the benefit of OS/2.
! 866: dummy = 1;
! 867: ========================================================================= */
! 868: do tcode += GET(tcode,1); while (*tcode == OP_ALT);
! 869: tcode += 1 + LINK_SIZE;
! 870: break;
! 871:
! 872: /* SKIPZERO skips the bracket. */
! 873:
! 874: case OP_SKIPZERO:
! 875: tcode++;
! 876: do tcode += GET(tcode,1); while (*tcode == OP_ALT);
! 877: tcode += 1 + LINK_SIZE;
! 878: break;
! 879:
! 880: /* Single-char * or ? sets the bit and tries the next item */
! 881:
! 882: case OP_STAR:
! 883: case OP_MINSTAR:
! 884: case OP_POSSTAR:
! 885: case OP_QUERY:
! 886: case OP_MINQUERY:
! 887: case OP_POSQUERY:
! 888: tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
! 889: break;
! 890:
! 891: case OP_STARI:
! 892: case OP_MINSTARI:
! 893: case OP_POSSTARI:
! 894: case OP_QUERYI:
! 895: case OP_MINQUERYI:
! 896: case OP_POSQUERYI:
! 897: tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
! 898: break;
! 899:
! 900: /* Single-char upto sets the bit and tries the next */
! 901:
! 902: case OP_UPTO:
! 903: case OP_MINUPTO:
! 904: case OP_POSUPTO:
! 905: tcode = set_table_bit(start_bits, tcode + 3, FALSE, cd, utf8);
! 906: break;
! 907:
! 908: case OP_UPTOI:
! 909: case OP_MINUPTOI:
! 910: case OP_POSUPTOI:
! 911: tcode = set_table_bit(start_bits, tcode + 3, TRUE, cd, utf8);
! 912: break;
! 913:
! 914: /* At least one single char sets the bit and stops */
! 915:
! 916: case OP_EXACT:
! 917: tcode += 2;
! 918: /* Fall through */
! 919: case OP_CHAR:
! 920: case OP_PLUS:
! 921: case OP_MINPLUS:
! 922: case OP_POSPLUS:
! 923: (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
! 924: try_next = FALSE;
! 925: break;
! 926:
! 927: case OP_EXACTI:
! 928: tcode += 2;
! 929: /* Fall through */
! 930: case OP_CHARI:
! 931: case OP_PLUSI:
! 932: case OP_MINPLUSI:
! 933: case OP_POSPLUSI:
! 934: (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
! 935: try_next = FALSE;
! 936: break;
! 937:
! 938: /* Special spacing and line-terminating items. These recognize specific
! 939: lists of characters. The difference between VSPACE and ANYNL is that the
! 940: latter can match the two-character CRLF sequence, but that is not
! 941: relevant for finding the first character, so their code here is
! 942: identical. */
! 943:
! 944: case OP_HSPACE:
! 945: SET_BIT(0x09);
! 946: SET_BIT(0x20);
! 947: if (utf8)
! 948: {
! 949: SET_BIT(0xC2); /* For U+00A0 */
! 950: SET_BIT(0xE1); /* For U+1680, U+180E */
! 951: SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
! 952: SET_BIT(0xE3); /* For U+3000 */
! 953: }
! 954: else SET_BIT(0xA0);
! 955: try_next = FALSE;
! 956: break;
! 957:
! 958: case OP_ANYNL:
! 959: case OP_VSPACE:
! 960: SET_BIT(0x0A);
! 961: SET_BIT(0x0B);
! 962: SET_BIT(0x0C);
! 963: SET_BIT(0x0D);
! 964: if (utf8)
! 965: {
! 966: SET_BIT(0xC2); /* For U+0085 */
! 967: SET_BIT(0xE2); /* For U+2028, U+2029 */
! 968: }
! 969: else SET_BIT(0x85);
! 970: try_next = FALSE;
! 971: break;
! 972:
! 973: /* Single character types set the bits and stop. Note that if PCRE_UCP
! 974: is set, we do not see these op codes because \d etc are converted to
! 975: properties. Therefore, these apply in the case when only characters less
! 976: than 256 are recognized to match the types. */
! 977:
! 978: case OP_NOT_DIGIT:
! 979: set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
! 980: try_next = FALSE;
! 981: break;
! 982:
! 983: case OP_DIGIT:
! 984: set_type_bits(start_bits, cbit_digit, table_limit, cd);
! 985: try_next = FALSE;
! 986: break;
! 987:
! 988: /* The cbit_space table has vertical tab as whitespace; we have to
! 989: ensure it is set as not whitespace. */
! 990:
! 991: case OP_NOT_WHITESPACE:
! 992: set_nottype_bits(start_bits, cbit_space, table_limit, cd);
! 993: start_bits[1] |= 0x08;
! 994: try_next = FALSE;
! 995: break;
! 996:
! 997: /* The cbit_space table has vertical tab as whitespace; we have to
! 998: not set it from the table. */
! 999:
! 1000: case OP_WHITESPACE:
! 1001: c = start_bits[1]; /* Save in case it was already set */
! 1002: set_type_bits(start_bits, cbit_space, table_limit, cd);
! 1003: start_bits[1] = (start_bits[1] & ~0x08) | c;
! 1004: try_next = FALSE;
! 1005: break;
! 1006:
! 1007: case OP_NOT_WORDCHAR:
! 1008: set_nottype_bits(start_bits, cbit_word, table_limit, cd);
! 1009: try_next = FALSE;
! 1010: break;
! 1011:
! 1012: case OP_WORDCHAR:
! 1013: set_type_bits(start_bits, cbit_word, table_limit, cd);
! 1014: try_next = FALSE;
! 1015: break;
! 1016:
! 1017: /* One or more character type fudges the pointer and restarts, knowing
! 1018: it will hit a single character type and stop there. */
! 1019:
! 1020: case OP_TYPEPLUS:
! 1021: case OP_TYPEMINPLUS:
! 1022: case OP_TYPEPOSPLUS:
! 1023: tcode++;
! 1024: break;
! 1025:
! 1026: case OP_TYPEEXACT:
! 1027: tcode += 3;
! 1028: break;
! 1029:
! 1030: /* Zero or more repeats of character types set the bits and then
! 1031: try again. */
! 1032:
! 1033: case OP_TYPEUPTO:
! 1034: case OP_TYPEMINUPTO:
! 1035: case OP_TYPEPOSUPTO:
! 1036: tcode += 2; /* Fall through */
! 1037:
! 1038: case OP_TYPESTAR:
! 1039: case OP_TYPEMINSTAR:
! 1040: case OP_TYPEPOSSTAR:
! 1041: case OP_TYPEQUERY:
! 1042: case OP_TYPEMINQUERY:
! 1043: case OP_TYPEPOSQUERY:
! 1044: switch(tcode[1])
! 1045: {
! 1046: default:
! 1047: case OP_ANY:
! 1048: case OP_ALLANY:
! 1049: return SSB_FAIL;
! 1050:
! 1051: case OP_HSPACE:
! 1052: SET_BIT(0x09);
! 1053: SET_BIT(0x20);
! 1054: if (utf8)
! 1055: {
! 1056: SET_BIT(0xC2); /* For U+00A0 */
! 1057: SET_BIT(0xE1); /* For U+1680, U+180E */
! 1058: SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
! 1059: SET_BIT(0xE3); /* For U+3000 */
! 1060: }
! 1061: else SET_BIT(0xA0);
! 1062: break;
! 1063:
! 1064: case OP_ANYNL:
! 1065: case OP_VSPACE:
! 1066: SET_BIT(0x0A);
! 1067: SET_BIT(0x0B);
! 1068: SET_BIT(0x0C);
! 1069: SET_BIT(0x0D);
! 1070: if (utf8)
! 1071: {
! 1072: SET_BIT(0xC2); /* For U+0085 */
! 1073: SET_BIT(0xE2); /* For U+2028, U+2029 */
! 1074: }
! 1075: else SET_BIT(0x85);
! 1076: break;
! 1077:
! 1078: case OP_NOT_DIGIT:
! 1079: set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
! 1080: break;
! 1081:
! 1082: case OP_DIGIT:
! 1083: set_type_bits(start_bits, cbit_digit, table_limit, cd);
! 1084: break;
! 1085:
! 1086: /* The cbit_space table has vertical tab as whitespace; we have to
! 1087: ensure it gets set as not whitespace. */
! 1088:
! 1089: case OP_NOT_WHITESPACE:
! 1090: set_nottype_bits(start_bits, cbit_space, table_limit, cd);
! 1091: start_bits[1] |= 0x08;
! 1092: break;
! 1093:
! 1094: /* The cbit_space table has vertical tab as whitespace; we have to
! 1095: avoid setting it. */
! 1096:
! 1097: case OP_WHITESPACE:
! 1098: c = start_bits[1]; /* Save in case it was already set */
! 1099: set_type_bits(start_bits, cbit_space, table_limit, cd);
! 1100: start_bits[1] = (start_bits[1] & ~0x08) | c;
! 1101: break;
! 1102:
! 1103: case OP_NOT_WORDCHAR:
! 1104: set_nottype_bits(start_bits, cbit_word, table_limit, cd);
! 1105: break;
! 1106:
! 1107: case OP_WORDCHAR:
! 1108: set_type_bits(start_bits, cbit_word, table_limit, cd);
! 1109: break;
! 1110: }
! 1111:
! 1112: tcode += 2;
! 1113: break;
! 1114:
! 1115: /* Character class where all the information is in a bit map: set the
! 1116: bits and either carry on or not, according to the repeat count. If it was
! 1117: a negative class, and we are operating with UTF-8 characters, any byte
! 1118: with a value >= 0xc4 is a potentially valid starter because it starts a
! 1119: character with a value > 255. */
! 1120:
! 1121: case OP_NCLASS:
! 1122: #ifdef SUPPORT_UTF8
! 1123: if (utf8)
! 1124: {
! 1125: start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
! 1126: memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
! 1127: }
! 1128: #endif
! 1129: /* Fall through */
! 1130:
! 1131: case OP_CLASS:
! 1132: {
! 1133: tcode++;
! 1134:
! 1135: /* In UTF-8 mode, the bits in a bit map correspond to character
! 1136: values, not to byte values. However, the bit map we are constructing is
! 1137: for byte values. So we have to do a conversion for characters whose
! 1138: value is > 127. In fact, there are only two possible starting bytes for
! 1139: characters in the range 128 - 255. */
! 1140:
! 1141: #ifdef SUPPORT_UTF8
! 1142: if (utf8)
! 1143: {
! 1144: for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
! 1145: for (c = 128; c < 256; c++)
! 1146: {
! 1147: if ((tcode[c/8] && (1 << (c&7))) != 0)
! 1148: {
! 1149: int d = (c >> 6) | 0xc0; /* Set bit for this starter */
! 1150: start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
! 1151: c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
! 1152: }
! 1153: }
! 1154: }
! 1155:
! 1156: /* In non-UTF-8 mode, the two bit maps are completely compatible. */
! 1157:
! 1158: else
! 1159: #endif
! 1160: {
! 1161: for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
! 1162: }
! 1163:
! 1164: /* Advance past the bit map, and act on what follows. For a zero
! 1165: minimum repeat, continue; otherwise stop processing. */
! 1166:
! 1167: tcode += 32;
! 1168: switch (*tcode)
! 1169: {
! 1170: case OP_CRSTAR:
! 1171: case OP_CRMINSTAR:
! 1172: case OP_CRQUERY:
! 1173: case OP_CRMINQUERY:
! 1174: tcode++;
! 1175: break;
! 1176:
! 1177: case OP_CRRANGE:
! 1178: case OP_CRMINRANGE:
! 1179: if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
! 1180: else try_next = FALSE;
! 1181: break;
! 1182:
! 1183: default:
! 1184: try_next = FALSE;
! 1185: break;
! 1186: }
! 1187: }
! 1188: break; /* End of bitmap class handling */
! 1189:
! 1190: } /* End of switch */
! 1191: } /* End of try_next loop */
! 1192:
! 1193: code += GET(code, 1); /* Advance to next branch */
! 1194: }
! 1195: while (*code == OP_ALT);
! 1196: return yield;
! 1197: }
! 1198:
! 1199:
! 1200:
! 1201:
! 1202:
! 1203: /*************************************************
! 1204: * Study a compiled expression *
! 1205: *************************************************/
! 1206:
! 1207: /* This function is handed a compiled expression that it must study to produce
! 1208: information that will speed up the matching. It returns a pcre_extra block
! 1209: which then gets handed back to pcre_exec().
! 1210:
! 1211: Arguments:
! 1212: re points to the compiled expression
! 1213: options contains option bits
! 1214: errorptr points to where to place error messages;
! 1215: set NULL unless error
! 1216:
! 1217: Returns: pointer to a pcre_extra block, with study_data filled in and the
! 1218: appropriate flags set;
! 1219: NULL on error or if no optimization possible
! 1220: */
! 1221:
! 1222: PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
! 1223: pcre_study(const pcre *external_re, int options, const char **errorptr)
! 1224: {
! 1225: int min;
! 1226: BOOL bits_set = FALSE;
! 1227: uschar start_bits[32];
! 1228: pcre_extra *extra = NULL;
! 1229: pcre_study_data *study;
! 1230: const uschar *tables;
! 1231: uschar *code;
! 1232: compile_data compile_block;
! 1233: const real_pcre *re = (const real_pcre *)external_re;
! 1234:
! 1235: *errorptr = NULL;
! 1236:
! 1237: if (re == NULL || re->magic_number != MAGIC_NUMBER)
! 1238: {
! 1239: *errorptr = "argument is not a compiled regular expression";
! 1240: return NULL;
! 1241: }
! 1242:
! 1243: if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
! 1244: {
! 1245: *errorptr = "unknown or incorrect option bit(s) set";
! 1246: return NULL;
! 1247: }
! 1248:
! 1249: code = (uschar *)re + re->name_table_offset +
! 1250: (re->name_count * re->name_entry_size);
! 1251:
! 1252: /* For an anchored pattern, or an unanchored pattern that has a first char, or
! 1253: a multiline pattern that matches only at "line starts", there is no point in
! 1254: seeking a list of starting bytes. */
! 1255:
! 1256: if ((re->options & PCRE_ANCHORED) == 0 &&
! 1257: (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
! 1258: {
! 1259: int rc;
! 1260:
! 1261: /* Set the character tables in the block that is passed around */
! 1262:
! 1263: tables = re->tables;
! 1264: if (tables == NULL)
! 1265: (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
! 1266: (void *)(&tables));
! 1267:
! 1268: compile_block.lcc = tables + lcc_offset;
! 1269: compile_block.fcc = tables + fcc_offset;
! 1270: compile_block.cbits = tables + cbits_offset;
! 1271: compile_block.ctypes = tables + ctypes_offset;
! 1272:
! 1273: /* See if we can find a fixed set of initial characters for the pattern. */
! 1274:
! 1275: memset(start_bits, 0, 32 * sizeof(uschar));
! 1276: rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
! 1277: &compile_block);
! 1278: bits_set = rc == SSB_DONE;
! 1279: if (rc == SSB_UNKNOWN)
! 1280: {
! 1281: *errorptr = "internal error: opcode not recognized";
! 1282: return NULL;
! 1283: }
! 1284: }
! 1285:
! 1286: /* Find the minimum length of subject string. */
! 1287:
! 1288: switch(min = find_minlength(code, code, re->options, 0))
! 1289: {
! 1290: case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
! 1291: case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
! 1292: default: break;
! 1293: }
! 1294:
! 1295: /* If a set of starting bytes has been identified, or if the minimum length is
! 1296: greater than zero, or if JIT optimization has been requested, get a pcre_extra
! 1297: block and a pcre_study_data block. The study data is put in the latter, which
! 1298: is pointed to by the former, which may also get additional data set later by
! 1299: the calling program. At the moment, the size of pcre_study_data is fixed. We
! 1300: nevertheless save it in a field for returning via the pcre_fullinfo() function
! 1301: so that if it becomes variable in the future, we don't have to change that
! 1302: code. */
! 1303:
! 1304: if (bits_set || min > 0
! 1305: #ifdef SUPPORT_JIT
! 1306: || (options & PCRE_STUDY_JIT_COMPILE) != 0
! 1307: #endif
! 1308: )
! 1309: {
! 1310: extra = (pcre_extra *)(pcre_malloc)
! 1311: (sizeof(pcre_extra) + sizeof(pcre_study_data));
! 1312: if (extra == NULL)
! 1313: {
! 1314: *errorptr = "failed to get memory";
! 1315: return NULL;
! 1316: }
! 1317:
! 1318: study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
! 1319: extra->flags = PCRE_EXTRA_STUDY_DATA;
! 1320: extra->study_data = study;
! 1321:
! 1322: study->size = sizeof(pcre_study_data);
! 1323: study->flags = 0;
! 1324:
! 1325: /* Set the start bits always, to avoid unset memory errors if the
! 1326: study data is written to a file, but set the flag only if any of the bits
! 1327: are set, to save time looking when none are. */
! 1328:
! 1329: if (bits_set)
! 1330: {
! 1331: study->flags |= PCRE_STUDY_MAPPED;
! 1332: memcpy(study->start_bits, start_bits, sizeof(start_bits));
! 1333: }
! 1334: else memset(study->start_bits, 0, 32 * sizeof(uschar));
! 1335:
! 1336: /* Always set the minlength value in the block, because the JIT compiler
! 1337: makes use of it. However, don't set the bit unless the length is greater than
! 1338: zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
! 1339: checking the zero case. */
! 1340:
! 1341: if (min > 0)
! 1342: {
! 1343: study->flags |= PCRE_STUDY_MINLEN;
! 1344: study->minlength = min;
! 1345: }
! 1346: else study->minlength = 0;
! 1347:
! 1348: /* If JIT support was compiled and requested, attempt the JIT compilation.
! 1349: If no starting bytes were found, and the minimum length is zero, and JIT
! 1350: compilation fails, abandon the extra block and return NULL. */
! 1351:
! 1352: #ifdef SUPPORT_JIT
! 1353: extra->executable_jit = NULL;
! 1354: if ((options & PCRE_STUDY_JIT_COMPILE) != 0) _pcre_jit_compile(re, extra);
! 1355: if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0)
! 1356: {
! 1357: pcre_free_study(extra);
! 1358: extra = NULL;
! 1359: }
! 1360: #endif
! 1361: }
! 1362:
! 1363: return extra;
! 1364: }
! 1365:
! 1366:
! 1367: /*************************************************
! 1368: * Free the study data *
! 1369: *************************************************/
! 1370:
! 1371: /* This function frees the memory that was obtained by pcre_study().
! 1372:
! 1373: Argument: a pointer to the pcre_extra block
! 1374: Returns: nothing
! 1375: */
! 1376:
! 1377: PCRE_EXP_DEFN void
! 1378: pcre_free_study(pcre_extra *extra)
! 1379: {
! 1380: #ifdef SUPPORT_JIT
! 1381: if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
! 1382: extra->executable_jit != NULL)
! 1383: _pcre_jit_free(extra->executable_jit);
! 1384: #endif
! 1385: pcre_free(extra);
! 1386: }
! 1387:
! 1388: /* End of pcre_study.c */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>