Annotation of embedaddon/php/ext/pcre/pcrelib/pcre_study.c, revision 1.1
1.1 ! misho 1: /*************************************************
! 2: * Perl-Compatible Regular Expressions *
! 3: *************************************************/
! 4:
! 5: /* PCRE is a library of functions to support regular expressions whose syntax
! 6: and semantics are as close as possible to those of the Perl 5 language.
! 7:
! 8: Written by Philip Hazel
! 9: Copyright (c) 1997-2010 University of Cambridge
! 10:
! 11: -----------------------------------------------------------------------------
! 12: Redistribution and use in source and binary forms, with or without
! 13: modification, are permitted provided that the following conditions are met:
! 14:
! 15: * Redistributions of source code must retain the above copyright notice,
! 16: this list of conditions and the following disclaimer.
! 17:
! 18: * Redistributions in binary form must reproduce the above copyright
! 19: notice, this list of conditions and the following disclaimer in the
! 20: documentation and/or other materials provided with the distribution.
! 21:
! 22: * Neither the name of the University of Cambridge nor the names of its
! 23: contributors may be used to endorse or promote products derived from
! 24: this software without specific prior written permission.
! 25:
! 26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
! 27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
! 28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
! 29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
! 30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! 31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! 32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! 33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! 34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! 35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! 36: POSSIBILITY OF SUCH DAMAGE.
! 37: -----------------------------------------------------------------------------
! 38: */
! 39:
! 40:
! 41: /* This module contains the external function pcre_study(), along with local
! 42: supporting functions. */
! 43:
! 44:
! 45: #include "config.h"
! 46:
! 47: #include "pcre_internal.h"
! 48:
! 49: #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
! 50:
! 51: /* Returns from set_start_bits() */
! 52:
! 53: enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
! 54:
! 55:
! 56:
! 57: /*************************************************
! 58: * Find the minimum subject length for a group *
! 59: *************************************************/
! 60:
! 61: /* Scan a parenthesized group and compute the minimum length of subject that
! 62: is needed to match it. This is a lower bound; it does not mean there is a
! 63: string of that length that matches. In UTF8 mode, the result is in characters
! 64: rather than bytes.
! 65:
! 66: Arguments:
! 67: code pointer to start of group (the bracket)
! 68: startcode pointer to start of the whole pattern
! 69: options the compiling options
! 70:
! 71: Returns: the minimum length
! 72: -1 if \C was encountered
! 73: -2 internal error (missing capturing bracket)
! 74: */
! 75:
! 76: static int
! 77: find_minlength(const uschar *code, const uschar *startcode, int options)
! 78: {
! 79: int length = -1;
! 80: BOOL utf8 = (options & PCRE_UTF8) != 0;
! 81: BOOL had_recurse = FALSE;
! 82: register int branchlength = 0;
! 83: register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
! 84:
! 85: if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
! 86:
! 87: /* Scan along the opcodes for this branch. If we get to the end of the
! 88: branch, check the length against that of the other branches. */
! 89:
! 90: for (;;)
! 91: {
! 92: int d, min;
! 93: uschar *cs, *ce;
! 94: register int op = *cc;
! 95:
! 96: switch (op)
! 97: {
! 98: case OP_COND:
! 99: case OP_SCOND:
! 100:
! 101: /* If there is only one branch in a condition, the implied branch has zero
! 102: length, so we don't add anything. This covers the DEFINE "condition"
! 103: automatically. */
! 104:
! 105: cs = cc + GET(cc, 1);
! 106: if (*cs != OP_ALT)
! 107: {
! 108: cc = cs + 1 + LINK_SIZE;
! 109: break;
! 110: }
! 111:
! 112: /* Otherwise we can fall through and treat it the same as any other
! 113: subpattern. */
! 114:
! 115: case OP_CBRA:
! 116: case OP_SCBRA:
! 117: case OP_BRA:
! 118: case OP_SBRA:
! 119: case OP_ONCE:
! 120: d = find_minlength(cc, startcode, options);
! 121: if (d < 0) return d;
! 122: branchlength += d;
! 123: do cc += GET(cc, 1); while (*cc == OP_ALT);
! 124: cc += 1 + LINK_SIZE;
! 125: break;
! 126:
! 127: /* Reached end of a branch; if it's a ket it is the end of a nested
! 128: call. If it's ALT it is an alternation in a nested call. If it is
! 129: END it's the end of the outer call. All can be handled by the same code. */
! 130:
! 131: case OP_ALT:
! 132: case OP_KET:
! 133: case OP_KETRMAX:
! 134: case OP_KETRMIN:
! 135: case OP_END:
! 136: if (length < 0 || (!had_recurse && branchlength < length))
! 137: length = branchlength;
! 138: if (*cc != OP_ALT) return length;
! 139: cc += 1 + LINK_SIZE;
! 140: branchlength = 0;
! 141: had_recurse = FALSE;
! 142: break;
! 143:
! 144: /* Skip over assertive subpatterns */
! 145:
! 146: case OP_ASSERT:
! 147: case OP_ASSERT_NOT:
! 148: case OP_ASSERTBACK:
! 149: case OP_ASSERTBACK_NOT:
! 150: do cc += GET(cc, 1); while (*cc == OP_ALT);
! 151: /* Fall through */
! 152:
! 153: /* Skip over things that don't match chars */
! 154:
! 155: case OP_REVERSE:
! 156: case OP_CREF:
! 157: case OP_NCREF:
! 158: case OP_RREF:
! 159: case OP_NRREF:
! 160: case OP_DEF:
! 161: case OP_OPT:
! 162: case OP_CALLOUT:
! 163: case OP_SOD:
! 164: case OP_SOM:
! 165: case OP_EOD:
! 166: case OP_EODN:
! 167: case OP_CIRC:
! 168: case OP_DOLL:
! 169: case OP_NOT_WORD_BOUNDARY:
! 170: case OP_WORD_BOUNDARY:
! 171: cc += _pcre_OP_lengths[*cc];
! 172: break;
! 173:
! 174: /* Skip over a subpattern that has a {0} or {0,x} quantifier */
! 175:
! 176: case OP_BRAZERO:
! 177: case OP_BRAMINZERO:
! 178: case OP_SKIPZERO:
! 179: cc += _pcre_OP_lengths[*cc];
! 180: do cc += GET(cc, 1); while (*cc == OP_ALT);
! 181: cc += 1 + LINK_SIZE;
! 182: break;
! 183:
! 184: /* Handle literal characters and + repetitions */
! 185:
! 186: case OP_CHAR:
! 187: case OP_CHARNC:
! 188: case OP_NOT:
! 189: case OP_PLUS:
! 190: case OP_MINPLUS:
! 191: case OP_POSPLUS:
! 192: case OP_NOTPLUS:
! 193: case OP_NOTMINPLUS:
! 194: case OP_NOTPOSPLUS:
! 195: branchlength++;
! 196: cc += 2;
! 197: #ifdef SUPPORT_UTF8
! 198: if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
! 199: #endif
! 200: break;
! 201:
! 202: case OP_TYPEPLUS:
! 203: case OP_TYPEMINPLUS:
! 204: case OP_TYPEPOSPLUS:
! 205: branchlength++;
! 206: cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
! 207: break;
! 208:
! 209: /* Handle exact repetitions. The count is already in characters, but we
! 210: need to skip over a multibyte character in UTF8 mode. */
! 211:
! 212: case OP_EXACT:
! 213: case OP_NOTEXACT:
! 214: branchlength += GET2(cc,1);
! 215: cc += 4;
! 216: #ifdef SUPPORT_UTF8
! 217: if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
! 218: #endif
! 219: break;
! 220:
! 221: case OP_TYPEEXACT:
! 222: branchlength += GET2(cc,1);
! 223: cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
! 224: break;
! 225:
! 226: /* Handle single-char non-literal matchers */
! 227:
! 228: case OP_PROP:
! 229: case OP_NOTPROP:
! 230: cc += 2;
! 231: /* Fall through */
! 232:
! 233: case OP_NOT_DIGIT:
! 234: case OP_DIGIT:
! 235: case OP_NOT_WHITESPACE:
! 236: case OP_WHITESPACE:
! 237: case OP_NOT_WORDCHAR:
! 238: case OP_WORDCHAR:
! 239: case OP_ANY:
! 240: case OP_ALLANY:
! 241: case OP_EXTUNI:
! 242: case OP_HSPACE:
! 243: case OP_NOT_HSPACE:
! 244: case OP_VSPACE:
! 245: case OP_NOT_VSPACE:
! 246: branchlength++;
! 247: cc++;
! 248: break;
! 249:
! 250: /* "Any newline" might match two characters */
! 251:
! 252: case OP_ANYNL:
! 253: branchlength += 2;
! 254: cc++;
! 255: break;
! 256:
! 257: /* The single-byte matcher means we can't proceed in UTF-8 mode */
! 258:
! 259: case OP_ANYBYTE:
! 260: #ifdef SUPPORT_UTF8
! 261: if (utf8) return -1;
! 262: #endif
! 263: branchlength++;
! 264: cc++;
! 265: break;
! 266:
! 267: /* For repeated character types, we have to test for \p and \P, which have
! 268: an extra two bytes of parameters. */
! 269:
! 270: case OP_TYPESTAR:
! 271: case OP_TYPEMINSTAR:
! 272: case OP_TYPEQUERY:
! 273: case OP_TYPEMINQUERY:
! 274: case OP_TYPEPOSSTAR:
! 275: case OP_TYPEPOSQUERY:
! 276: if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
! 277: cc += _pcre_OP_lengths[op];
! 278: break;
! 279:
! 280: case OP_TYPEUPTO:
! 281: case OP_TYPEMINUPTO:
! 282: case OP_TYPEPOSUPTO:
! 283: if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
! 284: cc += _pcre_OP_lengths[op];
! 285: break;
! 286:
! 287: /* Check a class for variable quantification */
! 288:
! 289: #ifdef SUPPORT_UTF8
! 290: case OP_XCLASS:
! 291: cc += GET(cc, 1) - 33;
! 292: /* Fall through */
! 293: #endif
! 294:
! 295: case OP_CLASS:
! 296: case OP_NCLASS:
! 297: cc += 33;
! 298:
! 299: switch (*cc)
! 300: {
! 301: case OP_CRPLUS:
! 302: case OP_CRMINPLUS:
! 303: branchlength++;
! 304: /* Fall through */
! 305:
! 306: case OP_CRSTAR:
! 307: case OP_CRMINSTAR:
! 308: case OP_CRQUERY:
! 309: case OP_CRMINQUERY:
! 310: cc++;
! 311: break;
! 312:
! 313: case OP_CRRANGE:
! 314: case OP_CRMINRANGE:
! 315: branchlength += GET2(cc,1);
! 316: cc += 5;
! 317: break;
! 318:
! 319: default:
! 320: branchlength++;
! 321: break;
! 322: }
! 323: break;
! 324:
! 325: /* Backreferences and subroutine calls are treated in the same way: we find
! 326: the minimum length for the subpattern. A recursion, however, causes an
! 327: a flag to be set that causes the length of this branch to be ignored. The
! 328: logic is that a recursion can only make sense if there is another
! 329: alternation that stops the recursing. That will provide the minimum length
! 330: (when no recursion happens). A backreference within the group that it is
! 331: referencing behaves in the same way.
! 332:
! 333: If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
! 334: matches an empty string (by default it causes a matching failure), so in
! 335: that case we must set the minimum length to zero. */
! 336:
! 337: case OP_REF:
! 338: if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
! 339: {
! 340: ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
! 341: if (cs == NULL) return -2;
! 342: do ce += GET(ce, 1); while (*ce == OP_ALT);
! 343: if (cc > cs && cc < ce)
! 344: {
! 345: d = 0;
! 346: had_recurse = TRUE;
! 347: }
! 348: else d = find_minlength(cs, startcode, options);
! 349: }
! 350: else d = 0;
! 351: cc += 3;
! 352:
! 353: /* Handle repeated back references */
! 354:
! 355: switch (*cc)
! 356: {
! 357: case OP_CRSTAR:
! 358: case OP_CRMINSTAR:
! 359: case OP_CRQUERY:
! 360: case OP_CRMINQUERY:
! 361: min = 0;
! 362: cc++;
! 363: break;
! 364:
! 365: case OP_CRRANGE:
! 366: case OP_CRMINRANGE:
! 367: min = GET2(cc, 1);
! 368: cc += 5;
! 369: break;
! 370:
! 371: default:
! 372: min = 1;
! 373: break;
! 374: }
! 375:
! 376: branchlength += min * d;
! 377: break;
! 378:
! 379: case OP_RECURSE:
! 380: cs = ce = (uschar *)startcode + GET(cc, 1);
! 381: if (cs == NULL) return -2;
! 382: do ce += GET(ce, 1); while (*ce == OP_ALT);
! 383: if (cc > cs && cc < ce)
! 384: had_recurse = TRUE;
! 385: else
! 386: branchlength += find_minlength(cs, startcode, options);
! 387: cc += 1 + LINK_SIZE;
! 388: break;
! 389:
! 390: /* Anything else does not or need not match a character. We can get the
! 391: item's length from the table, but for those that can match zero occurrences
! 392: of a character, we must take special action for UTF-8 characters. */
! 393:
! 394: case OP_UPTO:
! 395: case OP_NOTUPTO:
! 396: case OP_MINUPTO:
! 397: case OP_NOTMINUPTO:
! 398: case OP_POSUPTO:
! 399: case OP_STAR:
! 400: case OP_MINSTAR:
! 401: case OP_NOTMINSTAR:
! 402: case OP_POSSTAR:
! 403: case OP_NOTPOSSTAR:
! 404: case OP_QUERY:
! 405: case OP_MINQUERY:
! 406: case OP_NOTMINQUERY:
! 407: case OP_POSQUERY:
! 408: case OP_NOTPOSQUERY:
! 409: cc += _pcre_OP_lengths[op];
! 410: #ifdef SUPPORT_UTF8
! 411: if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
! 412: #endif
! 413: break;
! 414:
! 415: /* Skip these, but we need to add in the name length. */
! 416:
! 417: case OP_MARK:
! 418: case OP_PRUNE_ARG:
! 419: case OP_SKIP_ARG:
! 420: cc += _pcre_OP_lengths[op] + cc[1];
! 421: break;
! 422:
! 423: case OP_THEN_ARG:
! 424: cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
! 425: break;
! 426:
! 427: /* For the record, these are the opcodes that are matched by "default":
! 428: OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
! 429: OP_THEN. */
! 430:
! 431: default:
! 432: cc += _pcre_OP_lengths[op];
! 433: break;
! 434: }
! 435: }
! 436: /* Control never gets here */
! 437: }
! 438:
! 439:
! 440:
! 441: /*************************************************
! 442: * Set a bit and maybe its alternate case *
! 443: *************************************************/
! 444:
! 445: /* Given a character, set its first byte's bit in the table, and also the
! 446: corresponding bit for the other version of a letter if we are caseless. In
! 447: UTF-8 mode, for characters greater than 127, we can only do the caseless thing
! 448: when Unicode property support is available.
! 449:
! 450: Arguments:
! 451: start_bits points to the bit map
! 452: p points to the character
! 453: caseless the caseless flag
! 454: cd the block with char table pointers
! 455: utf8 TRUE for UTF-8 mode
! 456:
! 457: Returns: pointer after the character
! 458: */
! 459:
! 460: static const uschar *
! 461: set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
! 462: compile_data *cd, BOOL utf8)
! 463: {
! 464: unsigned int c = *p;
! 465:
! 466: SET_BIT(c);
! 467:
! 468: #ifdef SUPPORT_UTF8
! 469: if (utf8 && c > 127)
! 470: {
! 471: GETCHARINC(c, p);
! 472: #ifdef SUPPORT_UCP
! 473: if (caseless)
! 474: {
! 475: uschar buff[8];
! 476: c = UCD_OTHERCASE(c);
! 477: (void)_pcre_ord2utf8(c, buff);
! 478: SET_BIT(buff[0]);
! 479: }
! 480: #endif
! 481: return p;
! 482: }
! 483: #endif
! 484:
! 485: /* Not UTF-8 mode, or character is less than 127. */
! 486:
! 487: if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
! 488: return p + 1;
! 489: }
! 490:
! 491:
! 492:
! 493: /*************************************************
! 494: * Set bits for a positive character type *
! 495: *************************************************/
! 496:
! 497: /* This function sets starting bits for a character type. In UTF-8 mode, we can
! 498: only do a direct setting for bytes less than 128, as otherwise there can be
! 499: confusion with bytes in the middle of UTF-8 characters. In a "traditional"
! 500: environment, the tables will only recognize ASCII characters anyway, but in at
! 501: least one Windows environment, some higher bytes bits were set in the tables.
! 502: So we deal with that case by considering the UTF-8 encoding.
! 503:
! 504: Arguments:
! 505: start_bits the starting bitmap
! 506: cbit type the type of character wanted
! 507: table_limit 32 for non-UTF-8; 16 for UTF-8
! 508: cd the block with char table pointers
! 509:
! 510: Returns: nothing
! 511: */
! 512:
! 513: static void
! 514: set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
! 515: compile_data *cd)
! 516: {
! 517: register int c;
! 518: for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
! 519: if (table_limit == 32) return;
! 520: for (c = 128; c < 256; c++)
! 521: {
! 522: if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
! 523: {
! 524: uschar buff[8];
! 525: (void)_pcre_ord2utf8(c, buff);
! 526: SET_BIT(buff[0]);
! 527: }
! 528: }
! 529: }
! 530:
! 531:
! 532: /*************************************************
! 533: * Set bits for a negative character type *
! 534: *************************************************/
! 535:
! 536: /* This function sets starting bits for a negative character type such as \D.
! 537: In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
! 538: otherwise there can be confusion with bytes in the middle of UTF-8 characters.
! 539: Unlike in the positive case, where we can set appropriate starting bits for
! 540: specific high-valued UTF-8 characters, in this case we have to set the bits for
! 541: all high-valued characters. The lowest is 0xc2, but we overkill by starting at
! 542: 0xc0 (192) for simplicity.
! 543:
! 544: Arguments:
! 545: start_bits the starting bitmap
! 546: cbit type the type of character wanted
! 547: table_limit 32 for non-UTF-8; 16 for UTF-8
! 548: cd the block with char table pointers
! 549:
! 550: Returns: nothing
! 551: */
! 552:
! 553: static void
! 554: set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
! 555: compile_data *cd)
! 556: {
! 557: register int c;
! 558: for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
! 559: if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
! 560: }
! 561:
! 562:
! 563:
! 564: /*************************************************
! 565: * Create bitmap of starting bytes *
! 566: *************************************************/
! 567:
! 568: /* This function scans a compiled unanchored expression recursively and
! 569: attempts to build a bitmap of the set of possible starting bytes. As time goes
! 570: by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
! 571: useful for parenthesized groups in patterns such as (a*)b where the group
! 572: provides some optional starting bytes but scanning must continue at the outer
! 573: level to find at least one mandatory byte. At the outermost level, this
! 574: function fails unless the result is SSB_DONE.
! 575:
! 576: Arguments:
! 577: code points to an expression
! 578: start_bits points to a 32-byte table, initialized to 0
! 579: caseless the current state of the caseless flag
! 580: utf8 TRUE if in UTF-8 mode
! 581: cd the block with char table pointers
! 582:
! 583: Returns: SSB_FAIL => Failed to find any starting bytes
! 584: SSB_DONE => Found mandatory starting bytes
! 585: SSB_CONTINUE => Found optional starting bytes
! 586: */
! 587:
! 588: static int
! 589: set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
! 590: BOOL utf8, compile_data *cd)
! 591: {
! 592: register int c;
! 593: int yield = SSB_DONE;
! 594: int table_limit = utf8? 16:32;
! 595:
! 596: #if 0
! 597: /* ========================================================================= */
! 598: /* The following comment and code was inserted in January 1999. In May 2006,
! 599: when it was observed to cause compiler warnings about unused values, I took it
! 600: out again. If anybody is still using OS/2, they will have to put it back
! 601: manually. */
! 602:
! 603: /* This next statement and the later reference to dummy are here in order to
! 604: trick the optimizer of the IBM C compiler for OS/2 into generating correct
! 605: code. Apparently IBM isn't going to fix the problem, and we would rather not
! 606: disable optimization (in this module it actually makes a big difference, and
! 607: the pcre module can use all the optimization it can get). */
! 608:
! 609: volatile int dummy;
! 610: /* ========================================================================= */
! 611: #endif
! 612:
! 613: do
! 614: {
! 615: const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
! 616: BOOL try_next = TRUE;
! 617:
! 618: while (try_next) /* Loop for items in this branch */
! 619: {
! 620: int rc;
! 621: switch(*tcode)
! 622: {
! 623: /* Fail if we reach something we don't understand */
! 624:
! 625: default:
! 626: return SSB_FAIL;
! 627:
! 628: /* If we hit a bracket or a positive lookahead assertion, recurse to set
! 629: bits from within the subpattern. If it can't find anything, we have to
! 630: give up. If it finds some mandatory character(s), we are done for this
! 631: branch. Otherwise, carry on scanning after the subpattern. */
! 632:
! 633: case OP_BRA:
! 634: case OP_SBRA:
! 635: case OP_CBRA:
! 636: case OP_SCBRA:
! 637: case OP_ONCE:
! 638: case OP_ASSERT:
! 639: rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
! 640: if (rc == SSB_FAIL) return SSB_FAIL;
! 641: if (rc == SSB_DONE) try_next = FALSE; else
! 642: {
! 643: do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
! 644: tcode += 1 + LINK_SIZE;
! 645: }
! 646: break;
! 647:
! 648: /* If we hit ALT or KET, it means we haven't found anything mandatory in
! 649: this branch, though we might have found something optional. For ALT, we
! 650: continue with the next alternative, but we have to arrange that the final
! 651: result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
! 652: return SSB_CONTINUE: if this is the top level, that indicates failure,
! 653: but after a nested subpattern, it causes scanning to continue. */
! 654:
! 655: case OP_ALT:
! 656: yield = SSB_CONTINUE;
! 657: try_next = FALSE;
! 658: break;
! 659:
! 660: case OP_KET:
! 661: case OP_KETRMAX:
! 662: case OP_KETRMIN:
! 663: return SSB_CONTINUE;
! 664:
! 665: /* Skip over callout */
! 666:
! 667: case OP_CALLOUT:
! 668: tcode += 2 + 2*LINK_SIZE;
! 669: break;
! 670:
! 671: /* Skip over lookbehind and negative lookahead assertions */
! 672:
! 673: case OP_ASSERT_NOT:
! 674: case OP_ASSERTBACK:
! 675: case OP_ASSERTBACK_NOT:
! 676: do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
! 677: tcode += 1 + LINK_SIZE;
! 678: break;
! 679:
! 680: /* Skip over an option setting, changing the caseless flag */
! 681:
! 682: case OP_OPT:
! 683: caseless = (tcode[1] & PCRE_CASELESS) != 0;
! 684: tcode += 2;
! 685: break;
! 686:
! 687: /* BRAZERO does the bracket, but carries on. */
! 688:
! 689: case OP_BRAZERO:
! 690: case OP_BRAMINZERO:
! 691: if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
! 692: return SSB_FAIL;
! 693: /* =========================================================================
! 694: See the comment at the head of this function concerning the next line,
! 695: which was an old fudge for the benefit of OS/2.
! 696: dummy = 1;
! 697: ========================================================================= */
! 698: do tcode += GET(tcode,1); while (*tcode == OP_ALT);
! 699: tcode += 1 + LINK_SIZE;
! 700: break;
! 701:
! 702: /* SKIPZERO skips the bracket. */
! 703:
! 704: case OP_SKIPZERO:
! 705: tcode++;
! 706: do tcode += GET(tcode,1); while (*tcode == OP_ALT);
! 707: tcode += 1 + LINK_SIZE;
! 708: break;
! 709:
! 710: /* Single-char * or ? sets the bit and tries the next item */
! 711:
! 712: case OP_STAR:
! 713: case OP_MINSTAR:
! 714: case OP_POSSTAR:
! 715: case OP_QUERY:
! 716: case OP_MINQUERY:
! 717: case OP_POSQUERY:
! 718: tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
! 719: break;
! 720:
! 721: /* Single-char upto sets the bit and tries the next */
! 722:
! 723: case OP_UPTO:
! 724: case OP_MINUPTO:
! 725: case OP_POSUPTO:
! 726: tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
! 727: break;
! 728:
! 729: /* At least one single char sets the bit and stops */
! 730:
! 731: case OP_EXACT: /* Fall through */
! 732: tcode += 2;
! 733:
! 734: case OP_CHAR:
! 735: case OP_CHARNC:
! 736: case OP_PLUS:
! 737: case OP_MINPLUS:
! 738: case OP_POSPLUS:
! 739: (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
! 740: try_next = FALSE;
! 741: break;
! 742:
! 743: /* Special spacing and line-terminating items. These recognize specific
! 744: lists of characters. The difference between VSPACE and ANYNL is that the
! 745: latter can match the two-character CRLF sequence, but that is not
! 746: relevant for finding the first character, so their code here is
! 747: identical. */
! 748:
! 749: case OP_HSPACE:
! 750: SET_BIT(0x09);
! 751: SET_BIT(0x20);
! 752: if (utf8)
! 753: {
! 754: SET_BIT(0xC2); /* For U+00A0 */
! 755: SET_BIT(0xE1); /* For U+1680, U+180E */
! 756: SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
! 757: SET_BIT(0xE3); /* For U+3000 */
! 758: }
! 759: else SET_BIT(0xA0);
! 760: try_next = FALSE;
! 761: break;
! 762:
! 763: case OP_ANYNL:
! 764: case OP_VSPACE:
! 765: SET_BIT(0x0A);
! 766: SET_BIT(0x0B);
! 767: SET_BIT(0x0C);
! 768: SET_BIT(0x0D);
! 769: if (utf8)
! 770: {
! 771: SET_BIT(0xC2); /* For U+0085 */
! 772: SET_BIT(0xE2); /* For U+2028, U+2029 */
! 773: }
! 774: else SET_BIT(0x85);
! 775: try_next = FALSE;
! 776: break;
! 777:
! 778: /* Single character types set the bits and stop. Note that if PCRE_UCP
! 779: is set, we do not see these op codes because \d etc are converted to
! 780: properties. Therefore, these apply in the case when only characters less
! 781: than 256 are recognized to match the types. */
! 782:
! 783: case OP_NOT_DIGIT:
! 784: set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
! 785: try_next = FALSE;
! 786: break;
! 787:
! 788: case OP_DIGIT:
! 789: set_type_bits(start_bits, cbit_digit, table_limit, cd);
! 790: try_next = FALSE;
! 791: break;
! 792:
! 793: /* The cbit_space table has vertical tab as whitespace; we have to
! 794: ensure it is set as not whitespace. */
! 795:
! 796: case OP_NOT_WHITESPACE:
! 797: set_nottype_bits(start_bits, cbit_space, table_limit, cd);
! 798: start_bits[1] |= 0x08;
! 799: try_next = FALSE;
! 800: break;
! 801:
! 802: /* The cbit_space table has vertical tab as whitespace; we have to
! 803: not set it from the table. */
! 804:
! 805: case OP_WHITESPACE:
! 806: c = start_bits[1]; /* Save in case it was already set */
! 807: set_type_bits(start_bits, cbit_space, table_limit, cd);
! 808: start_bits[1] = (start_bits[1] & ~0x08) | c;
! 809: try_next = FALSE;
! 810: break;
! 811:
! 812: case OP_NOT_WORDCHAR:
! 813: set_nottype_bits(start_bits, cbit_word, table_limit, cd);
! 814: try_next = FALSE;
! 815: break;
! 816:
! 817: case OP_WORDCHAR:
! 818: set_type_bits(start_bits, cbit_word, table_limit, cd);
! 819: try_next = FALSE;
! 820: break;
! 821:
! 822: /* One or more character type fudges the pointer and restarts, knowing
! 823: it will hit a single character type and stop there. */
! 824:
! 825: case OP_TYPEPLUS:
! 826: case OP_TYPEMINPLUS:
! 827: case OP_TYPEPOSPLUS:
! 828: tcode++;
! 829: break;
! 830:
! 831: case OP_TYPEEXACT:
! 832: tcode += 3;
! 833: break;
! 834:
! 835: /* Zero or more repeats of character types set the bits and then
! 836: try again. */
! 837:
! 838: case OP_TYPEUPTO:
! 839: case OP_TYPEMINUPTO:
! 840: case OP_TYPEPOSUPTO:
! 841: tcode += 2; /* Fall through */
! 842:
! 843: case OP_TYPESTAR:
! 844: case OP_TYPEMINSTAR:
! 845: case OP_TYPEPOSSTAR:
! 846: case OP_TYPEQUERY:
! 847: case OP_TYPEMINQUERY:
! 848: case OP_TYPEPOSQUERY:
! 849: switch(tcode[1])
! 850: {
! 851: default:
! 852: case OP_ANY:
! 853: case OP_ALLANY:
! 854: return SSB_FAIL;
! 855:
! 856: case OP_HSPACE:
! 857: SET_BIT(0x09);
! 858: SET_BIT(0x20);
! 859: if (utf8)
! 860: {
! 861: SET_BIT(0xC2); /* For U+00A0 */
! 862: SET_BIT(0xE1); /* For U+1680, U+180E */
! 863: SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
! 864: SET_BIT(0xE3); /* For U+3000 */
! 865: }
! 866: else SET_BIT(0xA0);
! 867: break;
! 868:
! 869: case OP_ANYNL:
! 870: case OP_VSPACE:
! 871: SET_BIT(0x0A);
! 872: SET_BIT(0x0B);
! 873: SET_BIT(0x0C);
! 874: SET_BIT(0x0D);
! 875: if (utf8)
! 876: {
! 877: SET_BIT(0xC2); /* For U+0085 */
! 878: SET_BIT(0xE2); /* For U+2028, U+2029 */
! 879: }
! 880: else SET_BIT(0x85);
! 881: break;
! 882:
! 883: case OP_NOT_DIGIT:
! 884: set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
! 885: break;
! 886:
! 887: case OP_DIGIT:
! 888: set_type_bits(start_bits, cbit_digit, table_limit, cd);
! 889: break;
! 890:
! 891: /* The cbit_space table has vertical tab as whitespace; we have to
! 892: ensure it gets set as not whitespace. */
! 893:
! 894: case OP_NOT_WHITESPACE:
! 895: set_nottype_bits(start_bits, cbit_space, table_limit, cd);
! 896: start_bits[1] |= 0x08;
! 897: break;
! 898:
! 899: /* The cbit_space table has vertical tab as whitespace; we have to
! 900: avoid setting it. */
! 901:
! 902: case OP_WHITESPACE:
! 903: c = start_bits[1]; /* Save in case it was already set */
! 904: set_type_bits(start_bits, cbit_space, table_limit, cd);
! 905: start_bits[1] = (start_bits[1] & ~0x08) | c;
! 906: break;
! 907:
! 908: case OP_NOT_WORDCHAR:
! 909: set_nottype_bits(start_bits, cbit_word, table_limit, cd);
! 910: break;
! 911:
! 912: case OP_WORDCHAR:
! 913: set_type_bits(start_bits, cbit_word, table_limit, cd);
! 914: break;
! 915: }
! 916:
! 917: tcode += 2;
! 918: break;
! 919:
! 920: /* Character class where all the information is in a bit map: set the
! 921: bits and either carry on or not, according to the repeat count. If it was
! 922: a negative class, and we are operating with UTF-8 characters, any byte
! 923: with a value >= 0xc4 is a potentially valid starter because it starts a
! 924: character with a value > 255. */
! 925:
! 926: case OP_NCLASS:
! 927: #ifdef SUPPORT_UTF8
! 928: if (utf8)
! 929: {
! 930: start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
! 931: memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
! 932: }
! 933: #endif
! 934: /* Fall through */
! 935:
! 936: case OP_CLASS:
! 937: {
! 938: tcode++;
! 939:
! 940: /* In UTF-8 mode, the bits in a bit map correspond to character
! 941: values, not to byte values. However, the bit map we are constructing is
! 942: for byte values. So we have to do a conversion for characters whose
! 943: value is > 127. In fact, there are only two possible starting bytes for
! 944: characters in the range 128 - 255. */
! 945:
! 946: #ifdef SUPPORT_UTF8
! 947: if (utf8)
! 948: {
! 949: for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
! 950: for (c = 128; c < 256; c++)
! 951: {
! 952: if ((tcode[c/8] && (1 << (c&7))) != 0)
! 953: {
! 954: int d = (c >> 6) | 0xc0; /* Set bit for this starter */
! 955: start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
! 956: c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
! 957: }
! 958: }
! 959: }
! 960:
! 961: /* In non-UTF-8 mode, the two bit maps are completely compatible. */
! 962:
! 963: else
! 964: #endif
! 965: {
! 966: for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
! 967: }
! 968:
! 969: /* Advance past the bit map, and act on what follows */
! 970:
! 971: tcode += 32;
! 972: switch (*tcode)
! 973: {
! 974: case OP_CRSTAR:
! 975: case OP_CRMINSTAR:
! 976: case OP_CRQUERY:
! 977: case OP_CRMINQUERY:
! 978: tcode++;
! 979: break;
! 980:
! 981: case OP_CRRANGE:
! 982: case OP_CRMINRANGE:
! 983: if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
! 984: else try_next = FALSE;
! 985: break;
! 986:
! 987: default:
! 988: try_next = FALSE;
! 989: break;
! 990: }
! 991: }
! 992: break; /* End of bitmap class handling */
! 993:
! 994: } /* End of switch */
! 995: } /* End of try_next loop */
! 996:
! 997: code += GET(code, 1); /* Advance to next branch */
! 998: }
! 999: while (*code == OP_ALT);
! 1000: return yield;
! 1001: }
! 1002:
! 1003:
! 1004:
! 1005: /*************************************************
! 1006: * Study a compiled expression *
! 1007: *************************************************/
! 1008:
! 1009: /* This function is handed a compiled expression that it must study to produce
! 1010: information that will speed up the matching. It returns a pcre_extra block
! 1011: which then gets handed back to pcre_exec().
! 1012:
! 1013: Arguments:
! 1014: re points to the compiled expression
! 1015: options contains option bits
! 1016: errorptr points to where to place error messages;
! 1017: set NULL unless error
! 1018:
! 1019: Returns: pointer to a pcre_extra block, with study_data filled in and the
! 1020: appropriate flags set;
! 1021: NULL on error or if no optimization possible
! 1022: */
! 1023:
! 1024: PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
! 1025: pcre_study(const pcre *external_re, int options, const char **errorptr)
! 1026: {
! 1027: int min;
! 1028: BOOL bits_set = FALSE;
! 1029: uschar start_bits[32];
! 1030: pcre_extra *extra;
! 1031: pcre_study_data *study;
! 1032: const uschar *tables;
! 1033: uschar *code;
! 1034: compile_data compile_block;
! 1035: const real_pcre *re = (const real_pcre *)external_re;
! 1036:
! 1037: *errorptr = NULL;
! 1038:
! 1039: if (re == NULL || re->magic_number != MAGIC_NUMBER)
! 1040: {
! 1041: *errorptr = "argument is not a compiled regular expression";
! 1042: return NULL;
! 1043: }
! 1044:
! 1045: if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
! 1046: {
! 1047: *errorptr = "unknown or incorrect option bit(s) set";
! 1048: return NULL;
! 1049: }
! 1050:
! 1051: code = (uschar *)re + re->name_table_offset +
! 1052: (re->name_count * re->name_entry_size);
! 1053:
! 1054: /* For an anchored pattern, or an unanchored pattern that has a first char, or
! 1055: a multiline pattern that matches only at "line starts", there is no point in
! 1056: seeking a list of starting bytes. */
! 1057:
! 1058: if ((re->options & PCRE_ANCHORED) == 0 &&
! 1059: (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
! 1060: {
! 1061: /* Set the character tables in the block that is passed around */
! 1062:
! 1063: tables = re->tables;
! 1064: if (tables == NULL)
! 1065: (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
! 1066: (void *)(&tables));
! 1067:
! 1068: compile_block.lcc = tables + lcc_offset;
! 1069: compile_block.fcc = tables + fcc_offset;
! 1070: compile_block.cbits = tables + cbits_offset;
! 1071: compile_block.ctypes = tables + ctypes_offset;
! 1072:
! 1073: /* See if we can find a fixed set of initial characters for the pattern. */
! 1074:
! 1075: memset(start_bits, 0, 32 * sizeof(uschar));
! 1076: bits_set = set_start_bits(code, start_bits,
! 1077: (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
! 1078: &compile_block) == SSB_DONE;
! 1079: }
! 1080:
! 1081: /* Find the minimum length of subject string. */
! 1082:
! 1083: min = find_minlength(code, code, re->options);
! 1084:
! 1085: /* Return NULL if no optimization is possible. */
! 1086:
! 1087: if (!bits_set && min < 0) return NULL;
! 1088:
! 1089: /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
! 1090: the latter, which is pointed to by the former, which may also get additional
! 1091: data set later by the calling program. At the moment, the size of
! 1092: pcre_study_data is fixed. We nevertheless save it in a field for returning via
! 1093: the pcre_fullinfo() function so that if it becomes variable in the future, we
! 1094: don't have to change that code. */
! 1095:
! 1096: extra = (pcre_extra *)(pcre_malloc)
! 1097: (sizeof(pcre_extra) + sizeof(pcre_study_data));
! 1098:
! 1099: if (extra == NULL)
! 1100: {
! 1101: *errorptr = "failed to get memory";
! 1102: return NULL;
! 1103: }
! 1104:
! 1105: study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
! 1106: extra->flags = PCRE_EXTRA_STUDY_DATA;
! 1107: extra->study_data = study;
! 1108:
! 1109: study->size = sizeof(pcre_study_data);
! 1110: study->flags = 0;
! 1111:
! 1112: if (bits_set)
! 1113: {
! 1114: study->flags |= PCRE_STUDY_MAPPED;
! 1115: memcpy(study->start_bits, start_bits, sizeof(start_bits));
! 1116: }
! 1117:
! 1118: if (min >= 0)
! 1119: {
! 1120: study->flags |= PCRE_STUDY_MINLEN;
! 1121: study->minlength = min;
! 1122: }
! 1123:
! 1124: return extra;
! 1125: }
! 1126:
! 1127: /* End of pcre_study.c */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>