Annotation of embedaddon/pcre/pcre_dfa_exec.c, revision 1.1
1.1 ! misho 1: /*************************************************
! 2: * Perl-Compatible Regular Expressions *
! 3: *************************************************/
! 4:
! 5: /* PCRE is a library of functions to support regular expressions whose syntax
! 6: and semantics are as close as possible to those of the Perl 5 language (but see
! 7: below for why this module is different).
! 8:
! 9: Written by Philip Hazel
! 10: Copyright (c) 1997-2011 University of Cambridge
! 11:
! 12: -----------------------------------------------------------------------------
! 13: Redistribution and use in source and binary forms, with or without
! 14: modification, are permitted provided that the following conditions are met:
! 15:
! 16: * Redistributions of source code must retain the above copyright notice,
! 17: this list of conditions and the following disclaimer.
! 18:
! 19: * Redistributions in binary form must reproduce the above copyright
! 20: notice, this list of conditions and the following disclaimer in the
! 21: documentation and/or other materials provided with the distribution.
! 22:
! 23: * Neither the name of the University of Cambridge nor the names of its
! 24: contributors may be used to endorse or promote products derived from
! 25: this software without specific prior written permission.
! 26:
! 27: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
! 28: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
! 29: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
! 30: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
! 31: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! 32: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! 33: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! 34: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! 35: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! 36: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! 37: POSSIBILITY OF SUCH DAMAGE.
! 38: -----------------------------------------------------------------------------
! 39: */
! 40:
! 41:
! 42: /* This module contains the external function pcre_dfa_exec(), which is an
! 43: alternative matching function that uses a sort of DFA algorithm (not a true
! 44: FSM). This is NOT Perl-compatible, but it has advantages in certain
! 45: applications. */
! 46:
! 47:
! 48: /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
! 49: the performance of his patterns greatly. I could not use it as it stood, as it
! 50: was not thread safe, and made assumptions about pattern sizes. Also, it caused
! 51: test 7 to loop, and test 9 to crash with a segfault.
! 52:
! 53: The issue is the check for duplicate states, which is done by a simple linear
! 54: search up the state list. (Grep for "duplicate" below to find the code.) For
! 55: many patterns, there will never be many states active at one time, so a simple
! 56: linear search is fine. In patterns that have many active states, it might be a
! 57: bottleneck. The suggested code used an indexing scheme to remember which states
! 58: had previously been used for each character, and avoided the linear search when
! 59: it knew there was no chance of a duplicate. This was implemented when adding
! 60: states to the state lists.
! 61:
! 62: I wrote some thread-safe, not-limited code to try something similar at the time
! 63: of checking for duplicates (instead of when adding states), using index vectors
! 64: on the stack. It did give a 13% improvement with one specially constructed
! 65: pattern for certain subject strings, but on other strings and on many of the
! 66: simpler patterns in the test suite it did worse. The major problem, I think,
! 67: was the extra time to initialize the index. This had to be done for each call
! 68: of internal_dfa_exec(). (The supplied patch used a static vector, initialized
! 69: only once - I suspect this was the cause of the problems with the tests.)
! 70:
! 71: Overall, I concluded that the gains in some cases did not outweigh the losses
! 72: in others, so I abandoned this code. */
! 73:
! 74:
! 75:
! 76: #ifdef HAVE_CONFIG_H
! 77: #include "config.h"
! 78: #endif
! 79:
! 80: #define NLBLOCK md /* Block containing newline information */
! 81: #define PSSTART start_subject /* Field containing processed string start */
! 82: #define PSEND end_subject /* Field containing processed string end */
! 83:
! 84: #include "pcre_internal.h"
! 85:
! 86:
! 87: /* For use to indent debugging output */
! 88:
! 89: #define SP " "
! 90:
! 91:
! 92: /*************************************************
! 93: * Code parameters and static tables *
! 94: *************************************************/
! 95:
! 96: /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
! 97: into others, under special conditions. A gap of 20 between the blocks should be
! 98: enough. The resulting opcodes don't have to be less than 256 because they are
! 99: never stored, so we push them well clear of the normal opcodes. */
! 100:
! 101: #define OP_PROP_EXTRA 300
! 102: #define OP_EXTUNI_EXTRA 320
! 103: #define OP_ANYNL_EXTRA 340
! 104: #define OP_HSPACE_EXTRA 360
! 105: #define OP_VSPACE_EXTRA 380
! 106:
! 107:
! 108: /* coptable is indexed by opcode. A non-zero entry marks an opcode that is
! 109: followed by an inline character to be tested, and gives the offset from the
! 110: opcode at which that character is found; zero means there is no such character.
! 111: Keeping this in a table lets the character be loaded in one central place. For
! 112: the Type * etc opcodes the "character" is really the opcode for \D, \d, \S, \s,
! 113: \W, or \w, which is always a small value. ***NOTE*** If the start of this table
! 114: is modified, the three tables that follow must also be modified. */
! 115: /* The table length is checked against OP_TABLE_LENGTH at compile time by a pair of case labels in the opcode switch. */
! 116: static const uschar coptable[] = {
! 117: 0, /* End */
! 118: 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
! 119: 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
! 120: 0, 0, 0, /* Any, AllAny, Anybyte */
! 121: 0, 0, /* \P, \p */
! 122: 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
! 123: 0, /* \X */
! 124: 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
! 125: 1, /* Char */
! 126: 1, /* Chari */
! 127: 1, /* not */
! 128: 1, /* noti */
! 129: /* Positive single-char repeats */
! 130: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
! 131: 3, 3, 3, /* upto, minupto, exact */
! 132: 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
! 133: 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
! 134: 3, 3, 3, /* upto I, minupto I, exact I */
! 135: 1, 1, 1, 3, /* *+I, ++I, ?+I, upto+I */
! 136: /* Negative single-char repeats - only for chars < 256 */
! 137: 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
! 138: 3, 3, 3, /* NOT upto, minupto, exact */
! 139: 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
! 140: 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
! 141: 3, 3, 3, /* NOT upto I, minupto I, exact I */
! 142: 1, 1, 1, 3, /* NOT *+I, ++I, ?+I, upto+I */
! 143: /* Positive type repeats */
! 144: 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
! 145: 3, 3, 3, /* Type upto, minupto, exact */
! 146: 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
! 147: /* Character class & ref repeats */
! 148: 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
! 149: 0, 0, /* CRRANGE, CRMINRANGE */
! 150: 0, /* CLASS */
! 151: 0, /* NCLASS */
! 152: 0, /* XCLASS - variable length */
! 153: 0, /* REF */
! 154: 0, /* REFI */
! 155: 0, /* RECURSE */
! 156: 0, /* CALLOUT */
! 157: 0, /* Alt */
! 158: 0, /* Ket */
! 159: 0, /* KetRmax */
! 160: 0, /* KetRmin */
! 161: 0, /* KetRpos */
! 162: 0, /* Reverse */
! 163: 0, /* Assert */
! 164: 0, /* Assert not */
! 165: 0, /* Assert behind */
! 166: 0, /* Assert behind not */
! 167: 0, 0, /* ONCE, ONCE_NC */
! 168: 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
! 169: 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
! 170: 0, 0, /* CREF, NCREF */
! 171: 0, 0, /* RREF, NRREF */
! 172: 0, /* DEF */
! 173: 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
! 174: 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
! 175: 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
! 176: 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
! 177: 0, 0 /* CLOSE, SKIPZERO */
! 178: };
! 179:
! 180: /* poptable is indexed by opcode; a non-zero entry marks an opcode that inspects
! 181: a subject character. When such an opcode is reached at the end of the subject,
! 182: could_continue is set for use by the partial-match test. ***NOTE*** If the start
! 183: of this table is modified, the two tables that follow must also be modified. */
! 184: /* The table length is checked against OP_TABLE_LENGTH at compile time by a pair of case labels in the opcode switch. */
! 185: static const uschar poptable[] = {
! 186: 0, /* End */
! 187: 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
! 188: 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
! 189: 1, 1, 1, /* Any, AllAny, Anybyte */
! 190: 1, 1, /* \P, \p */
! 191: 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
! 192: 1, /* \X */
! 193: 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
! 194: 1, /* Char */
! 195: 1, /* Chari */
! 196: 1, /* not */
! 197: 1, /* noti */
! 198: /* Positive single-char repeats */
! 199: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
! 200: 1, 1, 1, /* upto, minupto, exact */
! 201: 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
! 202: 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
! 203: 1, 1, 1, /* upto I, minupto I, exact I */
! 204: 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
! 205: /* Negative single-char repeats - only for chars < 256 */
! 206: 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
! 207: 1, 1, 1, /* NOT upto, minupto, exact */
! 208: 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
! 209: 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
! 210: 1, 1, 1, /* NOT upto I, minupto I, exact I */
! 211: 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
! 212: /* Positive type repeats */
! 213: 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
! 214: 1, 1, 1, /* Type upto, minupto, exact */
! 215: 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
! 216: /* Character class & ref repeats */
! 217: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
! 218: 1, 1, /* CRRANGE, CRMINRANGE */
! 219: 1, /* CLASS */
! 220: 1, /* NCLASS */
! 221: 1, /* XCLASS - variable length */
! 222: 0, /* REF */
! 223: 0, /* REFI */
! 224: 0, /* RECURSE */
! 225: 0, /* CALLOUT */
! 226: 0, /* Alt */
! 227: 0, /* Ket */
! 228: 0, /* KetRmax */
! 229: 0, /* KetRmin */
! 230: 0, /* KetRpos */
! 231: 0, /* Reverse */
! 232: 0, /* Assert */
! 233: 0, /* Assert not */
! 234: 0, /* Assert behind */
! 235: 0, /* Assert behind not */
! 236: 0, 0, /* ONCE, ONCE_NC */
! 237: 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
! 238: 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
! 239: 0, 0, /* CREF, NCREF */
! 240: 0, 0, /* RREF, NRREF */
! 241: 0, /* DEF */
! 242: 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
! 243: 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
! 244: 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
! 245: 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
! 246: 0, 0 /* CLOSE, SKIPZERO */
! 247: };
! 248:
! 249: /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
! 250: and \w. Their rows follow the opcode order used at the start of coptable. */
! 251: /* toptable1 holds the ctype bit tied to each opcode, zero where none applies. NOTE(review): the use site is outside this view; presumably a mask on the ctypes entry of the character - confirm. */
! 252: static const uschar toptable1[] = {
! 253: 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
! 254: ctype_digit, ctype_digit, /* \D, \d */
! 255: ctype_space, ctype_space, /* \S, \s */
! 256: ctype_word, ctype_word, /* \W, \w */
! 257: 0, 0 /* OP_ANY, OP_ALLANY */
! 258: };
! 259: /* toptable2 pairs with toptable1, row for row: the negated types carry the ctype bit, the positive types carry zero. NOTE(review): the use site is outside this view; presumably combined with the masked ctypes value - confirm. */
! 260: static const uschar toptable2[] = {
! 261: 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
! 262: ctype_digit, 0, /* \D, \d */
! 263: ctype_space, 0, /* \S, \s */
! 264: ctype_word, 0, /* \W, \w */
! 265: 1, 1 /* OP_ANY, OP_ALLANY */
! 266: };
! 267:
! 268:
! 269: /* One stateblock describes a single state, that is, the current data for one
! 270: active path through the match tree. Every member must be an int, because
! 271: the blocks are stored directly in the caller-supplied workspace, and that
! 272: workspace is a vector of ints. */
! 273:
! 274: typedef struct stateblock {
! 275: int offset; /* Offset of the opcode from start_code; negative = held-off state */
! 276: int count; /* Count for repeats */
! 277: int data; /* Extra data for some states, e.g. characters still to be skipped */
! 278: } stateblock;
! 279: /* Number of ints one stateblock occupies; used to work out how many states fit in the workspace. */
! 280: #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
! 281:
! 282:
! 283: #ifdef PCRE_DEBUG
! 284: /*************************************************
! 285: * Print character string *
! 286: *************************************************/
! 287:
! 288: /* Debugging aid: writes length bytes starting at p to the stream f. Bytes for
! 289: which isprint() is true are written unchanged; any other byte is written as \x plus two hex digits.
! 290: Arguments:
! 291: p points to the bytes; only length bounds the loop, so no terminator is needed
! 292: length number of bytes; nothing is printed when it is not positive
! 293: f where to print
! 294: (This function is compiled only when PCRE_DEBUG is defined.)
! 295: Returns: nothing
! 296: */
! 297:
! 298: static void
! 299: pchars(unsigned char *p, int length, FILE *f)
! 300: {
! 301: int c; /* the byte currently being printed */
! 302: while (length-- > 0)
! 303: {
! 304: if (isprint(c = *(p++)))
! 305: fprintf(f, "%c", c); /* printable: emit as is */
! 306: else
! 307: fprintf(f, "\\x%02x", c); /* anything else: hex escape */
! 308: }
! 309: }
! 310: #endif
! 311:
! 312:
! 313:
! 314: /*************************************************
! 315: * Execute a Regular Expression - DFA engine *
! 316: *************************************************/
! 317:
! 318: /* This internal function applies a compiled pattern to a subject string,
! 319: starting at a given point, using a DFA engine. This function is called from the
! 320: external one, possibly multiple times if the pattern is not anchored. The
! 321: function calls itself recursively for some kinds of subpattern.
! 322:
! 323: Arguments:
! 324: md the match_data block with fixed information
! 325: this_start_code the opening bracket of this subexpression's code
! 326: current_subject where we currently are in the subject string
! 327: start_offset start offset in the subject string
! 328: offsets vector to contain the matching string offsets
! 329: offsetcount size of same
! 330: workspace vector of workspace
! 331: wscount size of same
! 332: rlevel function call recursion level
! 333:
! 334: Returns: > 0 => number of match offset pairs placed in offsets
! 335: = 0 => offsets overflowed; longest matches are present
! 336: -1 => failed to match
! 337: < -1 => some kind of unexpected problem
! 338:
! 339: The following macros are used for adding states to the two state vectors (one
! 340: for the current character, one for the following character). */
! 341: /* ADD_ACTIVE(x,y): append a state with offset x and count y to the active list (the data member is left unset). x and y appear more than once in the expansion, so they should be free of side effects. The caller supplies the terminating semicolon. */
! 342: #define ADD_ACTIVE(x,y) \
! 343: if (active_count++ < wscount) /* is there still room in the list? */ \
! 344: { \
! 345: next_active_state->offset = (x); \
! 346: next_active_state->count = (y); \
! 347: next_active_state++; \
! 348: DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
! 349: } \
! 350: else return PCRE_ERROR_DFA_WSSIZE /* list full: returns from the enclosing function */
! 351: /* ADD_ACTIVE_DATA(x,y,z): append a state with offset x, count y and data z to the active list. The arguments appear more than once in the expansion, so they should be free of side effects. The caller supplies the terminating semicolon. */
! 352: #define ADD_ACTIVE_DATA(x,y,z) \
! 353: if (active_count++ < wscount) /* is there still room in the list? */ \
! 354: { \
! 355: next_active_state->offset = (x); \
! 356: next_active_state->count = (y); \
! 357: next_active_state->data = (z); \
! 358: next_active_state++; \
! 359: DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
! 360: } \
! 361: else return PCRE_ERROR_DFA_WSSIZE /* list full: returns from the enclosing function */
! 362: /* ADD_NEW(x,y): append a state with offset x and count y to the new list, the one for the following character (the data member is left unset). x and y appear more than once in the expansion, so they should be free of side effects. The caller supplies the terminating semicolon. */
! 363: #define ADD_NEW(x,y) \
! 364: if (new_count++ < wscount) /* is there still room in the list? */ \
! 365: { \
! 366: next_new_state->offset = (x); \
! 367: next_new_state->count = (y); \
! 368: next_new_state++; \
! 369: DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
! 370: } \
! 371: else return PCRE_ERROR_DFA_WSSIZE /* list full: returns from the enclosing function */
! 372: /* ADD_NEW_DATA(x,y,z): append a state with offset x, count y and data z to the new list, the one for the following character. The arguments appear more than once in the expansion, so they should be free of side effects. The caller supplies the terminating semicolon. */
! 373: #define ADD_NEW_DATA(x,y,z) \
! 374: if (new_count++ < wscount) /* is there still room in the list? */ \
! 375: { \
! 376: next_new_state->offset = (x); \
! 377: next_new_state->count = (y); \
! 378: next_new_state->data = (z); \
! 379: next_new_state++; \
! 380: DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
! 381: } \
! 382: else return PCRE_ERROR_DFA_WSSIZE /* list full: returns from the enclosing function */
! 383:
! 384: /* And now, here is the code */
! 385:
! 386: static int
! 387: internal_dfa_exec(
! 388: dfa_match_data *md,
! 389: const uschar *this_start_code,
! 390: const uschar *current_subject,
! 391: int start_offset,
! 392: int *offsets,
! 393: int offsetcount,
! 394: int *workspace,
! 395: int wscount,
! 396: int rlevel)
! 397: {
! 398: stateblock *active_states, *new_states, *temp_states;
! 399: stateblock *next_active_state, *next_new_state;
! 400:
! 401: const uschar *ctypes, *lcc, *fcc;
! 402: const uschar *ptr;
! 403: const uschar *end_code, *first_op;
! 404:
! 405: dfa_recursion_info new_recursive;
! 406:
! 407: int active_count, new_count, match_count;
! 408:
! 409: /* Some fields in the md block are frequently referenced, so we load them into
! 410: independent variables in the hope that this will perform better. */
! 411:
! 412: const uschar *start_subject = md->start_subject;
! 413: const uschar *end_subject = md->end_subject;
! 414: const uschar *start_code = md->start_code;
! 415:
! 416: #ifdef SUPPORT_UTF8
! 417: BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
! 418: #else
! 419: BOOL utf8 = FALSE;
! 420: #endif
! 421:
! 422: rlevel++;
! 423: offsetcount &= (-2);
! 424:
! 425: wscount -= 2;
! 426: wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
! 427: (2 * INTS_PER_STATEBLOCK);
! 428:
! 429: DPRINTF(("\n%.*s---------------------\n"
! 430: "%.*sCall to internal_dfa_exec f=%d\n",
! 431: rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
! 432:
! 433: ctypes = md->tables + ctypes_offset;
! 434: lcc = md->tables + lcc_offset;
! 435: fcc = md->tables + fcc_offset;
! 436:
! 437: match_count = PCRE_ERROR_NOMATCH; /* A negative number */
! 438:
! 439: active_states = (stateblock *)(workspace + 2);
! 440: next_new_state = new_states = active_states + wscount;
! 441: new_count = 0;
! 442:
! 443: first_op = this_start_code + 1 + LINK_SIZE +
! 444: ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
! 445: *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)? 2:0);
! 446:
! 447: /* The first thing in any (sub) pattern is a bracket of some sort. Push all
! 448: the alternative states onto the list, and find out where the end is. This
! 449: makes it possible to use this function recursively, when we want to stop at a
! 450: matching internal ket rather than at the end.
! 451:
! 452: If the first opcode in the first alternative is OP_REVERSE, we are dealing with
! 453: a backward assertion. In that case, we have to find out the maximum amount to
! 454: move back, and set up each alternative appropriately. */
! 455:
! 456: if (*first_op == OP_REVERSE)
! 457: {
! 458: int max_back = 0;
! 459: int gone_back;
! 460:
! 461: end_code = this_start_code;
! 462: do
! 463: {
! 464: int back = GET(end_code, 2+LINK_SIZE);
! 465: if (back > max_back) max_back = back;
! 466: end_code += GET(end_code, 1);
! 467: }
! 468: while (*end_code == OP_ALT);
! 469:
! 470: /* If we can't go back the amount required for the longest lookbehind
! 471: pattern, go back as far as we can; some alternatives may still be viable. */
! 472:
! 473: #ifdef SUPPORT_UTF8
! 474: /* In character mode we have to step back character by character */
! 475:
! 476: if (utf8)
! 477: {
! 478: for (gone_back = 0; gone_back < max_back; gone_back++)
! 479: {
! 480: if (current_subject <= start_subject) break;
! 481: current_subject--;
! 482: while (current_subject > start_subject &&
! 483: (*current_subject & 0xc0) == 0x80)
! 484: current_subject--;
! 485: }
! 486: }
! 487: else
! 488: #endif
! 489:
! 490: /* In byte-mode we can do this quickly. */
! 491:
! 492: {
! 493: gone_back = (current_subject - max_back < start_subject)?
! 494: (int)(current_subject - start_subject) : max_back;
! 495: current_subject -= gone_back;
! 496: }
! 497:
! 498: /* Save the earliest consulted character */
! 499:
! 500: if (current_subject < md->start_used_ptr)
! 501: md->start_used_ptr = current_subject;
! 502:
! 503: /* Now we can process the individual branches. */
! 504:
! 505: end_code = this_start_code;
! 506: do
! 507: {
! 508: int back = GET(end_code, 2+LINK_SIZE);
! 509: if (back <= gone_back)
! 510: {
! 511: int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
! 512: ADD_NEW_DATA(-bstate, 0, gone_back - back);
! 513: }
! 514: end_code += GET(end_code, 1);
! 515: }
! 516: while (*end_code == OP_ALT);
! 517: }
! 518:
! 519: /* This is the code for a "normal" subpattern (not a backward assertion). The
! 520: start of a whole pattern is always one of these. If we are at the top level,
! 521: we may be asked to restart matching from the same point that we reached for a
! 522: previous partial match. We still have to scan through the top-level branches to
! 523: find the end state. */
! 524:
! 525: else
! 526: {
! 527: end_code = this_start_code;
! 528:
! 529: /* Restarting */
! 530:
! 531: if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
! 532: {
! 533: do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
! 534: new_count = workspace[1];
! 535: if (!workspace[0])
! 536: memcpy(new_states, active_states, new_count * sizeof(stateblock));
! 537: }
! 538:
! 539: /* Not restarting */
! 540:
! 541: else
! 542: {
! 543: int length = 1 + LINK_SIZE +
! 544: ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
! 545: *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)?
! 546: 2:0);
! 547: do
! 548: {
! 549: ADD_NEW((int)(end_code - start_code + length), 0);
! 550: end_code += GET(end_code, 1);
! 551: length = 1 + LINK_SIZE;
! 552: }
! 553: while (*end_code == OP_ALT);
! 554: }
! 555: }
! 556:
! 557: workspace[0] = 0; /* Bit indicating which vector is current */
! 558:
! 559: DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
! 560:
! 561: /* Loop for scanning the subject */
! 562:
! 563: ptr = current_subject;
! 564: for (;;)
! 565: {
! 566: int i, j;
! 567: int clen, dlen;
! 568: unsigned int c, d;
! 569: int forced_fail = 0;
! 570: BOOL could_continue = FALSE;
! 571:
! 572: /* Make the new state list into the active state list and empty the
! 573: new state list. */
! 574:
! 575: temp_states = active_states;
! 576: active_states = new_states;
! 577: new_states = temp_states;
! 578: active_count = new_count;
! 579: new_count = 0;
! 580:
! 581: workspace[0] ^= 1; /* Remember for the restarting feature */
! 582: workspace[1] = active_count;
! 583:
! 584: #ifdef PCRE_DEBUG
! 585: printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
! 586: pchars((uschar *)ptr, strlen((char *)ptr), stdout);
! 587: printf("\"\n");
! 588:
! 589: printf("%.*sActive states: ", rlevel*2-2, SP);
! 590: for (i = 0; i < active_count; i++)
! 591: printf("%d/%d ", active_states[i].offset, active_states[i].count);
! 592: printf("\n");
! 593: #endif
! 594:
! 595: /* Set the pointers for adding new states */
! 596:
! 597: next_active_state = active_states + active_count;
! 598: next_new_state = new_states;
! 599:
! 600: /* Load the current character from the subject outside the loop, as many
! 601: different states may want to look at it, and we assume that at least one
! 602: will. */
! 603:
! 604: if (ptr < end_subject)
! 605: {
! 606: clen = 1; /* Number of bytes in the character */
! 607: #ifdef SUPPORT_UTF8
! 608: if (utf8) { GETCHARLEN(c, ptr, clen); } else
! 609: #endif /* SUPPORT_UTF8 */
! 610: c = *ptr;
! 611: }
! 612: else
! 613: {
! 614: clen = 0; /* This indicates the end of the subject */
! 615: c = NOTACHAR; /* This value should never actually be used */
! 616: }
! 617:
! 618: /* Scan up the active states and act on each one. The result of an action
! 619: may be to add more states to the currently active list (e.g. on hitting a
! 620: parenthesis) or it may be to put states on the new list, for considering
! 621: when we move the character pointer on. */
! 622:
! 623: for (i = 0; i < active_count; i++)
! 624: {
! 625: stateblock *current_state = active_states + i;
! 626: BOOL caseless = FALSE;
! 627: const uschar *code;
! 628: int state_offset = current_state->offset;
! 629: int count, codevalue, rrc;
! 630:
! 631: #ifdef PCRE_DEBUG
! 632: printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
! 633: if (clen == 0) printf("EOL\n");
! 634: else if (c > 32 && c < 127) printf("'%c'\n", c);
! 635: else printf("0x%02x\n", c);
! 636: #endif
! 637:
! 638: /* A negative offset is a special case meaning "hold off going to this
! 639: (negated) state until the number of characters in the data field have
! 640: been skipped". */
! 641:
! 642: if (state_offset < 0)
! 643: {
! 644: if (current_state->data > 0)
! 645: {
! 646: DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
! 647: ADD_NEW_DATA(state_offset, current_state->count,
! 648: current_state->data - 1);
! 649: continue;
! 650: }
! 651: else
! 652: {
! 653: current_state->offset = state_offset = -state_offset;
! 654: }
! 655: }
! 656:
! 657: /* Check for a duplicate state with the same count, and skip if found.
! 658: See the note at the head of this module about the possibility of improving
! 659: performance here. */
! 660:
! 661: for (j = 0; j < i; j++)
! 662: {
! 663: if (active_states[j].offset == state_offset &&
! 664: active_states[j].count == current_state->count)
! 665: {
! 666: DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
! 667: goto NEXT_ACTIVE_STATE;
! 668: }
! 669: }
! 670:
! 671: /* The state offset is the offset to the opcode */
! 672:
! 673: code = start_code + state_offset;
! 674: codevalue = *code;
! 675:
! 676: /* If this opcode inspects a character, but we are at the end of the
! 677: subject, remember the fact for use when testing for a partial match. */
! 678:
! 679: if (clen == 0 && poptable[codevalue] != 0)
! 680: could_continue = TRUE;
! 681:
! 682: /* If this opcode is followed by an inline character, load it. It is
! 683: tempting to test for the presence of a subject character here, but that
! 684: is wrong, because sometimes zero repetitions of the subject are
! 685: permitted.
! 686:
! 687: We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
! 688: argument that is not a data character - but is always one byte long. We
! 689: have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
! 690: this case. To keep the other cases fast, convert these ones to new opcodes.
! 691: */
! 692:
! 693: if (coptable[codevalue] > 0)
! 694: {
! 695: dlen = 1;
! 696: #ifdef SUPPORT_UTF8
! 697: if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
! 698: #endif /* SUPPORT_UTF8 */
! 699: d = code[coptable[codevalue]];
! 700: if (codevalue >= OP_TYPESTAR)
! 701: {
! 702: switch(d)
! 703: {
! 704: case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
! 705: case OP_NOTPROP:
! 706: case OP_PROP: codevalue += OP_PROP_EXTRA; break;
! 707: case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
! 708: case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
! 709: case OP_NOT_HSPACE:
! 710: case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
! 711: case OP_NOT_VSPACE:
! 712: case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
! 713: default: break;
! 714: }
! 715: }
! 716: }
! 717: else
! 718: {
! 719: dlen = 0; /* Not strictly necessary, but compilers moan */
! 720: d = NOTACHAR; /* if these variables are not set. */
! 721: }
! 722:
! 723:
! 724: /* Now process the individual opcodes */
! 725:
! 726: switch (codevalue)
! 727: {
! 728: /* ========================================================================== */
! 729: /* These cases are never obeyed. This is a fudge that causes a compile-
! 730: time error if the vectors coptable or poptable, which are indexed by
! 731: opcode, are not the correct length. It seems to be the only way to do
! 732: such a check at compile time, as the sizeof() operator does not work
! 733: in the C preprocessor. */
! 734:
! 735: case OP_TABLE_LENGTH:
! 736: case OP_TABLE_LENGTH +
! 737: ((sizeof(coptable) == OP_TABLE_LENGTH) &&
! 738: (sizeof(poptable) == OP_TABLE_LENGTH)):
! 739: break;
! 740:
! 741: /* ========================================================================== */
! 742: /* Reached a closing bracket. If not at the end of the pattern, carry
! 743: on with the next opcode. For repeating opcodes, also add the repeat
! 744: state. Note that KETRPOS will always be encountered at the end of the
! 745: subpattern, because the possessive subpattern repeats are always handled
! 746: using recursive calls. Thus, it never adds any new states.
! 747:
! 748: At the end of the (sub)pattern, unless we have an empty string and
! 749: PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
! 750: start of the subject, save the match data, shifting up all previous
! 751: matches so we always have the longest first. */
! 752:
! 753: case OP_KET:
! 754: case OP_KETRMIN:
! 755: case OP_KETRMAX:
! 756: case OP_KETRPOS:
! 757: if (code != end_code)
! 758: {
! 759: ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
! 760: if (codevalue != OP_KET)
! 761: {
! 762: ADD_ACTIVE(state_offset - GET(code, 1), 0);
! 763: }
! 764: }
! 765: else
! 766: {
! 767: if (ptr > current_subject ||
! 768: ((md->moptions & PCRE_NOTEMPTY) == 0 &&
! 769: ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
! 770: current_subject > start_subject + md->start_offset)))
! 771: {
! 772: if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
! 773: else if (match_count > 0 && ++match_count * 2 > offsetcount)
! 774: match_count = 0;
! 775: count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
! 776: if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
! 777: if (offsetcount >= 2)
! 778: {
! 779: offsets[0] = (int)(current_subject - start_subject);
! 780: offsets[1] = (int)(ptr - start_subject);
! 781: DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
! 782: offsets[1] - offsets[0], current_subject));
! 783: }
! 784: if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
! 785: {
! 786: DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
! 787: "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
! 788: match_count, rlevel*2-2, SP));
! 789: return match_count;
! 790: }
! 791: }
! 792: }
! 793: break;
! 794:
! 795: /* ========================================================================== */
! 796: /* These opcodes add to the current list of states without looking
! 797: at the current character. */
! 798:
! 799: /*-----------------------------------------------------------------*/
! 800: case OP_ALT:
! 801: do { code += GET(code, 1); } while (*code == OP_ALT);
! 802: ADD_ACTIVE((int)(code - start_code), 0);
! 803: break;
! 804:
! 805: /*-----------------------------------------------------------------*/
! 806: case OP_BRA:
! 807: case OP_SBRA:
! 808: do
! 809: {
! 810: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
! 811: code += GET(code, 1);
! 812: }
! 813: while (*code == OP_ALT);
! 814: break;
! 815:
! 816: /*-----------------------------------------------------------------*/
! 817: case OP_CBRA:
! 818: case OP_SCBRA:
! 819: ADD_ACTIVE((int)(code - start_code + 3 + LINK_SIZE), 0);
! 820: code += GET(code, 1);
! 821: while (*code == OP_ALT)
! 822: {
! 823: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
! 824: code += GET(code, 1);
! 825: }
! 826: break;
! 827:
! 828: /*-----------------------------------------------------------------*/
! 829: case OP_BRAZERO:
! 830: case OP_BRAMINZERO:
! 831: ADD_ACTIVE(state_offset + 1, 0);
! 832: code += 1 + GET(code, 2);
! 833: while (*code == OP_ALT) code += GET(code, 1);
! 834: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
! 835: break;
! 836:
! 837: /*-----------------------------------------------------------------*/
! 838: case OP_SKIPZERO:
! 839: code += 1 + GET(code, 2);
! 840: while (*code == OP_ALT) code += GET(code, 1);
! 841: ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
! 842: break;
! 843:
! 844: /*-----------------------------------------------------------------*/
! 845: case OP_CIRC:
! 846: if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
! 847: { ADD_ACTIVE(state_offset + 1, 0); }
! 848: break;
! 849:
! 850: /*-----------------------------------------------------------------*/
! 851: case OP_CIRCM:
! 852: if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
! 853: (ptr != end_subject && WAS_NEWLINE(ptr)))
! 854: { ADD_ACTIVE(state_offset + 1, 0); }
! 855: break;
! 856:
! 857: /*-----------------------------------------------------------------*/
! 858: case OP_EOD:
! 859: if (ptr >= end_subject)
! 860: {
! 861: if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
! 862: could_continue = TRUE;
! 863: else { ADD_ACTIVE(state_offset + 1, 0); }
! 864: }
! 865: break;
! 866:
! 867: /*-----------------------------------------------------------------*/
! 868: case OP_SOD:
! 869: if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
! 870: break;
! 871:
! 872: /*-----------------------------------------------------------------*/
! 873: case OP_SOM:
! 874: if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
! 875: break;
! 876:
! 877:
! 878: /* ========================================================================== */
! 879: /* These opcodes inspect the next subject character, and sometimes
! 880: the previous one as well, but do not have an argument. The variable
! 881: clen contains the length of the current character and is zero if we are
! 882: at the end of the subject. */
! 883:
! 884: /*-----------------------------------------------------------------*/
! 885: case OP_ANY:
! 886: if (clen > 0 && !IS_NEWLINE(ptr))
! 887: { ADD_NEW(state_offset + 1, 0); }
! 888: break;
! 889:
! 890: /*-----------------------------------------------------------------*/
! 891: case OP_ALLANY:
! 892: if (clen > 0)
! 893: { ADD_NEW(state_offset + 1, 0); }
! 894: break;
! 895:
! 896: /*-----------------------------------------------------------------*/
! 897: case OP_EODN:
! 898: if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
! 899: could_continue = TRUE;
! 900: else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
! 901: { ADD_ACTIVE(state_offset + 1, 0); }
! 902: break;
! 903:
! 904: /*-----------------------------------------------------------------*/
! 905: case OP_DOLL:
! 906: if ((md->moptions & PCRE_NOTEOL) == 0)
! 907: {
! 908: if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
! 909: could_continue = TRUE;
! 910: else if (clen == 0 ||
! 911: ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
! 912: (ptr == end_subject - md->nllen)
! 913: ))
! 914: { ADD_ACTIVE(state_offset + 1, 0); }
! 915: }
! 916: break;
! 917:
! 918: /*-----------------------------------------------------------------*/
! 919: case OP_DOLLM:
! 920: if ((md->moptions & PCRE_NOTEOL) == 0)
! 921: {
! 922: if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
! 923: could_continue = TRUE;
! 924: else if (clen == 0 ||
! 925: ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
! 926: { ADD_ACTIVE(state_offset + 1, 0); }
! 927: }
! 928: else if (IS_NEWLINE(ptr))
! 929: { ADD_ACTIVE(state_offset + 1, 0); }
! 930: break;
! 931:
! 932: /*-----------------------------------------------------------------*/
! 933:
! 934: case OP_DIGIT:
! 935: case OP_WHITESPACE:
! 936: case OP_WORDCHAR:
! 937: if (clen > 0 && c < 256 &&
! 938: ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
! 939: { ADD_NEW(state_offset + 1, 0); }
! 940: break;
! 941:
! 942: /*-----------------------------------------------------------------*/
! 943: case OP_NOT_DIGIT:
! 944: case OP_NOT_WHITESPACE:
! 945: case OP_NOT_WORDCHAR:
! 946: if (clen > 0 && (c >= 256 ||
! 947: ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
! 948: { ADD_NEW(state_offset + 1, 0); }
! 949: break;
! 950:
! 951: /*-----------------------------------------------------------------*/
! 952: case OP_WORD_BOUNDARY:
! 953: case OP_NOT_WORD_BOUNDARY:
! 954: {
! 955: int left_word, right_word;
! 956:
! 957: if (ptr > start_subject)
! 958: {
! 959: const uschar *temp = ptr - 1;
! 960: if (temp < md->start_used_ptr) md->start_used_ptr = temp;
! 961: #ifdef SUPPORT_UTF8
! 962: if (utf8) BACKCHAR(temp);
! 963: #endif
! 964: GETCHARTEST(d, temp);
! 965: #ifdef SUPPORT_UCP
! 966: if ((md->poptions & PCRE_UCP) != 0)
! 967: {
! 968: if (d == '_') left_word = TRUE; else
! 969: {
! 970: int cat = UCD_CATEGORY(d);
! 971: left_word = (cat == ucp_L || cat == ucp_N);
! 972: }
! 973: }
! 974: else
! 975: #endif
! 976: left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
! 977: }
! 978: else left_word = FALSE;
! 979:
! 980: if (clen > 0)
! 981: {
! 982: #ifdef SUPPORT_UCP
! 983: if ((md->poptions & PCRE_UCP) != 0)
! 984: {
! 985: if (c == '_') right_word = TRUE; else
! 986: {
! 987: int cat = UCD_CATEGORY(c);
! 988: right_word = (cat == ucp_L || cat == ucp_N);
! 989: }
! 990: }
! 991: else
! 992: #endif
! 993: right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
! 994: }
! 995: else right_word = FALSE;
! 996:
! 997: if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
! 998: { ADD_ACTIVE(state_offset + 1, 0); }
! 999: }
! 1000: break;
! 1001:
! 1002:
! 1003: /*-----------------------------------------------------------------*/
! 1004: /* Check the next character by Unicode property. We will get here only
! 1005: if the support is in the binary; otherwise a compile-time error occurs.
! 1006: */
! 1007:
! 1008: #ifdef SUPPORT_UCP
! 1009: case OP_PROP:
! 1010: case OP_NOTPROP:
! 1011: if (clen > 0)
! 1012: {
! 1013: BOOL OK;
! 1014: const ucd_record * prop = GET_UCD(c);
! 1015: switch(code[1])
! 1016: {
! 1017: case PT_ANY:
! 1018: OK = TRUE;
! 1019: break;
! 1020:
! 1021: case PT_LAMP:
! 1022: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
! 1023: prop->chartype == ucp_Lt;
! 1024: break;
! 1025:
! 1026: case PT_GC:
! 1027: OK = _pcre_ucp_gentype[prop->chartype] == code[2];
! 1028: break;
! 1029:
! 1030: case PT_PC:
! 1031: OK = prop->chartype == code[2];
! 1032: break;
! 1033:
! 1034: case PT_SC:
! 1035: OK = prop->script == code[2];
! 1036: break;
! 1037:
! 1038: /* These are specials for combination cases. */
! 1039:
! 1040: case PT_ALNUM:
! 1041: OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
! 1042: _pcre_ucp_gentype[prop->chartype] == ucp_N;
! 1043: break;
! 1044:
! 1045: case PT_SPACE: /* Perl space */
! 1046: OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
! 1047: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
! 1048: break;
! 1049:
! 1050: case PT_PXSPACE: /* POSIX space */
! 1051: OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
! 1052: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
! 1053: c == CHAR_FF || c == CHAR_CR;
! 1054: break;
! 1055:
! 1056: case PT_WORD:
! 1057: OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
! 1058: _pcre_ucp_gentype[prop->chartype] == ucp_N ||
! 1059: c == CHAR_UNDERSCORE;
! 1060: break;
! 1061:
! 1062: /* Should never occur, but keep compilers from grumbling. */
! 1063:
! 1064: default:
! 1065: OK = codevalue != OP_PROP;
! 1066: break;
! 1067: }
! 1068:
! 1069: if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
! 1070: }
! 1071: break;
! 1072: #endif
! 1073:
! 1074:
! 1075:
! 1076: /* ========================================================================== */
! 1077: /* These opcodes likewise inspect the subject character, but have an
! 1078: argument that is not a data character. It is one of these opcodes:
! 1079: OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
! 1080: OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
! 1081:
! 1082: case OP_TYPEPLUS:
! 1083: case OP_TYPEMINPLUS:
! 1084: case OP_TYPEPOSPLUS:
! 1085: count = current_state->count; /* Already matched */
! 1086: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
! 1087: if (clen > 0)
! 1088: {
! 1089: if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
! 1090: (c < 256 &&
! 1091: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
! 1092: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
! 1093: {
! 1094: if (count > 0 && codevalue == OP_TYPEPOSPLUS)
! 1095: {
! 1096: active_count--; /* Remove non-match possibility */
! 1097: next_active_state--;
! 1098: }
! 1099: count++;
! 1100: ADD_NEW(state_offset, count);
! 1101: }
! 1102: }
! 1103: break;
! 1104:
! 1105: /*-----------------------------------------------------------------*/
! 1106: case OP_TYPEQUERY:
! 1107: case OP_TYPEMINQUERY:
! 1108: case OP_TYPEPOSQUERY:
! 1109: ADD_ACTIVE(state_offset + 2, 0);
! 1110: if (clen > 0)
! 1111: {
! 1112: if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
! 1113: (c < 256 &&
! 1114: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
! 1115: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
! 1116: {
! 1117: if (codevalue == OP_TYPEPOSQUERY)
! 1118: {
! 1119: active_count--; /* Remove non-match possibility */
! 1120: next_active_state--;
! 1121: }
! 1122: ADD_NEW(state_offset + 2, 0);
! 1123: }
! 1124: }
! 1125: break;
! 1126:
! 1127: /*-----------------------------------------------------------------*/
! 1128: case OP_TYPESTAR:
! 1129: case OP_TYPEMINSTAR:
! 1130: case OP_TYPEPOSSTAR:
! 1131: ADD_ACTIVE(state_offset + 2, 0);
! 1132: if (clen > 0)
! 1133: {
! 1134: if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
! 1135: (c < 256 &&
! 1136: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
! 1137: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
! 1138: {
! 1139: if (codevalue == OP_TYPEPOSSTAR)
! 1140: {
! 1141: active_count--; /* Remove non-match possibility */
! 1142: next_active_state--;
! 1143: }
! 1144: ADD_NEW(state_offset, 0);
! 1145: }
! 1146: }
! 1147: break;
! 1148:
! 1149: /*-----------------------------------------------------------------*/
! 1150: case OP_TYPEEXACT:
! 1151: count = current_state->count; /* Number already matched */
! 1152: if (clen > 0)
! 1153: {
! 1154: if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
! 1155: (c < 256 &&
! 1156: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
! 1157: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
! 1158: {
! 1159: if (++count >= GET2(code, 1))
! 1160: { ADD_NEW(state_offset + 4, 0); }
! 1161: else
! 1162: { ADD_NEW(state_offset, count); }
! 1163: }
! 1164: }
! 1165: break;
! 1166:
! 1167: /*-----------------------------------------------------------------*/
! 1168: case OP_TYPEUPTO:
! 1169: case OP_TYPEMINUPTO:
! 1170: case OP_TYPEPOSUPTO:
! 1171: ADD_ACTIVE(state_offset + 4, 0);
! 1172: count = current_state->count; /* Number already matched */
! 1173: if (clen > 0)
! 1174: {
! 1175: if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
! 1176: (c < 256 &&
! 1177: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
! 1178: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
! 1179: {
! 1180: if (codevalue == OP_TYPEPOSUPTO)
! 1181: {
! 1182: active_count--; /* Remove non-match possibility */
! 1183: next_active_state--;
! 1184: }
! 1185: if (++count >= GET2(code, 1))
! 1186: { ADD_NEW(state_offset + 4, 0); }
! 1187: else
! 1188: { ADD_NEW(state_offset, count); }
! 1189: }
! 1190: }
! 1191: break;
! 1192:
! 1193: /* ========================================================================== */
! 1194: /* These are virtual opcodes that are used when something like
! 1195: OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or a vertical or
! 1196: horizontal space opcode (such as OP_VSPACE or OP_HSPACE) as its argument. It
! 1197: keeps the code above fast for the other cases. The argument is in the d variable. */
! 1198:
! 1199: #ifdef SUPPORT_UCP
! 1200: case OP_PROP_EXTRA + OP_TYPEPLUS:
! 1201: case OP_PROP_EXTRA + OP_TYPEMINPLUS:
! 1202: case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
! 1203: count = current_state->count; /* Already matched */
! 1204: if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
! 1205: if (clen > 0)
! 1206: {
! 1207: BOOL OK;
! 1208: const ucd_record * prop = GET_UCD(c);
! 1209: switch(code[2])
! 1210: {
! 1211: case PT_ANY:
! 1212: OK = TRUE;
! 1213: break;
! 1214:
! 1215: case PT_LAMP:
! 1216: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
! 1217: prop->chartype == ucp_Lt;
! 1218: break;
! 1219:
! 1220: case PT_GC:
! 1221: OK = _pcre_ucp_gentype[prop->chartype] == code[3];
! 1222: break;
! 1223:
! 1224: case PT_PC:
! 1225: OK = prop->chartype == code[3];
! 1226: break;
! 1227:
! 1228: case PT_SC:
! 1229: OK = prop->script == code[3];
! 1230: break;
! 1231:
! 1232: /* These are specials for combination cases. */
! 1233:
! 1234: case PT_ALNUM:
! 1235: OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
! 1236: _pcre_ucp_gentype[prop->chartype] == ucp_N;
! 1237: break;
! 1238:
! 1239: case PT_SPACE: /* Perl space */
! 1240: OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
! 1241: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
! 1242: break;
! 1243:
! 1244: case PT_PXSPACE: /* POSIX space */
! 1245: OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
! 1246: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
! 1247: c == CHAR_FF || c == CHAR_CR;
! 1248: break;
! 1249:
! 1250: case PT_WORD:
! 1251: OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
! 1252: _pcre_ucp_gentype[prop->chartype] == ucp_N ||
! 1253: c == CHAR_UNDERSCORE;
! 1254: break;
! 1255:
! 1256: /* Should never occur, but keep compilers from grumbling. */
! 1257:
! 1258: default:
! 1259: OK = codevalue != OP_PROP;
! 1260: break;
! 1261: }
! 1262:
! 1263: if (OK == (d == OP_PROP))
! 1264: {
! 1265: if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
! 1266: {
! 1267: active_count--; /* Remove non-match possibility */
! 1268: next_active_state--;
! 1269: }
! 1270: count++;
! 1271: ADD_NEW(state_offset, count);
! 1272: }
! 1273: }
! 1274: break;
! 1275:
! 1276: /*-----------------------------------------------------------------*/
! 1277: case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
! 1278: case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
! 1279: case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
! 1280: count = current_state->count; /* Already matched */
! 1281: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
! 1282: if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
! 1283: {
! 1284: const uschar *nptr = ptr + clen;
! 1285: int ncount = 0;
! 1286: if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
! 1287: {
! 1288: active_count--; /* Remove non-match possibility */
! 1289: next_active_state--;
! 1290: }
! 1291: while (nptr < end_subject)
! 1292: {
! 1293: int nd;
! 1294: int ndlen = 1;
! 1295: GETCHARLEN(nd, nptr, ndlen);
! 1296: if (UCD_CATEGORY(nd) != ucp_M) break;
! 1297: ncount++;
! 1298: nptr += ndlen;
! 1299: }
! 1300: count++;
! 1301: ADD_NEW_DATA(-state_offset, count, ncount);
! 1302: }
! 1303: break;
! 1304: #endif
! 1305:
! 1306: /*-----------------------------------------------------------------*/
! 1307: case OP_ANYNL_EXTRA + OP_TYPEPLUS:
! 1308: case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
! 1309: case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
! 1310: count = current_state->count; /* Already matched */
! 1311: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
! 1312: if (clen > 0)
! 1313: {
! 1314: int ncount = 0;
! 1315: switch (c)
! 1316: {
! 1317: case 0x000b:
! 1318: case 0x000c:
! 1319: case 0x0085:
! 1320: case 0x2028:
! 1321: case 0x2029:
! 1322: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
! 1323: goto ANYNL01;
! 1324:
! 1325: case 0x000d:
! 1326: if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
! 1327: /* Fall through */
! 1328:
! 1329: ANYNL01:
! 1330: case 0x000a:
! 1331: if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
! 1332: {
! 1333: active_count--; /* Remove non-match possibility */
! 1334: next_active_state--;
! 1335: }
! 1336: count++;
! 1337: ADD_NEW_DATA(-state_offset, count, ncount);
! 1338: break;
! 1339:
! 1340: default:
! 1341: break;
! 1342: }
! 1343: }
! 1344: break;
! 1345:
! 1346: /*-----------------------------------------------------------------*/
! 1347: case OP_VSPACE_EXTRA + OP_TYPEPLUS:
! 1348: case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
! 1349: case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
! 1350: count = current_state->count; /* Already matched */
! 1351: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
! 1352: if (clen > 0)
! 1353: {
! 1354: BOOL OK;
! 1355: switch (c)
! 1356: {
! 1357: case 0x000a:
! 1358: case 0x000b:
! 1359: case 0x000c:
! 1360: case 0x000d:
! 1361: case 0x0085:
! 1362: case 0x2028:
! 1363: case 0x2029:
! 1364: OK = TRUE;
! 1365: break;
! 1366:
! 1367: default:
! 1368: OK = FALSE;
! 1369: break;
! 1370: }
! 1371:
! 1372: if (OK == (d == OP_VSPACE))
! 1373: {
! 1374: if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
! 1375: {
! 1376: active_count--; /* Remove non-match possibility */
! 1377: next_active_state--;
! 1378: }
! 1379: count++;
! 1380: ADD_NEW_DATA(-state_offset, count, 0);
! 1381: }
! 1382: }
! 1383: break;
! 1384:
! 1385: /*-----------------------------------------------------------------*/
! 1386: case OP_HSPACE_EXTRA + OP_TYPEPLUS:
! 1387: case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
! 1388: case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
! 1389: count = current_state->count; /* Already matched */
! 1390: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
! 1391: if (clen > 0)
! 1392: {
! 1393: BOOL OK;
! 1394: switch (c)
! 1395: {
! 1396: case 0x09: /* HT */
! 1397: case 0x20: /* SPACE */
! 1398: case 0xa0: /* NBSP */
! 1399: case 0x1680: /* OGHAM SPACE MARK */
! 1400: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
! 1401: case 0x2000: /* EN QUAD */
! 1402: case 0x2001: /* EM QUAD */
! 1403: case 0x2002: /* EN SPACE */
! 1404: case 0x2003: /* EM SPACE */
! 1405: case 0x2004: /* THREE-PER-EM SPACE */
! 1406: case 0x2005: /* FOUR-PER-EM SPACE */
! 1407: case 0x2006: /* SIX-PER-EM SPACE */
! 1408: case 0x2007: /* FIGURE SPACE */
! 1409: case 0x2008: /* PUNCTUATION SPACE */
! 1410: case 0x2009: /* THIN SPACE */
! 1411: case 0x200A: /* HAIR SPACE */
! 1412: case 0x202f: /* NARROW NO-BREAK SPACE */
! 1413: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
! 1414: case 0x3000: /* IDEOGRAPHIC SPACE */
! 1415: OK = TRUE;
! 1416: break;
! 1417:
! 1418: default:
! 1419: OK = FALSE;
! 1420: break;
! 1421: }
! 1422:
! 1423: if (OK == (d == OP_HSPACE))
! 1424: {
! 1425: if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
! 1426: {
! 1427: active_count--; /* Remove non-match possibility */
! 1428: next_active_state--;
! 1429: }
! 1430: count++;
! 1431: ADD_NEW_DATA(-state_offset, count, 0);
! 1432: }
! 1433: }
! 1434: break;
! 1435:
! 1436: /*-----------------------------------------------------------------*/
! 1437: #ifdef SUPPORT_UCP
! 1438: case OP_PROP_EXTRA + OP_TYPEQUERY:
! 1439: case OP_PROP_EXTRA + OP_TYPEMINQUERY:
! 1440: case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
! 1441: count = 4;
! 1442: goto QS1;
! 1443:
! 1444: case OP_PROP_EXTRA + OP_TYPESTAR:
! 1445: case OP_PROP_EXTRA + OP_TYPEMINSTAR:
! 1446: case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
! 1447: count = 0;
! 1448:
! 1449: QS1:
! 1450:
! 1451: ADD_ACTIVE(state_offset + 4, 0);
! 1452: if (clen > 0)
! 1453: {
! 1454: BOOL OK;
! 1455: const ucd_record * prop = GET_UCD(c);
! 1456: switch(code[2])
! 1457: {
! 1458: case PT_ANY:
! 1459: OK = TRUE;
! 1460: break;
! 1461:
! 1462: case PT_LAMP:
! 1463: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
! 1464: prop->chartype == ucp_Lt;
! 1465: break;
! 1466:
! 1467: case PT_GC:
! 1468: OK = _pcre_ucp_gentype[prop->chartype] == code[3];
! 1469: break;
! 1470:
! 1471: case PT_PC:
! 1472: OK = prop->chartype == code[3];
! 1473: break;
! 1474:
! 1475: case PT_SC:
! 1476: OK = prop->script == code[3];
! 1477: break;
! 1478:
! 1479: /* These are specials for combination cases. */
! 1480:
! 1481: case PT_ALNUM:
! 1482: OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
! 1483: _pcre_ucp_gentype[prop->chartype] == ucp_N;
! 1484: break;
! 1485:
! 1486: case PT_SPACE: /* Perl space */
! 1487: OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
! 1488: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
! 1489: break;
! 1490:
! 1491: case PT_PXSPACE: /* POSIX space */
! 1492: OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
! 1493: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
! 1494: c == CHAR_FF || c == CHAR_CR;
! 1495: break;
! 1496:
! 1497: case PT_WORD:
! 1498: OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
! 1499: _pcre_ucp_gentype[prop->chartype] == ucp_N ||
! 1500: c == CHAR_UNDERSCORE;
! 1501: break;
! 1502:
! 1503: /* Should never occur, but keep compilers from grumbling. */
! 1504:
! 1505: default:
! 1506: OK = codevalue != OP_PROP;
! 1507: break;
! 1508: }
! 1509:
! 1510: if (OK == (d == OP_PROP))
! 1511: {
! 1512: if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
! 1513: codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
! 1514: {
! 1515: active_count--; /* Remove non-match possibility */
! 1516: next_active_state--;
! 1517: }
! 1518: ADD_NEW(state_offset + count, 0);
! 1519: }
! 1520: }
! 1521: break;
! 1522:
! 1523: /*-----------------------------------------------------------------*/
! 1524: case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
! 1525: case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
! 1526: case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
! 1527: count = 2;
! 1528: goto QS2;
! 1529:
! 1530: case OP_EXTUNI_EXTRA + OP_TYPESTAR:
! 1531: case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
! 1532: case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
! 1533: count = 0;
! 1534:
! 1535: QS2:
! 1536:
! 1537: ADD_ACTIVE(state_offset + 2, 0);
! 1538: if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
! 1539: {
! 1540: const uschar *nptr = ptr + clen;
! 1541: int ncount = 0;
! 1542: if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
! 1543: codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
! 1544: {
! 1545: active_count--; /* Remove non-match possibility */
! 1546: next_active_state--;
! 1547: }
! 1548: while (nptr < end_subject)
! 1549: {
! 1550: int nd;
! 1551: int ndlen = 1;
! 1552: GETCHARLEN(nd, nptr, ndlen);
! 1553: if (UCD_CATEGORY(nd) != ucp_M) break;
! 1554: ncount++;
! 1555: nptr += ndlen;
! 1556: }
! 1557: ADD_NEW_DATA(-(state_offset + count), 0, ncount);
! 1558: }
! 1559: break;
! 1560: #endif
! 1561:
! 1562: /*-----------------------------------------------------------------*/
! 1563: case OP_ANYNL_EXTRA + OP_TYPEQUERY:
! 1564: case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
! 1565: case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
! 1566: count = 2;
! 1567: goto QS3;
! 1568:
! 1569: case OP_ANYNL_EXTRA + OP_TYPESTAR:
! 1570: case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
! 1571: case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
! 1572: count = 0;
! 1573:
! 1574: QS3:
! 1575: ADD_ACTIVE(state_offset + 2, 0);
! 1576: if (clen > 0)
! 1577: {
! 1578: int ncount = 0;
! 1579: switch (c)
! 1580: {
! 1581: case 0x000b:
! 1582: case 0x000c:
! 1583: case 0x0085:
! 1584: case 0x2028:
! 1585: case 0x2029:
! 1586: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
! 1587: goto ANYNL02;
! 1588:
! 1589: case 0x000d:
! 1590: if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
! 1591: /* Fall through */
! 1592:
! 1593: ANYNL02:
! 1594: case 0x000a:
! 1595: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
! 1596: codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
! 1597: {
! 1598: active_count--; /* Remove non-match possibility */
! 1599: next_active_state--;
! 1600: }
! 1601: ADD_NEW_DATA(-(state_offset + count), 0, ncount);
! 1602: break;
! 1603:
! 1604: default:
! 1605: break;
! 1606: }
! 1607: }
! 1608: break;
! 1609:
! 1610: /*-----------------------------------------------------------------*/
! 1611: case OP_VSPACE_EXTRA + OP_TYPEQUERY:
! 1612: case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
! 1613: case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
! 1614: count = 2;
! 1615: goto QS4;
! 1616:
! 1617: case OP_VSPACE_EXTRA + OP_TYPESTAR:
! 1618: case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
! 1619: case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
! 1620: count = 0;
! 1621:
! 1622: QS4:
! 1623: ADD_ACTIVE(state_offset + 2, 0);
! 1624: if (clen > 0)
! 1625: {
! 1626: BOOL OK;
! 1627: switch (c)
! 1628: {
! 1629: case 0x000a:
! 1630: case 0x000b:
! 1631: case 0x000c:
! 1632: case 0x000d:
! 1633: case 0x0085:
! 1634: case 0x2028:
! 1635: case 0x2029:
! 1636: OK = TRUE;
! 1637: break;
! 1638:
! 1639: default:
! 1640: OK = FALSE;
! 1641: break;
! 1642: }
! 1643: if (OK == (d == OP_VSPACE))
! 1644: {
! 1645: if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
! 1646: codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
! 1647: {
! 1648: active_count--; /* Remove non-match possibility */
! 1649: next_active_state--;
! 1650: }
! 1651: ADD_NEW_DATA(-(state_offset + count), 0, 0);
! 1652: }
! 1653: }
! 1654: break;
! 1655:
! 1656: /*-----------------------------------------------------------------*/
! 1657: case OP_HSPACE_EXTRA + OP_TYPEQUERY:
! 1658: case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
! 1659: case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
! 1660: count = 2;
! 1661: goto QS5;
! 1662:
! 1663: case OP_HSPACE_EXTRA + OP_TYPESTAR:
! 1664: case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
! 1665: case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
! 1666: count = 0;
! 1667:
! 1668: QS5:
! 1669: ADD_ACTIVE(state_offset + 2, 0);
! 1670: if (clen > 0)
! 1671: {
! 1672: BOOL OK;
! 1673: switch (c)
! 1674: {
! 1675: case 0x09: /* HT */
! 1676: case 0x20: /* SPACE */
! 1677: case 0xa0: /* NBSP */
! 1678: case 0x1680: /* OGHAM SPACE MARK */
! 1679: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
! 1680: case 0x2000: /* EN QUAD */
! 1681: case 0x2001: /* EM QUAD */
! 1682: case 0x2002: /* EN SPACE */
! 1683: case 0x2003: /* EM SPACE */
! 1684: case 0x2004: /* THREE-PER-EM SPACE */
! 1685: case 0x2005: /* FOUR-PER-EM SPACE */
! 1686: case 0x2006: /* SIX-PER-EM SPACE */
! 1687: case 0x2007: /* FIGURE SPACE */
! 1688: case 0x2008: /* PUNCTUATION SPACE */
! 1689: case 0x2009: /* THIN SPACE */
! 1690: case 0x200A: /* HAIR SPACE */
! 1691: case 0x202f: /* NARROW NO-BREAK SPACE */
! 1692: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
! 1693: case 0x3000: /* IDEOGRAPHIC SPACE */
! 1694: OK = TRUE;
! 1695: break;
! 1696:
! 1697: default:
! 1698: OK = FALSE;
! 1699: break;
! 1700: }
! 1701:
! 1702: if (OK == (d == OP_HSPACE))
! 1703: {
! 1704: if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
! 1705: codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
! 1706: {
! 1707: active_count--; /* Remove non-match possibility */
! 1708: next_active_state--;
! 1709: }
! 1710: ADD_NEW_DATA(-(state_offset + count), 0, 0);
! 1711: }
! 1712: }
! 1713: break;
! 1714:
! 1715: /*-----------------------------------------------------------------*/
! 1716: #ifdef SUPPORT_UCP
! 1717: case OP_PROP_EXTRA + OP_TYPEEXACT:
! 1718: case OP_PROP_EXTRA + OP_TYPEUPTO:
! 1719: case OP_PROP_EXTRA + OP_TYPEMINUPTO:
! 1720: case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
! 1721: if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
! 1722: { ADD_ACTIVE(state_offset + 6, 0); }
! 1723: count = current_state->count; /* Number already matched */
! 1724: if (clen > 0)
! 1725: {
! 1726: BOOL OK;
! 1727: const ucd_record * prop = GET_UCD(c);
! 1728: switch(code[4])
! 1729: {
! 1730: case PT_ANY:
! 1731: OK = TRUE;
! 1732: break;
! 1733:
! 1734: case PT_LAMP:
! 1735: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
! 1736: prop->chartype == ucp_Lt;
! 1737: break;
! 1738:
! 1739: case PT_GC:
! 1740: OK = _pcre_ucp_gentype[prop->chartype] == code[5];
! 1741: break;
! 1742:
! 1743: case PT_PC:
! 1744: OK = prop->chartype == code[5];
! 1745: break;
! 1746:
! 1747: case PT_SC:
! 1748: OK = prop->script == code[5];
! 1749: break;
! 1750:
! 1751: /* These are specials for combination cases. */
! 1752:
! 1753: case PT_ALNUM:
! 1754: OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
! 1755: _pcre_ucp_gentype[prop->chartype] == ucp_N;
! 1756: break;
! 1757:
! 1758: case PT_SPACE: /* Perl space */
! 1759: OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
! 1760: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
! 1761: break;
! 1762:
! 1763: case PT_PXSPACE: /* POSIX space */
! 1764: OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
! 1765: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
! 1766: c == CHAR_FF || c == CHAR_CR;
! 1767: break;
! 1768:
! 1769: case PT_WORD:
! 1770: OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
! 1771: _pcre_ucp_gentype[prop->chartype] == ucp_N ||
! 1772: c == CHAR_UNDERSCORE;
! 1773: break;
! 1774:
! 1775: /* Should never occur, but keep compilers from grumbling. */
! 1776:
! 1777: default:
! 1778: OK = codevalue != OP_PROP;
! 1779: break;
! 1780: }
! 1781:
! 1782: if (OK == (d == OP_PROP))
! 1783: {
! 1784: if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
! 1785: {
! 1786: active_count--; /* Remove non-match possibility */
! 1787: next_active_state--;
! 1788: }
! 1789: if (++count >= GET2(code, 1))
! 1790: { ADD_NEW(state_offset + 6, 0); }
! 1791: else
! 1792: { ADD_NEW(state_offset, count); }
! 1793: }
! 1794: }
! 1795: break;
! 1796:
! 1797: /*-----------------------------------------------------------------*/
! 1798: case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
! 1799: case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
! 1800: case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
! 1801: case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
! 1802: if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
! 1803: { ADD_ACTIVE(state_offset + 4, 0); }
! 1804: count = current_state->count; /* Number already matched */
! 1805: if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
! 1806: {
! 1807: const uschar *nptr = ptr + clen;
! 1808: int ncount = 0;
! 1809: if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
! 1810: {
! 1811: active_count--; /* Remove non-match possibility */
! 1812: next_active_state--;
! 1813: }
! 1814: while (nptr < end_subject)
! 1815: {
! 1816: int nd;
! 1817: int ndlen = 1;
! 1818: GETCHARLEN(nd, nptr, ndlen);
! 1819: if (UCD_CATEGORY(nd) != ucp_M) break;
! 1820: ncount++;
! 1821: nptr += ndlen;
! 1822: }
! 1823: if (++count >= GET2(code, 1))
! 1824: { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
! 1825: else
! 1826: { ADD_NEW_DATA(-state_offset, count, ncount); }
! 1827: }
! 1828: break;
! 1829: #endif
! 1830:
! 1831: /*-----------------------------------------------------------------*/
! 1832: case OP_ANYNL_EXTRA + OP_TYPEEXACT:
! 1833: case OP_ANYNL_EXTRA + OP_TYPEUPTO:
! 1834: case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
! 1835: case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
! 1836: if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
! 1837: { ADD_ACTIVE(state_offset + 4, 0); }
! 1838: count = current_state->count; /* Number already matched */
! 1839: if (clen > 0)
! 1840: {
! 1841: int ncount = 0;
! 1842: switch (c)
! 1843: {
! 1844: case 0x000b:
! 1845: case 0x000c:
! 1846: case 0x0085:
! 1847: case 0x2028:
! 1848: case 0x2029:
! 1849: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
! 1850: goto ANYNL03;
! 1851:
! 1852: case 0x000d:
! 1853: if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
! 1854: /* Fall through */
! 1855:
! 1856: ANYNL03:
! 1857: case 0x000a:
! 1858: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
! 1859: {
! 1860: active_count--; /* Remove non-match possibility */
! 1861: next_active_state--;
! 1862: }
! 1863: if (++count >= GET2(code, 1))
! 1864: { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
! 1865: else
! 1866: { ADD_NEW_DATA(-state_offset, count, ncount); }
! 1867: break;
! 1868:
! 1869: default:
! 1870: break;
! 1871: }
! 1872: }
! 1873: break;
! 1874:
! 1875: /*-----------------------------------------------------------------*/
! 1876: case OP_VSPACE_EXTRA + OP_TYPEEXACT:
! 1877: case OP_VSPACE_EXTRA + OP_TYPEUPTO:
! 1878: case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
! 1879: case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
! 1880: if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
! 1881: { ADD_ACTIVE(state_offset + 4, 0); }
! 1882: count = current_state->count; /* Number already matched */
! 1883: if (clen > 0)
! 1884: {
! 1885: BOOL OK;
! 1886: switch (c)
! 1887: {
! 1888: case 0x000a:
! 1889: case 0x000b:
! 1890: case 0x000c:
! 1891: case 0x000d:
! 1892: case 0x0085:
! 1893: case 0x2028:
! 1894: case 0x2029:
! 1895: OK = TRUE;
! 1896: break;
! 1897:
! 1898: default:
! 1899: OK = FALSE;
! 1900: }
! 1901:
! 1902: if (OK == (d == OP_VSPACE))
! 1903: {
! 1904: if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
! 1905: {
! 1906: active_count--; /* Remove non-match possibility */
! 1907: next_active_state--;
! 1908: }
! 1909: if (++count >= GET2(code, 1))
! 1910: { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
! 1911: else
! 1912: { ADD_NEW_DATA(-state_offset, count, 0); }
! 1913: }
! 1914: }
! 1915: break;
! 1916:
! 1917: /*-----------------------------------------------------------------*/
! 1918: case OP_HSPACE_EXTRA + OP_TYPEEXACT:
! 1919: case OP_HSPACE_EXTRA + OP_TYPEUPTO:
! 1920: case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
! 1921: case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
! 1922: if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
! 1923: { ADD_ACTIVE(state_offset + 4, 0); }
! 1924: count = current_state->count; /* Number already matched */
! 1925: if (clen > 0)
! 1926: {
! 1927: BOOL OK;
! 1928: switch (c)
! 1929: {
! 1930: case 0x09: /* HT */
! 1931: case 0x20: /* SPACE */
! 1932: case 0xa0: /* NBSP */
! 1933: case 0x1680: /* OGHAM SPACE MARK */
! 1934: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
! 1935: case 0x2000: /* EN QUAD */
! 1936: case 0x2001: /* EM QUAD */
! 1937: case 0x2002: /* EN SPACE */
! 1938: case 0x2003: /* EM SPACE */
! 1939: case 0x2004: /* THREE-PER-EM SPACE */
! 1940: case 0x2005: /* FOUR-PER-EM SPACE */
! 1941: case 0x2006: /* SIX-PER-EM SPACE */
! 1942: case 0x2007: /* FIGURE SPACE */
! 1943: case 0x2008: /* PUNCTUATION SPACE */
! 1944: case 0x2009: /* THIN SPACE */
! 1945: case 0x200A: /* HAIR SPACE */
! 1946: case 0x202f: /* NARROW NO-BREAK SPACE */
! 1947: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
! 1948: case 0x3000: /* IDEOGRAPHIC SPACE */
! 1949: OK = TRUE;
! 1950: break;
! 1951:
! 1952: default:
! 1953: OK = FALSE;
! 1954: break;
! 1955: }
! 1956:
! 1957: if (OK == (d == OP_HSPACE))
! 1958: {
! 1959: if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
! 1960: {
! 1961: active_count--; /* Remove non-match possibility */
! 1962: next_active_state--;
! 1963: }
! 1964: if (++count >= GET2(code, 1))
! 1965: { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
! 1966: else
! 1967: { ADD_NEW_DATA(-state_offset, count, 0); }
! 1968: }
! 1969: }
! 1970: break;
! 1971:
! 1972: /* ========================================================================== */
! 1973: /* These opcodes are followed by a character that is usually compared
! 1974: to the current subject character; it is loaded into d. We still get
! 1975: here even if there is no subject character, because in some cases zero
! 1976: repetitions are permitted. */
! 1977:
! 1978: /*-----------------------------------------------------------------*/
! 1979: case OP_CHAR:
! 1980: if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
! 1981: break;
! 1982:
! 1983: /*-----------------------------------------------------------------*/
! 1984: case OP_CHARI:
! 1985: if (clen == 0) break;
! 1986:
! 1987: #ifdef SUPPORT_UTF8
! 1988: if (utf8)
! 1989: {
! 1990: if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
! 1991: {
! 1992: unsigned int othercase;
! 1993: if (c < 128) othercase = fcc[c]; else
! 1994:
! 1995: /* If we have Unicode property support, we can use it to test the
! 1996: other case of the character. */
! 1997:
! 1998: #ifdef SUPPORT_UCP
! 1999: othercase = UCD_OTHERCASE(c);
! 2000: #else
! 2001: othercase = NOTACHAR;
! 2002: #endif
! 2003:
! 2004: if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
! 2005: }
! 2006: }
! 2007: else
! 2008: #endif /* SUPPORT_UTF8 */
! 2009:
! 2010: /* Non-UTF-8 mode */
! 2011: {
! 2012: if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
! 2013: }
! 2014: break;
! 2015:
! 2016:
! 2017: #ifdef SUPPORT_UCP
! 2018: /*-----------------------------------------------------------------*/
! 2019: /* This is a tricky one because it can match more than one character.
! 2020: Find out how many characters to skip, and then set up a negative state
! 2021: to wait for them to pass before continuing. */
! 2022:
! 2023: case OP_EXTUNI:
! 2024: if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
! 2025: {
! 2026: const uschar *nptr = ptr + clen;
! 2027: int ncount = 0;
! 2028: while (nptr < end_subject)
! 2029: {
! 2030: int nclen = 1;
! 2031: GETCHARLEN(c, nptr, nclen);
! 2032: if (UCD_CATEGORY(c) != ucp_M) break;
! 2033: ncount++;
! 2034: nptr += nclen;
! 2035: }
! 2036: ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
! 2037: }
! 2038: break;
! 2039: #endif
! 2040:
! 2041: /*-----------------------------------------------------------------*/
! 2042: /* This is tricky like EXTUNI because it too can match more than one
! 2043: character (when CR is followed by LF). In this case, set up a negative
! 2044: state to wait for one character to pass before continuing. */
! 2045:
! 2046: case OP_ANYNL:
! 2047: if (clen > 0) switch(c)
! 2048: {
! 2049: case 0x000b:
! 2050: case 0x000c:
! 2051: case 0x0085:
! 2052: case 0x2028:
! 2053: case 0x2029:
! 2054: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
! 2055:
! 2056: case 0x000a:
! 2057: ADD_NEW(state_offset + 1, 0);
! 2058: break;
! 2059:
! 2060: case 0x000d:
! 2061: if (ptr + 1 < end_subject && ptr[1] == 0x0a)
! 2062: {
! 2063: ADD_NEW_DATA(-(state_offset + 1), 0, 1);
! 2064: }
! 2065: else
! 2066: {
! 2067: ADD_NEW(state_offset + 1, 0);
! 2068: }
! 2069: break;
! 2070: }
! 2071: break;
! 2072:
! 2073: /*-----------------------------------------------------------------*/
! 2074: case OP_NOT_VSPACE:
! 2075: if (clen > 0) switch(c)
! 2076: {
! 2077: case 0x000a:
! 2078: case 0x000b:
! 2079: case 0x000c:
! 2080: case 0x000d:
! 2081: case 0x0085:
! 2082: case 0x2028:
! 2083: case 0x2029:
! 2084: break;
! 2085:
! 2086: default:
! 2087: ADD_NEW(state_offset + 1, 0);
! 2088: break;
! 2089: }
! 2090: break;
! 2091:
! 2092: /*-----------------------------------------------------------------*/
! 2093: case OP_VSPACE:
! 2094: if (clen > 0) switch(c)
! 2095: {
! 2096: case 0x000a:
! 2097: case 0x000b:
! 2098: case 0x000c:
! 2099: case 0x000d:
! 2100: case 0x0085:
! 2101: case 0x2028:
! 2102: case 0x2029:
! 2103: ADD_NEW(state_offset + 1, 0);
! 2104: break;
! 2105:
! 2106: default: break;
! 2107: }
! 2108: break;
! 2109:
! 2110: /*-----------------------------------------------------------------*/
! 2111: case OP_NOT_HSPACE:
! 2112: if (clen > 0) switch(c)
! 2113: {
! 2114: case 0x09: /* HT */
! 2115: case 0x20: /* SPACE */
! 2116: case 0xa0: /* NBSP */
! 2117: case 0x1680: /* OGHAM SPACE MARK */
! 2118: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
! 2119: case 0x2000: /* EN QUAD */
! 2120: case 0x2001: /* EM QUAD */
! 2121: case 0x2002: /* EN SPACE */
! 2122: case 0x2003: /* EM SPACE */
! 2123: case 0x2004: /* THREE-PER-EM SPACE */
! 2124: case 0x2005: /* FOUR-PER-EM SPACE */
! 2125: case 0x2006: /* SIX-PER-EM SPACE */
! 2126: case 0x2007: /* FIGURE SPACE */
! 2127: case 0x2008: /* PUNCTUATION SPACE */
! 2128: case 0x2009: /* THIN SPACE */
! 2129: case 0x200A: /* HAIR SPACE */
! 2130: case 0x202f: /* NARROW NO-BREAK SPACE */
! 2131: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
! 2132: case 0x3000: /* IDEOGRAPHIC SPACE */
! 2133: break;
! 2134:
! 2135: default:
! 2136: ADD_NEW(state_offset + 1, 0);
! 2137: break;
! 2138: }
! 2139: break;
! 2140:
! 2141: /*-----------------------------------------------------------------*/
! 2142: case OP_HSPACE:
! 2143: if (clen > 0) switch(c)
! 2144: {
! 2145: case 0x09: /* HT */
! 2146: case 0x20: /* SPACE */
! 2147: case 0xa0: /* NBSP */
! 2148: case 0x1680: /* OGHAM SPACE MARK */
! 2149: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
! 2150: case 0x2000: /* EN QUAD */
! 2151: case 0x2001: /* EM QUAD */
! 2152: case 0x2002: /* EN SPACE */
! 2153: case 0x2003: /* EM SPACE */
! 2154: case 0x2004: /* THREE-PER-EM SPACE */
! 2155: case 0x2005: /* FOUR-PER-EM SPACE */
! 2156: case 0x2006: /* SIX-PER-EM SPACE */
! 2157: case 0x2007: /* FIGURE SPACE */
! 2158: case 0x2008: /* PUNCTUATION SPACE */
! 2159: case 0x2009: /* THIN SPACE */
! 2160: case 0x200A: /* HAIR SPACE */
! 2161: case 0x202f: /* NARROW NO-BREAK SPACE */
! 2162: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
! 2163: case 0x3000: /* IDEOGRAPHIC SPACE */
! 2164: ADD_NEW(state_offset + 1, 0);
! 2165: break;
! 2166: }
! 2167: break;
! 2168:
! 2169: /*-----------------------------------------------------------------*/
! 2170: /* Match a negated single character casefully. This is only used for
! 2171: one-byte characters, that is, we know that d < 256. The character we are
! 2172: checking (c) can be multibyte. */
! 2173:
! 2174: case OP_NOT:
! 2175: if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
! 2176: break;
! 2177:
! 2178: /*-----------------------------------------------------------------*/
! 2179: /* Match a negated single character caselessly. This is only used for
! 2180: one-byte characters, that is, we know that d < 256. The character we are
! 2181: checking (c) can be multibyte. */
! 2182:
! 2183: case OP_NOTI:
! 2184: if (clen > 0 && c != d && c != fcc[d])
! 2185: { ADD_NEW(state_offset + dlen + 1, 0); }
! 2186: break;
! 2187:
! 2188: /*-----------------------------------------------------------------*/
! 2189: case OP_PLUSI:
! 2190: case OP_MINPLUSI:
! 2191: case OP_POSPLUSI:
! 2192: case OP_NOTPLUSI:
! 2193: case OP_NOTMINPLUSI:
! 2194: case OP_NOTPOSPLUSI:
! 2195: caseless = TRUE;
! 2196: codevalue -= OP_STARI - OP_STAR;
! 2197:
! 2198: /* Fall through */
! 2199: case OP_PLUS:
! 2200: case OP_MINPLUS:
! 2201: case OP_POSPLUS:
! 2202: case OP_NOTPLUS:
! 2203: case OP_NOTMINPLUS:
! 2204: case OP_NOTPOSPLUS:
! 2205: count = current_state->count; /* Already matched */
! 2206: if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
! 2207: if (clen > 0)
! 2208: {
! 2209: unsigned int otherd = NOTACHAR;
! 2210: if (caseless)
! 2211: {
! 2212: #ifdef SUPPORT_UTF8
! 2213: if (utf8 && d >= 128)
! 2214: {
! 2215: #ifdef SUPPORT_UCP
! 2216: otherd = UCD_OTHERCASE(d);
! 2217: #endif /* SUPPORT_UCP */
! 2218: }
! 2219: else
! 2220: #endif /* SUPPORT_UTF8 */
! 2221: otherd = fcc[d];
! 2222: }
! 2223: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
! 2224: {
! 2225: if (count > 0 &&
! 2226: (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
! 2227: {
! 2228: active_count--; /* Remove non-match possibility */
! 2229: next_active_state--;
! 2230: }
! 2231: count++;
! 2232: ADD_NEW(state_offset, count);
! 2233: }
! 2234: }
! 2235: break;
! 2236:
! 2237: /*-----------------------------------------------------------------*/
! 2238: case OP_QUERYI:
! 2239: case OP_MINQUERYI:
! 2240: case OP_POSQUERYI:
! 2241: case OP_NOTQUERYI:
! 2242: case OP_NOTMINQUERYI:
! 2243: case OP_NOTPOSQUERYI:
! 2244: caseless = TRUE;
! 2245: codevalue -= OP_STARI - OP_STAR;
! 2246: /* Fall through */
! 2247: case OP_QUERY:
! 2248: case OP_MINQUERY:
! 2249: case OP_POSQUERY:
! 2250: case OP_NOTQUERY:
! 2251: case OP_NOTMINQUERY:
! 2252: case OP_NOTPOSQUERY:
! 2253: ADD_ACTIVE(state_offset + dlen + 1, 0);
! 2254: if (clen > 0)
! 2255: {
! 2256: unsigned int otherd = NOTACHAR;
! 2257: if (caseless)
! 2258: {
! 2259: #ifdef SUPPORT_UTF8
! 2260: if (utf8 && d >= 128)
! 2261: {
! 2262: #ifdef SUPPORT_UCP
! 2263: otherd = UCD_OTHERCASE(d);
! 2264: #endif /* SUPPORT_UCP */
! 2265: }
! 2266: else
! 2267: #endif /* SUPPORT_UTF8 */
! 2268: otherd = fcc[d];
! 2269: }
! 2270: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
! 2271: {
! 2272: if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
! 2273: {
! 2274: active_count--; /* Remove non-match possibility */
! 2275: next_active_state--;
! 2276: }
! 2277: ADD_NEW(state_offset + dlen + 1, 0);
! 2278: }
! 2279: }
! 2280: break;
! 2281:
! 2282: /*-----------------------------------------------------------------*/
! 2283: case OP_STARI:
! 2284: case OP_MINSTARI:
! 2285: case OP_POSSTARI:
! 2286: case OP_NOTSTARI:
! 2287: case OP_NOTMINSTARI:
! 2288: case OP_NOTPOSSTARI:
! 2289: caseless = TRUE;
! 2290: codevalue -= OP_STARI - OP_STAR;
! 2291: /* Fall through */
! 2292: case OP_STAR:
! 2293: case OP_MINSTAR:
! 2294: case OP_POSSTAR:
! 2295: case OP_NOTSTAR:
! 2296: case OP_NOTMINSTAR:
! 2297: case OP_NOTPOSSTAR:
! 2298: ADD_ACTIVE(state_offset + dlen + 1, 0);
! 2299: if (clen > 0)
! 2300: {
! 2301: unsigned int otherd = NOTACHAR;
! 2302: if (caseless)
! 2303: {
! 2304: #ifdef SUPPORT_UTF8
! 2305: if (utf8 && d >= 128)
! 2306: {
! 2307: #ifdef SUPPORT_UCP
! 2308: otherd = UCD_OTHERCASE(d);
! 2309: #endif /* SUPPORT_UCP */
! 2310: }
! 2311: else
! 2312: #endif /* SUPPORT_UTF8 */
! 2313: otherd = fcc[d];
! 2314: }
! 2315: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
! 2316: {
! 2317: if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
! 2318: {
! 2319: active_count--; /* Remove non-match possibility */
! 2320: next_active_state--;
! 2321: }
! 2322: ADD_NEW(state_offset, 0);
! 2323: }
! 2324: }
! 2325: break;
! 2326:
! 2327: /*-----------------------------------------------------------------*/
! 2328: case OP_EXACTI:
! 2329: case OP_NOTEXACTI:
! 2330: caseless = TRUE;
! 2331: codevalue -= OP_STARI - OP_STAR;
! 2332: /* Fall through */
! 2333: case OP_EXACT:
! 2334: case OP_NOTEXACT:
! 2335: count = current_state->count; /* Number already matched */
! 2336: if (clen > 0)
! 2337: {
! 2338: unsigned int otherd = NOTACHAR;
! 2339: if (caseless)
! 2340: {
! 2341: #ifdef SUPPORT_UTF8
! 2342: if (utf8 && d >= 128)
! 2343: {
! 2344: #ifdef SUPPORT_UCP
! 2345: otherd = UCD_OTHERCASE(d);
! 2346: #endif /* SUPPORT_UCP */
! 2347: }
! 2348: else
! 2349: #endif /* SUPPORT_UTF8 */
! 2350: otherd = fcc[d];
! 2351: }
! 2352: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
! 2353: {
! 2354: if (++count >= GET2(code, 1))
! 2355: { ADD_NEW(state_offset + dlen + 3, 0); }
! 2356: else
! 2357: { ADD_NEW(state_offset, count); }
! 2358: }
! 2359: }
! 2360: break;
! 2361:
! 2362: /*-----------------------------------------------------------------*/
! 2363: case OP_UPTOI:
! 2364: case OP_MINUPTOI:
! 2365: case OP_POSUPTOI:
! 2366: case OP_NOTUPTOI:
! 2367: case OP_NOTMINUPTOI:
! 2368: case OP_NOTPOSUPTOI:
! 2369: caseless = TRUE;
! 2370: codevalue -= OP_STARI - OP_STAR;
! 2371: /* Fall through */
! 2372: case OP_UPTO:
! 2373: case OP_MINUPTO:
! 2374: case OP_POSUPTO:
! 2375: case OP_NOTUPTO:
! 2376: case OP_NOTMINUPTO:
! 2377: case OP_NOTPOSUPTO:
! 2378: ADD_ACTIVE(state_offset + dlen + 3, 0);
! 2379: count = current_state->count; /* Number already matched */
! 2380: if (clen > 0)
! 2381: {
! 2382: unsigned int otherd = NOTACHAR;
! 2383: if (caseless)
! 2384: {
! 2385: #ifdef SUPPORT_UTF8
! 2386: if (utf8 && d >= 128)
! 2387: {
! 2388: #ifdef SUPPORT_UCP
! 2389: otherd = UCD_OTHERCASE(d);
! 2390: #endif /* SUPPORT_UCP */
! 2391: }
! 2392: else
! 2393: #endif /* SUPPORT_UTF8 */
! 2394: otherd = fcc[d];
! 2395: }
! 2396: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
! 2397: {
! 2398: if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
! 2399: {
! 2400: active_count--; /* Remove non-match possibility */
! 2401: next_active_state--;
! 2402: }
! 2403: if (++count >= GET2(code, 1))
! 2404: { ADD_NEW(state_offset + dlen + 3, 0); }
! 2405: else
! 2406: { ADD_NEW(state_offset, count); }
! 2407: }
! 2408: }
! 2409: break;
! 2410:
! 2411:
! 2412: /* ========================================================================== */
! 2413: /* These are the class-handling opcodes */
! 2414:
! 2415: case OP_CLASS:
! 2416: case OP_NCLASS:
! 2417: case OP_XCLASS:
! 2418: {
! 2419: BOOL isinclass = FALSE;
! 2420: int next_state_offset;
! 2421: const uschar *ecode;
! 2422:
! 2423: /* For a simple class, there is always just a 32-byte table, and we
! 2424: can set isinclass from it. */
! 2425:
! 2426: if (codevalue != OP_XCLASS)
! 2427: {
! 2428: ecode = code + 33;
! 2429: if (clen > 0)
! 2430: {
! 2431: isinclass = (c > 255)? (codevalue == OP_NCLASS) :
! 2432: ((code[1 + c/8] & (1 << (c&7))) != 0);
! 2433: }
! 2434: }
! 2435:
! 2436: /* An extended class may have a table or a list of single characters,
! 2437: ranges, or both, and it may be positive or negative. There's a
! 2438: function that sorts all this out. */
! 2439:
! 2440: else
! 2441: {
! 2442: ecode = code + GET(code, 1);
! 2443: if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
! 2444: }
! 2445:
! 2446: /* At this point, isinclass is set for all kinds of class, and ecode
! 2447: points to the byte after the end of the class. If there is a
! 2448: quantifier, this is where it will be. */
! 2449:
! 2450: next_state_offset = (int)(ecode - start_code);
! 2451:
! 2452: switch (*ecode)
! 2453: {
! 2454: case OP_CRSTAR:
! 2455: case OP_CRMINSTAR:
! 2456: ADD_ACTIVE(next_state_offset + 1, 0);
! 2457: if (isinclass) { ADD_NEW(state_offset, 0); }
! 2458: break;
! 2459:
! 2460: case OP_CRPLUS:
! 2461: case OP_CRMINPLUS:
! 2462: count = current_state->count; /* Already matched */
! 2463: if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
! 2464: if (isinclass) { count++; ADD_NEW(state_offset, count); }
! 2465: break;
! 2466:
! 2467: case OP_CRQUERY:
! 2468: case OP_CRMINQUERY:
! 2469: ADD_ACTIVE(next_state_offset + 1, 0);
! 2470: if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
! 2471: break;
! 2472:
! 2473: case OP_CRRANGE:
! 2474: case OP_CRMINRANGE:
! 2475: count = current_state->count; /* Already matched */
! 2476: if (count >= GET2(ecode, 1))
! 2477: { ADD_ACTIVE(next_state_offset + 5, 0); }
! 2478: if (isinclass)
! 2479: {
! 2480: int max = GET2(ecode, 3);
! 2481: if (++count >= max && max != 0) /* Max 0 => no limit */
! 2482: { ADD_NEW(next_state_offset + 5, 0); }
! 2483: else
! 2484: { ADD_NEW(state_offset, count); }
! 2485: }
! 2486: break;
! 2487:
! 2488: default:
! 2489: if (isinclass) { ADD_NEW(next_state_offset, 0); }
! 2490: break;
! 2491: }
! 2492: }
! 2493: break;
! 2494:
! 2495: /* ========================================================================== */
! 2496: /* These are the opcodes for fancy brackets of various kinds. We have
! 2497: to use recursion in order to handle them. The "always failing" assertion
! 2498: (?!) is optimised to OP_FAIL when compiling, so we have to support that,
! 2499: though the other "backtracking verbs" are not supported. */
! 2500:
! 2501: case OP_FAIL:
! 2502: forced_fail++; /* Count FAILs for multiple states */
! 2503: break;
! 2504:
! 2505: case OP_ASSERT:
! 2506: case OP_ASSERT_NOT:
! 2507: case OP_ASSERTBACK:
! 2508: case OP_ASSERTBACK_NOT:
! 2509: {
! 2510: int rc;
! 2511: int local_offsets[2];
! 2512: int local_workspace[1000];
! 2513: const uschar *endasscode = code + GET(code, 1);
! 2514:
! 2515: while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
! 2516:
! 2517: rc = internal_dfa_exec(
! 2518: md, /* static match data */
! 2519: code, /* this subexpression's code */
! 2520: ptr, /* where we currently are */
! 2521: (int)(ptr - start_subject), /* start offset */
! 2522: local_offsets, /* offset vector */
! 2523: sizeof(local_offsets)/sizeof(int), /* size of same */
! 2524: local_workspace, /* workspace vector */
! 2525: sizeof(local_workspace)/sizeof(int), /* size of same */
! 2526: rlevel); /* function recursion level */
! 2527:
! 2528: if (rc == PCRE_ERROR_DFA_UITEM) return rc;
! 2529: if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
! 2530: { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
! 2531: }
! 2532: break;
! 2533:
! 2534: /*-----------------------------------------------------------------*/
! 2535: case OP_COND:
! 2536: case OP_SCOND:
! 2537: {
! 2538: int local_offsets[1000];
! 2539: int local_workspace[1000];
! 2540: int codelink = GET(code, 1);
! 2541: int condcode;
! 2542:
! 2543: /* Because of the way auto-callout works during compile, a callout item
! 2544: is inserted between OP_COND and an assertion condition. This does not
! 2545: happen for the other conditions. */
! 2546:
! 2547: if (code[LINK_SIZE+1] == OP_CALLOUT)
! 2548: {
! 2549: rrc = 0;
! 2550: if (pcre_callout != NULL)
! 2551: {
! 2552: pcre_callout_block cb;
! 2553: cb.version = 1; /* Version 1 of the callout block */
! 2554: cb.callout_number = code[LINK_SIZE+2];
! 2555: cb.offset_vector = offsets;
! 2556: cb.subject = (PCRE_SPTR)start_subject;
! 2557: cb.subject_length = (int)(end_subject - start_subject);
! 2558: cb.start_match = (int)(current_subject - start_subject);
! 2559: cb.current_position = (int)(ptr - start_subject);
! 2560: cb.pattern_position = GET(code, LINK_SIZE + 3);
! 2561: cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
! 2562: cb.capture_top = 1;
! 2563: cb.capture_last = -1;
! 2564: cb.callout_data = md->callout_data;
! 2565: cb.mark = NULL; /* No (*MARK) support */
! 2566: if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
! 2567: }
! 2568: if (rrc > 0) break; /* Fail this thread */
! 2569: code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
! 2570: }
! 2571:
! 2572: condcode = code[LINK_SIZE+1];
! 2573:
! 2574: /* Back reference conditions are not supported */
! 2575:
! 2576: if (condcode == OP_CREF || condcode == OP_NCREF)
! 2577: return PCRE_ERROR_DFA_UCOND;
! 2578:
! 2579: /* The DEFINE condition is always false */
! 2580:
! 2581: if (condcode == OP_DEF)
! 2582: { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
! 2583:
! 2584: /* The only supported version of OP_RREF is for the value RREF_ANY,
! 2585: which means "test if in any recursion". We can't test for specifically
! 2586: recursed groups. */
! 2587:
! 2588: else if (condcode == OP_RREF || condcode == OP_NRREF)
! 2589: {
! 2590: int value = GET2(code, LINK_SIZE+2);
! 2591: if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
! 2592: if (md->recursive != NULL)
! 2593: { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
! 2594: else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
! 2595: }
! 2596:
! 2597: /* Otherwise, the condition is an assertion */
! 2598:
! 2599: else
! 2600: {
! 2601: int rc;
! 2602: const uschar *asscode = code + LINK_SIZE + 1;
! 2603: const uschar *endasscode = asscode + GET(asscode, 1);
! 2604:
! 2605: while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
! 2606:
! 2607: rc = internal_dfa_exec(
! 2608: md, /* fixed match data */
! 2609: asscode, /* this subexpression's code */
! 2610: ptr, /* where we currently are */
! 2611: (int)(ptr - start_subject), /* start offset */
! 2612: local_offsets, /* offset vector */
! 2613: sizeof(local_offsets)/sizeof(int), /* size of same */
! 2614: local_workspace, /* workspace vector */
! 2615: sizeof(local_workspace)/sizeof(int), /* size of same */
! 2616: rlevel); /* function recursion level */
! 2617:
! 2618: if (rc == PCRE_ERROR_DFA_UITEM) return rc;
! 2619: if ((rc >= 0) ==
! 2620: (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
! 2621: { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
! 2622: else
! 2623: { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
! 2624: }
! 2625: }
! 2626: break;
! 2627:
! 2628: /*-----------------------------------------------------------------*/
! 2629: case OP_RECURSE:
! 2630: {
! 2631: dfa_recursion_info *ri;
! 2632: int local_offsets[1000];
! 2633: int local_workspace[1000];
! 2634: const uschar *callpat = start_code + GET(code, 1);
! 2635: int recno = (callpat == md->start_code)? 0 :
! 2636: GET2(callpat, 1 + LINK_SIZE);
! 2637: int rc;
! 2638:
! 2639: DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
! 2640:
! 2641: /* Check for repeating a recursion without advancing the subject
! 2642: pointer. This should catch convoluted mutual recursions. (Some simple
! 2643: cases are caught at compile time.) */
! 2644:
! 2645: for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
! 2646: if (recno == ri->group_num && ptr == ri->subject_position)
! 2647: return PCRE_ERROR_RECURSELOOP;
! 2648:
! 2649: /* Remember this recursion and where we started it so as to
! 2650: catch infinite loops. */
! 2651:
! 2652: new_recursive.group_num = recno;
! 2653: new_recursive.subject_position = ptr;
! 2654: new_recursive.prevrec = md->recursive;
! 2655: md->recursive = &new_recursive;
! 2656:
! 2657: rc = internal_dfa_exec(
! 2658: md, /* fixed match data */
! 2659: callpat, /* this subexpression's code */
! 2660: ptr, /* where we currently are */
! 2661: (int)(ptr - start_subject), /* start offset */
! 2662: local_offsets, /* offset vector */
! 2663: sizeof(local_offsets)/sizeof(int), /* size of same */
! 2664: local_workspace, /* workspace vector */
! 2665: sizeof(local_workspace)/sizeof(int), /* size of same */
! 2666: rlevel); /* function recursion level */
! 2667:
! 2668: md->recursive = new_recursive.prevrec; /* Done this recursion */
! 2669:
! 2670: DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
! 2671: rc));
! 2672:
! 2673: /* Ran out of internal offsets */
! 2674:
! 2675: if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
! 2676:
! 2677: /* For each successful matched substring, set up the next state with a
! 2678: count of characters to skip before trying it. Note that the count is in
! 2679: characters, not bytes. */
! 2680:
! 2681: if (rc > 0)
! 2682: {
! 2683: for (rc = rc*2 - 2; rc >= 0; rc -= 2)
! 2684: {
! 2685: const uschar *p = start_subject + local_offsets[rc];
! 2686: const uschar *pp = start_subject + local_offsets[rc+1];
! 2687: int charcount = local_offsets[rc+1] - local_offsets[rc];
! 2688: while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
! 2689: if (charcount > 0)
! 2690: {
! 2691: ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
! 2692: }
! 2693: else
! 2694: {
! 2695: ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
! 2696: }
! 2697: }
! 2698: }
! 2699: else if (rc != PCRE_ERROR_NOMATCH) return rc;
! 2700: }
! 2701: break;
! 2702:
! 2703: /*-----------------------------------------------------------------*/
! 2704: case OP_BRAPOS:
! 2705: case OP_SBRAPOS:
! 2706: case OP_CBRAPOS:
! 2707: case OP_SCBRAPOS:
! 2708: case OP_BRAPOSZERO:
! 2709: {
! 2710: int charcount, matched_count;
! 2711: const uschar *local_ptr = ptr;
! 2712: BOOL allow_zero;
! 2713:
! 2714: if (codevalue == OP_BRAPOSZERO)
! 2715: {
! 2716: allow_zero = TRUE;
! 2717: codevalue = *(++code); /* Codevalue will be one of above BRAs */
! 2718: }
! 2719: else allow_zero = FALSE;
! 2720:
! 2721: /* Loop to match the subpattern as many times as possible as if it were
! 2722: a complete pattern. */
! 2723:
! 2724: for (matched_count = 0;; matched_count++)
! 2725: {
! 2726: int local_offsets[2];
! 2727: int local_workspace[1000];
! 2728:
! 2729: int rc = internal_dfa_exec(
! 2730: md, /* fixed match data */
! 2731: code, /* this subexpression's code */
! 2732: local_ptr, /* where we currently are */
! 2733: (int)(ptr - start_subject), /* start offset */
! 2734: local_offsets, /* offset vector */
! 2735: sizeof(local_offsets)/sizeof(int), /* size of same */
! 2736: local_workspace, /* workspace vector */
! 2737: sizeof(local_workspace)/sizeof(int), /* size of same */
! 2738: rlevel); /* function recursion level */
! 2739:
! 2740: /* Failed to match */
! 2741:
! 2742: if (rc < 0)
! 2743: {
! 2744: if (rc != PCRE_ERROR_NOMATCH) return rc;
! 2745: break;
! 2746: }
! 2747:
! 2748: /* Matched: break the loop if zero characters matched. */
! 2749:
! 2750: charcount = local_offsets[1] - local_offsets[0];
! 2751: if (charcount == 0) break;
! 2752: local_ptr += charcount; /* Advance temporary position ptr */
! 2753: }
! 2754:
! 2755: /* At this point we have matched the subpattern matched_count
! 2756: times, and local_ptr is pointing to the character after the end of the
! 2757: last match. */
! 2758:
! 2759: if (matched_count > 0 || allow_zero)
! 2760: {
! 2761: const uschar *end_subpattern = code;
! 2762: int next_state_offset;
! 2763:
! 2764: do { end_subpattern += GET(end_subpattern, 1); }
! 2765: while (*end_subpattern == OP_ALT);
! 2766: next_state_offset =
! 2767: (int)(end_subpattern - start_code + LINK_SIZE + 1);
! 2768:
! 2769: /* Optimization: if there are no more active states, and there
! 2770: are no new states yet set up, then skip over the subject string
! 2771: right here, to save looping. Otherwise, set up the new state to swing
! 2772: into action when the end of the matched substring is reached. */
! 2773:
! 2774: if (i + 1 >= active_count && new_count == 0)
! 2775: {
! 2776: ptr = local_ptr;
! 2777: clen = 0;
! 2778: ADD_NEW(next_state_offset, 0);
! 2779: }
! 2780: else
! 2781: {
! 2782: const uschar *p = ptr;
! 2783: const uschar *pp = local_ptr;
! 2784: charcount = (int)(pp - p);
! 2785: while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
! 2786: ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
! 2787: }
! 2788: }
! 2789: }
! 2790: break;
! 2791:
! 2792: /*-----------------------------------------------------------------*/
! 2793: case OP_ONCE:
! 2794: case OP_ONCE_NC:
! 2795: {
! 2796: int local_offsets[2];
! 2797: int local_workspace[1000];
! 2798:
! 2799: int rc = internal_dfa_exec(
! 2800: md, /* fixed match data */
! 2801: code, /* this subexpression's code */
! 2802: ptr, /* where we currently are */
! 2803: (int)(ptr - start_subject), /* start offset */
! 2804: local_offsets, /* offset vector */
! 2805: sizeof(local_offsets)/sizeof(int), /* size of same */
! 2806: local_workspace, /* workspace vector */
! 2807: sizeof(local_workspace)/sizeof(int), /* size of same */
! 2808: rlevel); /* function recursion level */
! 2809:
! 2810: if (rc >= 0)
! 2811: {
! 2812: const uschar *end_subpattern = code;
! 2813: int charcount = local_offsets[1] - local_offsets[0];
! 2814: int next_state_offset, repeat_state_offset;
! 2815:
! 2816: do { end_subpattern += GET(end_subpattern, 1); }
! 2817: while (*end_subpattern == OP_ALT);
! 2818: next_state_offset =
! 2819: (int)(end_subpattern - start_code + LINK_SIZE + 1);
! 2820:
! 2821: /* If the end of this subpattern is KETRMAX or KETRMIN, we must
! 2822: arrange for the repeat state also to be added to the relevant list.
! 2823: Calculate the offset, or set -1 for no repeat. */
! 2824:
! 2825: repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
! 2826: *end_subpattern == OP_KETRMIN)?
! 2827: (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
! 2828:
! 2829: /* If we have matched an empty string, add the next state at the
! 2830: current character pointer. This is important so that the duplicate
! 2831: checking kicks in, which is what breaks infinite loops that match an
! 2832: empty string. */
! 2833:
! 2834: if (charcount == 0)
! 2835: {
! 2836: ADD_ACTIVE(next_state_offset, 0);
! 2837: }
! 2838:
! 2839: /* Optimization: if there are no more active states, and there
! 2840: are no new states yet set up, then skip over the subject string
! 2841: right here, to save looping. Otherwise, set up the new state to swing
! 2842: into action when the end of the matched substring is reached. */
! 2843:
! 2844: else if (i + 1 >= active_count && new_count == 0)
! 2845: {
! 2846: ptr += charcount;
! 2847: clen = 0;
! 2848: ADD_NEW(next_state_offset, 0);
! 2849:
! 2850: /* If we are adding a repeat state at the new character position,
! 2851: we must fudge things so that it is the only current state.
! 2852: Otherwise, it might be a duplicate of one we processed before, and
! 2853: that would cause it to be skipped. */
! 2854:
! 2855: if (repeat_state_offset >= 0)
! 2856: {
! 2857: next_active_state = active_states;
! 2858: active_count = 0;
! 2859: i = -1;
! 2860: ADD_ACTIVE(repeat_state_offset, 0);
! 2861: }
! 2862: }
! 2863: else
! 2864: {
! 2865: const uschar *p = start_subject + local_offsets[0];
! 2866: const uschar *pp = start_subject + local_offsets[1];
! 2867: while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
! 2868: ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
! 2869: if (repeat_state_offset >= 0)
! 2870: { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
! 2871: }
! 2872: }
! 2873: else if (rc != PCRE_ERROR_NOMATCH) return rc;
! 2874: }
! 2875: break;
! 2876:
! 2877:
! 2878: /* ========================================================================== */
! 2879: /* Handle callouts */
! 2880:
! 2881: case OP_CALLOUT:
! 2882: rrc = 0;
! 2883: if (pcre_callout != NULL)
! 2884: {
! 2885: pcre_callout_block cb;
! 2886: cb.version = 1; /* Version 1 of the callout block */
! 2887: cb.callout_number = code[1];
! 2888: cb.offset_vector = offsets;
! 2889: cb.subject = (PCRE_SPTR)start_subject;
! 2890: cb.subject_length = (int)(end_subject - start_subject);
! 2891: cb.start_match = (int)(current_subject - start_subject);
! 2892: cb.current_position = (int)(ptr - start_subject);
! 2893: cb.pattern_position = GET(code, 2);
! 2894: cb.next_item_length = GET(code, 2 + LINK_SIZE);
! 2895: cb.capture_top = 1;
! 2896: cb.capture_last = -1;
! 2897: cb.callout_data = md->callout_data;
! 2898: cb.mark = NULL; /* No (*MARK) support */
! 2899: if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
! 2900: }
! 2901: if (rrc == 0)
! 2902: { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
! 2903: break;
! 2904:
! 2905:
! 2906: /* ========================================================================== */
! 2907: default: /* Unsupported opcode */
! 2908: return PCRE_ERROR_DFA_UITEM;
! 2909: }
! 2910:
! 2911: NEXT_ACTIVE_STATE: continue;
! 2912:
! 2913: } /* End of loop scanning active states */
! 2914:
! 2915: /* We have finished the processing at the current subject character. If no
! 2916: new states have been set for the next character, we have found all the
! 2917: matches that we are going to find. If we are at the top level and partial
! 2918: matching has been requested, check for appropriate conditions.
! 2919:
! 2920: The "forced_fail" variable counts the number of (*F) encountered for the
! 2921: character. If it is equal to the original active_count (saved in
! 2922: workspace[1]) it means that (*F) was found on every active state. In this
! 2923: case we don't want to give a partial match.
! 2924:
! 2925: The "could_continue" variable is true if a state could have continued but
! 2926: for the fact that the end of the subject was reached. */
! 2927:
! 2928: if (new_count <= 0)
! 2929: {
! 2930: if (rlevel == 1 && /* Top level, and */
! 2931: could_continue && /* Some could go on */
! 2932: forced_fail != workspace[1] && /* Not all forced fail & */
! 2933: ( /* either... */
! 2934: (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
! 2935: || /* or... */
! 2936: ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
! 2937: match_count < 0) /* no matches */
! 2938: ) && /* And... */
! 2939: ptr >= end_subject && /* Reached end of subject */
! 2940: ptr > md->start_used_ptr) /* Inspected non-empty string */
! 2941: {
! 2942: if (offsetcount >= 2)
! 2943: {
! 2944: offsets[0] = (int)(md->start_used_ptr - start_subject);
! 2945: offsets[1] = (int)(end_subject - start_subject);
! 2946: }
! 2947: match_count = PCRE_ERROR_PARTIAL;
! 2948: }
! 2949:
! 2950: DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
! 2951: "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
! 2952: rlevel*2-2, SP));
! 2953: break; /* In effect, "return", but see the comment below */
! 2954: }
! 2955:
! 2956: /* One or more states are active for the next character. */
! 2957:
! 2958: ptr += clen; /* Advance to next subject character */
! 2959: } /* Loop to move along the subject string */
! 2960:
! 2961: /* Control gets here from "break" a few lines above. We do it this way because
! 2962: if we use "return" above, we have compiler trouble. Some compilers warn if
! 2963: there's nothing here because they think the function doesn't return a value. On
! 2964: the other hand, if we put a dummy statement here, some more clever compilers
! 2965: complain that it can't be reached. Sigh. */
! 2966:
! 2967: return match_count;
! 2968: }
! 2969:
! 2970:
! 2971:
! 2972:
! 2973: /*************************************************
! 2974: * Execute a Regular Expression - DFA engine *
! 2975: *************************************************/
! 2976:
! 2977: /* This external function applies a compiled re to a subject string using a DFA
! 2978: engine. For an unanchored match it calls the internal matching function once per
! 2979: start position, stopping at the first result other than "no match".
! 2980:
! 2981: Arguments:
! 2982: argument_re points to the compiled expression (must not be NULL)
! 2983: extra_data points to extra data (study data, callout data, tables) or is NULL
! 2984: subject points to the subject string (must not be NULL)
! 2985: length length of subject string (may contain binary zeros)
! 2986: start_offset where to start in the subject string (0 <= start_offset <= length)
! 2987: options option bits
! 2988: offsets vector of match offsets (may be NULL only if offsetcount is 0)
! 2989: offsetcount size of same (must not be negative)
! 2990: workspace workspace vector (must not be NULL)
! 2991: wscount size of same (must be at least 20)
! 2992:
! 2993: Returns: > 0 => number of match offset pairs placed in offsets
! 2994: = 0 => offsets overflowed; longest matches are present
! 2995: -1 => failed to match
! 2996: < -1 => some kind of unexpected problem
! 2997: */
! 2998:
! 2999: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
! 3000: pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
! 3001: const char *subject, int length, int start_offset, int options, int *offsets,
! 3002: int offsetcount, int *workspace, int wscount)
! 3003: {
! 3004: real_pcre *re = (real_pcre *)argument_re;
! 3005: dfa_match_data match_block;
! 3006: dfa_match_data *md = &match_block;
! 3007: BOOL utf8, anchored, startline, firstline;
! 3008: const uschar *current_subject, *end_subject, *lcc;
! 3009:
! 3010: pcre_study_data internal_study; /* receives flipped study data, if needed */
! 3011: const pcre_study_data *study = NULL;
! 3012: real_pcre internal_re; /* receives the flipped pattern header, if needed */
! 3013:
! 3014: const uschar *req_byte_ptr; /* where the required byte was last found */
! 3015: const uschar *start_bits = NULL;
! 3016: BOOL first_byte_caseless = FALSE;
! 3017: BOOL req_byte_caseless = FALSE;
! 3018: int first_byte = -1; /* -1 => no known first byte */
! 3019: int req_byte = -1; /* -1 => no known required byte */
! 3020: int req_byte2 = -1; /* req_byte with its case flipped */
! 3021: int newline;
! 3022:
! 3023: /* Plausibility checks on the arguments; each failure has its own error code. */
! 3024:
! 3025: if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
! 3026: if (re == NULL || subject == NULL || workspace == NULL ||
! 3027: (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
! 3028: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
! 3029: if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
! 3030: if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
! 3031:
! 3032: /* We need to find the pointer to any study data before we test for byte
! 3033: flipping, so we scan the extra_data block first. This may set two fields in the
! 3034: match block, so we must initialize them beforehand. However, the other fields
! 3035: in the match block must not be set until after the byte flipping. */
! 3036:
! 3037: md->tables = re->tables;
! 3038: md->callout_data = NULL;
! 3039:
! 3040: if (extra_data != NULL)
! 3041: {
! 3042: unsigned int flags = extra_data->flags;
! 3043: if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
! 3044: study = (const pcre_study_data *)extra_data->study_data;
! 3045: if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT; /* match limit is rejected here */
! 3046: if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
! 3047: return PCRE_ERROR_DFA_UMLIMIT; /* recursion limit is rejected the same way */
! 3048: if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
! 3049: md->callout_data = extra_data->callout_data;
! 3050: if ((flags & PCRE_EXTRA_TABLES) != 0)
! 3051: md->tables = extra_data->tables; /* overrides the tables pointer taken from re */
! 3052: }
! 3053:
! 3054: /* Check that the first field in the block is the magic number. If it is not,
! 3055: test for a regex that was compiled on a host of opposite endianness. If this is
! 3056: the case, flipped values are put in internal_re and internal_study if there was
! 3057: study data too. */
! 3058:
! 3059: if (re->magic_number != MAGIC_NUMBER)
! 3060: {
! 3061: re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
! 3062: if (re == NULL) return PCRE_ERROR_BADMAGIC;
! 3063: if (study != NULL) study = &internal_study;
! 3064: }
! 3065:
! 3066: /* Set some local values */
! 3067:
! 3068: current_subject = (const unsigned char *)subject + start_offset;
! 3069: end_subject = (const unsigned char *)subject + length;
! 3070: req_byte_ptr = current_subject - 1; /* before the start, so the first required-byte search always runs */
! 3071:
! 3072: #ifdef SUPPORT_UTF8
! 3073: utf8 = (re->options & PCRE_UTF8) != 0;
! 3074: #else
! 3075: utf8 = FALSE;
! 3076: #endif
! 3077:
! 3078: anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 || /* a restart counts as anchored */
! 3079: (re->options & PCRE_ANCHORED) != 0;
! 3080:
! 3081: /* The remaining fixed data for passing around. Note that start_code is computed
! 3082: from the caller's original block (argument_re), using the name table offset and size read via re, which may be the flipped copy. */
! 3083: md->start_code = (const uschar *)argument_re +
! 3084: re->name_table_offset + re->name_count * re->name_entry_size;
! 3085: md->start_subject = (const unsigned char *)subject;
! 3086: md->end_subject = end_subject;
! 3087: md->start_offset = start_offset;
! 3088: md->moptions = options;
! 3089: md->poptions = re->options;
! 3090:
! 3091: /* If the BSR option is not set at match time, copy what was set
! 3092: at compile time. If neither set it, fall back to the build-time default. */
! 3093:
! 3094: if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
! 3095: {
! 3096: if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
! 3097: md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
! 3098: #ifdef BSR_ANYCRLF
! 3099: else md->moptions |= PCRE_BSR_ANYCRLF;
! 3100: #endif
! 3101: }
! 3102:
! 3103: /* Handle different types of newline. The three bits give eight cases. If
! 3104: nothing is set at run time, whatever was used at compile time applies. */
! 3105:
! 3106: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
! 3107: PCRE_NEWLINE_BITS)
! 3108: {
! 3109: case 0: newline = NEWLINE; break; /* Compile-time default */
! 3110: case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
! 3111: case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
! 3112: case PCRE_NEWLINE_CR+
! 3113: PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break; /* two bytes packed into one int */
! 3114: case PCRE_NEWLINE_ANY: newline = -1; break; /* local marker, decoded below */
! 3115: case PCRE_NEWLINE_ANYCRLF: newline = -2; break; /* local marker, decoded below */
! 3116: default: return PCRE_ERROR_BADNEWLINE;
! 3117: }
! 3118:
! 3119: if (newline == -2)
! 3120: {
! 3121: md->nltype = NLTYPE_ANYCRLF;
! 3122: }
! 3123: else if (newline < 0)
! 3124: {
! 3125: md->nltype = NLTYPE_ANY;
! 3126: }
! 3127: else
! 3128: {
! 3129: md->nltype = NLTYPE_FIXED;
! 3130: if (newline > 255) /* a packed two-byte newline */
! 3131: {
! 3132: md->nllen = 2;
! 3133: md->nl[0] = (newline >> 8) & 255;
! 3134: md->nl[1] = newline & 255;
! 3135: }
! 3136: else
! 3137: {
! 3138: md->nllen = 1;
! 3139: md->nl[0] = newline;
! 3140: }
! 3141: }
! 3142:
! 3143: /* Check a UTF-8 string if required. If the check fails and the offsets vector has
! 3144: at least two slots, the error offset and the error code are passed back in them. */
! 3145:
! 3146: #ifdef SUPPORT_UTF8
! 3147: if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
! 3148: {
! 3149: int erroroffset;
! 3150: int errorcode = _pcre_valid_utf8((uschar *)subject, length, &erroroffset);
! 3151: if (errorcode != 0)
! 3152: {
! 3153: if (offsetcount >= 2)
! 3154: {
! 3155: offsets[0] = erroroffset;
! 3156: offsets[1] = errorcode;
! 3157: }
! 3158: return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
! 3159: PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8; /* SHORTUTF8 only with PCRE_PARTIAL_HARD */
! 3160: }
! 3161: if (start_offset > 0 && start_offset < length &&
! 3162: (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
! 3163: return PCRE_ERROR_BADUTF8_OFFSET; /* start_offset points at a 10xxxxxx byte */
! 3164: }
! 3165: #endif
! 3166:
! 3167: /* If the exec call supplied NULL for tables, use the inbuilt ones. This
! 3168: is a feature that makes it possible to save compiled regex and re-use them
! 3169: in other programs later. */
! 3170:
! 3171: if (md->tables == NULL) md->tables = _pcre_default_tables;
! 3172:
! 3173: /* The lower casing table and the "must be at the start of a line" flag are
! 3174: used in a loop when finding where to start. */
! 3175:
! 3176: lcc = md->tables + lcc_offset;
! 3177: startline = (re->flags & PCRE_STARTLINE) != 0;
! 3178: firstline = (re->options & PCRE_FIRSTLINE) != 0;
! 3179:
! 3180: /* Set up the first character to match, if available. The first_byte value is
! 3181: never set for an anchored regular expression, but the anchoring may be forced
! 3182: at run time, so we have to test for anchoring. The first char may be unset for
! 3183: an unanchored pattern, of course. If there's no first char and the pattern was
! 3184: studied, there may be a bitmap of possible first characters. */
! 3185:
! 3186: if (!anchored)
! 3187: {
! 3188: if ((re->flags & PCRE_FIRSTSET) != 0)
! 3189: {
! 3190: first_byte = re->first_byte & 255;
! 3191: if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
! 3192: first_byte = lcc[first_byte]; /* compare in lower case when caseless */
! 3193: }
! 3194: else
! 3195: {
! 3196: if (!startline && study != NULL &&
! 3197: (study->flags & PCRE_STUDY_MAPPED) != 0)
! 3198: start_bits = study->start_bits;
! 3199: }
! 3200: }
! 3201:
! 3202: /* For anchored or unanchored matches, there may be a "last known required
! 3203: character" set. */
! 3204:
! 3205: if ((re->flags & PCRE_REQCHSET) != 0)
! 3206: {
! 3207: req_byte = re->req_byte & 255;
! 3208: req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
! 3209: req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
! 3210: }
! 3211:
! 3212: /* Call the main matching function, looping for a non-anchored regex after a
! 3213: failed match. If not restarting, perform certain optimizations at the start of
! 3214: a match. */
! 3215:
! 3216: for (;;)
! 3217: {
! 3218: int rc;
! 3219:
! 3220: if ((options & PCRE_DFA_RESTART) == 0)
! 3221: {
! 3222: const uschar *save_end_subject = end_subject; /* restored after the start-point scan */
! 3223:
! 3224: /* If firstline is TRUE, the start of the match is constrained to the first
! 3225: line of a multiline string. Implement this by temporarily adjusting
! 3226: end_subject so that we stop scanning at a newline. If the match fails at
! 3227: the newline, later code breaks this loop. */
! 3228:
! 3229: if (firstline)
! 3230: {
! 3231: USPTR t = current_subject;
! 3232: #ifdef SUPPORT_UTF8
! 3233: if (utf8)
! 3234: {
! 3235: while (t < md->end_subject && !IS_NEWLINE(t))
! 3236: {
! 3237: t++;
! 3238: while (t < end_subject && (*t & 0xc0) == 0x80) t++; /* skip 10xxxxxx bytes */
! 3239: }
! 3240: }
! 3241: else
! 3242: #endif
! 3243: while (t < md->end_subject && !IS_NEWLINE(t)) t++;
! 3244: end_subject = t;
! 3245: }
! 3246:
! 3247: /* There are some optimizations that avoid running the match if a known
! 3248: starting point is not found. However, there is an option that disables
! 3249: these, for testing and for ensuring that all callouts do actually occur.
! 3250: The option can be set in the regex by (*NO_START_OPT) or passed in
! 3251: match-time options. */
! 3252:
! 3253: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
! 3254: {
! 3255: /* Advance to a known first byte. */
! 3256:
! 3257: if (first_byte >= 0)
! 3258: {
! 3259: if (first_byte_caseless)
! 3260: while (current_subject < end_subject &&
! 3261: lcc[*current_subject] != first_byte)
! 3262: current_subject++;
! 3263: else
! 3264: while (current_subject < end_subject &&
! 3265: *current_subject != first_byte)
! 3266: current_subject++;
! 3267: }
! 3268:
! 3269: /* Or to just after a linebreak for a multiline match if possible */
! 3270:
! 3271: else if (startline)
! 3272: {
! 3273: if (current_subject > md->start_subject + start_offset) /* not at the original start position */
! 3274: {
! 3275: #ifdef SUPPORT_UTF8
! 3276: if (utf8)
! 3277: {
! 3278: while (current_subject < end_subject &&
! 3279: !WAS_NEWLINE(current_subject))
! 3280: {
! 3281: current_subject++;
! 3282: while(current_subject < end_subject &&
! 3283: (*current_subject & 0xc0) == 0x80)
! 3284: current_subject++;
! 3285: }
! 3286: }
! 3287: else
! 3288: #endif
! 3289: while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
! 3290: current_subject++;
! 3291:
! 3292: /* If we have just passed a CR and the newline option is ANY or
! 3293: ANYCRLF, and we are now at a LF, advance the match position by one
! 3294: more character. */
! 3295:
! 3296: if (current_subject[-1] == CHAR_CR &&
! 3297: (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
! 3298: current_subject < end_subject &&
! 3299: *current_subject == CHAR_NL)
! 3300: current_subject++;
! 3301: }
! 3302: }
! 3303:
! 3304: /* Or to a non-unique first char after study */
! 3305:
! 3306: else if (start_bits != NULL)
! 3307: {
! 3308: while (current_subject < end_subject)
! 3309: {
! 3310: register unsigned int c = *current_subject;
! 3311: if ((start_bits[c/8] & (1 << (c&7))) == 0) /* bit for byte c is clear: cannot start here */
! 3312: {
! 3313: current_subject++;
! 3314: #ifdef SUPPORT_UTF8
! 3315: if (utf8)
! 3316: while(current_subject < end_subject &&
! 3317: (*current_subject & 0xc0) == 0x80) current_subject++;
! 3318: #endif
! 3319: }
! 3320: else break;
! 3321: }
! 3322: }
! 3323: }
! 3324:
! 3325: /* Restore fudged end_subject */
! 3326:
! 3327: end_subject = save_end_subject;
! 3328:
! 3329: /* The following two optimizations are disabled for partial matching or if
! 3330: disabling is explicitly requested (and of course, by the test above, this
! 3331: code is not obeyed when restarting after a partial match). */
! 3332:
! 3333: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
! 3334: (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
! 3335: {
! 3336: /* If the pattern was studied, a minimum subject length may be set. This
! 3337: is a lower bound; no actual string of that length may actually match the
! 3338: pattern. Although the value is, strictly, in characters, we treat it as
! 3339: bytes to avoid spending too much time in this optimization. */
! 3340:
! 3341: if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
! 3342: (pcre_uint32)(end_subject - current_subject) < study->minlength)
! 3343: return PCRE_ERROR_NOMATCH;
! 3344:
! 3345: /* If req_byte is set, we know that that character must appear in the
! 3346: subject for the match to succeed. If the first character is set, req_byte
! 3347: must be later in the subject; otherwise the test starts at the match
! 3348: point. This optimization can save a huge amount of work in patterns with
! 3349: nested unlimited repeats that aren't going to match. Writing separate
! 3350: code for cased/caseless versions makes it go faster, as does using an
! 3351: autoincrement and backing off on a match.
! 3352:
! 3353: HOWEVER: when the subject string is very, very long, searching to its end
! 3354: can take a long time, and give bad performance on quite ordinary
! 3355: patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
! 3356: string... so we don't do this when the string is sufficiently long. */
! 3357:
! 3358: if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
! 3359: {
! 3360: register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
! 3361:
! 3362: /* We don't need to repeat the search if we haven't yet reached the
! 3363: place we found it at last time. */
! 3364:
! 3365: if (p > req_byte_ptr)
! 3366: {
! 3367: if (req_byte_caseless)
! 3368: {
! 3369: while (p < end_subject)
! 3370: {
! 3371: register int pp = *p++;
! 3372: if (pp == req_byte || pp == req_byte2) { p--; break; } /* back off to the matching byte */
! 3373: }
! 3374: }
! 3375: else
! 3376: {
! 3377: while (p < end_subject)
! 3378: {
! 3379: if (*p++ == req_byte) { p--; break; } /* back off to the matching byte */
! 3380: }
! 3381: }
! 3382:
! 3383: /* If we can't find the required character, break the matching loop,
! 3384: which will cause a return of PCRE_ERROR_NOMATCH. */
! 3385:
! 3386: if (p >= end_subject) break;
! 3387:
! 3388: /* If we have found the required character, save the point where we
! 3389: found it, so that we don't search again next time round the loop if
! 3390: the start hasn't passed this character yet. */
! 3391:
! 3392: req_byte_ptr = p;
! 3393: }
! 3394: }
! 3395: }
! 3396: } /* End of optimizations that are done when not restarting */
! 3397:
! 3398: /* OK, now we can do the business */
! 3399:
! 3400: md->start_used_ptr = current_subject;
! 3401: md->recursive = NULL;
! 3402:
! 3403: rc = internal_dfa_exec(
! 3404: md, /* fixed match data */
! 3405: md->start_code, /* this subexpression's code */
! 3406: current_subject, /* where we currently are */
! 3407: start_offset, /* start offset in subject */
! 3408: offsets, /* offset vector */
! 3409: offsetcount, /* size of same */
! 3410: workspace, /* workspace vector */
! 3411: wscount, /* size of same */
! 3412: 0); /* function recurse level */
! 3413:
! 3414: /* Anything other than "no match" means we are done, always; otherwise, carry
! 3415: on only if not anchored. */
! 3416:
! 3417: if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
! 3418:
! 3419: /* Advance to the next subject character unless we are at the end of a line
! 3420: and firstline is set. */
! 3421:
! 3422: if (firstline && IS_NEWLINE(current_subject)) break;
! 3423: current_subject++;
! 3424: if (utf8)
! 3425: {
! 3426: while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
! 3427: current_subject++;
! 3428: }
! 3429: if (current_subject > end_subject) break; /* strictly greater: a start at end_subject is still tried */
! 3430:
! 3431: /* If we have just passed a CR and we are now at a LF, and the pattern does
! 3432: not contain any explicit matches for \r or \n, and the newline option is CRLF
! 3433: or ANY or ANYCRLF, advance the match position by one more character. */
! 3434:
! 3435: if (current_subject[-1] == CHAR_CR &&
! 3436: current_subject < end_subject &&
! 3437: *current_subject == CHAR_NL &&
! 3438: (re->flags & PCRE_HASCRORLF) == 0 &&
! 3439: (md->nltype == NLTYPE_ANY ||
! 3440: md->nltype == NLTYPE_ANYCRLF ||
! 3441: md->nllen == 2))
! 3442: current_subject++;
! 3443:
! 3444: } /* "Bumpalong" loop */
! 3445:
! 3446: return PCRE_ERROR_NOMATCH; /* reached only by a break out of the loop above */
! 3447: }
! 3448:
! 3449: /* End of pcre_dfa_exec.c */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>