File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / pcre / pcre_dfa_exec.c
Revision 1.1: download - view: text, annotated - select for diffs - revision graph
Tue Feb 21 23:05:51 2012 UTC (12 years, 7 months ago) by misho
CVS tags: MAIN, HEAD
Initial revision

    1: /*************************************************
    2: *      Perl-Compatible Regular Expressions       *
    3: *************************************************/
    4: 
    5: /* PCRE is a library of functions to support regular expressions whose syntax
    6: and semantics are as close as possible to those of the Perl 5 language (but see
    7: below for why this module is different).
    8: 
    9:                        Written by Philip Hazel
   10:            Copyright (c) 1997-2011 University of Cambridge
   11: 
   12: -----------------------------------------------------------------------------
   13: Redistribution and use in source and binary forms, with or without
   14: modification, are permitted provided that the following conditions are met:
   15: 
   16:     * Redistributions of source code must retain the above copyright notice,
   17:       this list of conditions and the following disclaimer.
   18: 
   19:     * Redistributions in binary form must reproduce the above copyright
   20:       notice, this list of conditions and the following disclaimer in the
   21:       documentation and/or other materials provided with the distribution.
   22: 
   23:     * Neither the name of the University of Cambridge nor the names of its
   24:       contributors may be used to endorse or promote products derived from
   25:       this software without specific prior written permission.
   26: 
   27: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   28: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   29: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   30: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
   31: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   32: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   33: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   34: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   35: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   36: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   37: POSSIBILITY OF SUCH DAMAGE.
   38: -----------------------------------------------------------------------------
   39: */
   40: 
   41: 
   42: /* This module contains the external function pcre_dfa_exec(), which is an
   43: alternative matching function that uses a sort of DFA algorithm (not a true
   44: FSM). This is NOT Perl-compatible, but it has advantages in certain
   45: applications. */
   46: 
   47: 
   48: /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
   49: the performance of his patterns greatly. I could not use it as it stood, as it
   50: was not thread safe, and made assumptions about pattern sizes. Also, it caused
   51: test 7 to loop, and test 9 to crash with a segfault.
   52: 
   53: The issue is the check for duplicate states, which is done by a simple linear
   54: search up the state list. (Grep for "duplicate" below to find the code.) For
   55: many patterns, there will never be many states active at one time, so a simple
   56: linear search is fine. In patterns that have many active states, it might be a
   57: bottleneck. The suggested code used an indexing scheme to remember which states
   58: had previously been used for each character, and avoided the linear search when
   59: it knew there was no chance of a duplicate. This was implemented when adding
   60: states to the state lists.
   61: 
   62: I wrote some thread-safe, not-limited code to try something similar at the time
   63: of checking for duplicates (instead of when adding states), using index vectors
   64: on the stack. It did give a 13% improvement with one specially constructed
   65: pattern for certain subject strings, but on other strings and on many of the
   66: simpler patterns in the test suite it did worse. The major problem, I think,
   67: was the extra time to initialize the index. This had to be done for each call
   68: of internal_dfa_exec(). (The supplied patch used a static vector, initialized
   69: only once - I suspect this was the cause of the problems with the tests.)
   70: 
   71: Overall, I concluded that the gains in some cases did not outweigh the losses
   72: in others, so I abandoned this code. */
   73: 
   74: 
   75: 
   76: #ifdef HAVE_CONFIG_H
   77: #include "config.h"
   78: #endif
   79: 
   80: #define NLBLOCK md             /* Block containing newline information */
   81: #define PSSTART start_subject  /* Field containing processed string start */
   82: #define PSEND   end_subject    /* Field containing processed string end */
   83: 
   84: #include "pcre_internal.h"
   85: 
   86: 
   87: /* For use to indent debugging output */
   88: 
   89: #define SP "                   "  /* Indent source: printed with "%.*s" and a precision of rlevel*2-2 by the DPRINTF calls below */
   90: 
   91: 
   92: /*************************************************
   93: *      Code parameters and static tables         *
   94: *************************************************/
   95: 
   96: /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
   97: into others, under special conditions. A gap of 20 between the blocks should be
   98: enough. The resulting opcodes don't have to be less than 256 because they are
   99: never stored, so we push them well clear of the normal opcodes. */
  100: 
  101: #define OP_PROP_EXTRA       300  /* NOTE(review): by name, the \P and \p case - use site not in view, confirm */
  102: #define OP_EXTUNI_EXTRA     320  /* NOTE(review): by name, the \X case - confirm */
  103: #define OP_ANYNL_EXTRA      340  /* NOTE(review): by name, the \R case - confirm */
  104: #define OP_HSPACE_EXTRA     360  /* NOTE(review): by name, the \H and \h case - confirm */
  105: #define OP_VSPACE_EXTRA     380  /* NOTE(review): by name, the \V and \v case - confirm */
  106: 
  107: 
  108: /* This table identifies those opcodes that are followed immediately by a
  109: character that is to be tested in some way. This makes it possible to
  110: centralize the loading of these characters. In the case of Type * etc, the
  111: "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
  112: small value. Non-zero values in the table are the offsets from the opcode where
  113: the character is to be found. ***NOTE*** If the start of this table is
  114: modified, the three tables that follow must also be modified. */
  115: 
  116: static const uschar coptable[] = {  /* Indexed by opcode: 0 = no inline character, otherwise its offset from the opcode */
  117:   0,                             /* End                                    */
  118:   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
  119:   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
  120:   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
  121:   0, 0,                          /* \P, \p                                 */
  122:   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
  123:   0,                             /* \X                                     */
  124:   0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
  125:   1,                             /* Char                                   */
  126:   1,                             /* Chari                                  */
  127:   1,                             /* not                                    */
  128:   1,                             /* noti                                   */
  129:   /* Positive single-char repeats: character at +1, or at +3 for the       */
  130:   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  131:   3, 3, 3,                       /* upto, minupto, exact (offset 3; presumably skips a repeat count - confirm) */
  132:   1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */
  133:   1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
  134:   3, 3, 3,                       /* upto I, minupto I, exact I             */
  135:   1, 1, 1, 3,                    /* *+I, ++I, ?+I, upto+I                  */
  136:   /* Negative single-char repeats - only for chars < 256                   */
  137:   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  138:   3, 3, 3,                       /* NOT upto, minupto, exact               */
  139:   1, 1, 1, 3,                    /* NOT *+, ++, ?+, upto+                  */
  140:   1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
  141:   3, 3, 3,                       /* NOT upto I, minupto I, exact I         */
  142:   1, 1, 1, 3,                    /* NOT *+I, ++I, ?+I, upto+I              */
  143:   /* Positive type repeats: the "character" is the type opcode (\d etc.)   */
  144:   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  145:   3, 3, 3,                       /* Type upto, minupto, exact              */
  146:   1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */
  147:   /* Character class & ref repeats: no inline character from here on (0)   */
  148:   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
  149:   0, 0,                          /* CRRANGE, CRMINRANGE                    */
  150:   0,                             /* CLASS                                  */
  151:   0,                             /* NCLASS                                 */
  152:   0,                             /* XCLASS - variable length               */
  153:   0,                             /* REF                                    */
  154:   0,                             /* REFI                                   */
  155:   0,                             /* RECURSE                                */
  156:   0,                             /* CALLOUT                                */
  157:   0,                             /* Alt                                    */
  158:   0,                             /* Ket                                    */
  159:   0,                             /* KetRmax                                */
  160:   0,                             /* KetRmin                                */
  161:   0,                             /* KetRpos                                */
  162:   0,                             /* Reverse                                */
  163:   0,                             /* Assert                                 */
  164:   0,                             /* Assert not                             */
  165:   0,                             /* Assert behind                          */
  166:   0,                             /* Assert behind not                      */
  167:   0, 0,                          /* ONCE, ONCE_NC                          */
  168:   0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  169:   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  170:   0, 0,                          /* CREF, NCREF                            */
  171:   0, 0,                          /* RREF, NRREF                            */
  172:   0,                             /* DEF                                    */
  173:   0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
  174:   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
  175:   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
  176:   0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
  177:   0, 0                           /* CLOSE, SKIPZERO                        */
  178: };
  179: 
  180: /* This table identifies those opcodes that inspect a character. It is used to
  181: remember the fact that a character could have been inspected when the end of
  182: the subject is reached. ***NOTE*** If the start of this table is modified, the
  183: two tables that follow must also be modified. */
  184: 
  185: static const uschar poptable[] = {  /* Indexed by opcode: non-zero = the opcode inspects a subject character */
  186:   0,                             /* End                                    */
  187:   0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b (only \B, \b are 1) */
  188:   1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
  189:   1, 1, 1,                       /* Any, AllAny, Anybyte                   */
  190:   1, 1,                          /* \P, \p                                 */
  191:   1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
  192:   1,                             /* \X                                     */
  193:   0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
  194:   1,                             /* Char                                   */
  195:   1,                             /* Chari                                  */
  196:   1,                             /* not                                    */
  197:   1,                             /* noti                                   */
  198:   /* Positive single-char repeats                                          */
  199:   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  200:   1, 1, 1,                       /* upto, minupto, exact                   */
  201:   1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
  202:   1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
  203:   1, 1, 1,                       /* upto I, minupto I, exact I             */
  204:   1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
  205:   /* Negative single-char repeats - only for chars < 256                   */
  206:   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  207:   1, 1, 1,                       /* NOT upto, minupto, exact               */
  208:   1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
  209:   1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
  210:   1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
  211:   1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
  212:   /* Positive type repeats                                                 */
  213:   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  214:   1, 1, 1,                       /* Type upto, minupto, exact              */
  215:   1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
  216:   /* Character class & ref repeats                                         */
  217:   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  218:   1, 1,                          /* CRRANGE, CRMINRANGE                    */
  219:   1,                             /* CLASS                                  */
  220:   1,                             /* NCLASS                                 */
  221:   1,                             /* XCLASS - variable length               */
  222:   0,                             /* REF (0 from here to the end of table)  */
  223:   0,                             /* REFI                                   */
  224:   0,                             /* RECURSE                                */
  225:   0,                             /* CALLOUT                                */
  226:   0,                             /* Alt                                    */
  227:   0,                             /* Ket                                    */
  228:   0,                             /* KetRmax                                */
  229:   0,                             /* KetRmin                                */
  230:   0,                             /* KetRpos                                */
  231:   0,                             /* Reverse                                */
  232:   0,                             /* Assert                                 */
  233:   0,                             /* Assert not                             */
  234:   0,                             /* Assert behind                          */
  235:   0,                             /* Assert behind not                      */
  236:   0, 0,                          /* ONCE, ONCE_NC                          */
  237:   0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  238:   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  239:   0, 0,                          /* CREF, NCREF                            */
  240:   0, 0,                          /* RREF, NRREF                            */
  241:   0,                             /* DEF                                    */
  242:   0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
  243:   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
  244:   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
  245:   0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
  246:   0, 0                           /* CLOSE, SKIPZERO                        */
  247: };
  248: 
  249: /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
  250: and \w */
  251: 
  252: static const uschar toptable1[] = {  /* NOTE(review): presumably indexed by opcode, End through AllAny - use site not in view, confirm */
  253:   0, 0, 0, 0, 0, 0,               /* presumably End, \A, \G, \K, \B, \b (same order as coptable) */
  254:   ctype_digit, ctype_digit,       /* presumably \D, \d - same ctype bit for both */
  255:   ctype_space, ctype_space,       /* presumably \S, \s */
  256:   ctype_word,  ctype_word,        /* presumably \W, \w */
  257:   0, 0                            /* OP_ANY, OP_ALLANY */
  258: };
  259: 
  260: static const uschar toptable2[] = {  /* NOTE(review): companion to toptable1, presumably indexed the same way - confirm at use site */
  261:   0, 0, 0, 0, 0, 0,               /* presumably End, \A, \G, \K, \B, \b (same order as coptable) */
  262:   ctype_digit, 0,                 /* presumably \D, \d - bit set only in the first (negated) slot */
  263:   ctype_space, 0,                 /* presumably \S, \s */
  264:   ctype_word,  0,                 /* presumably \W, \w */
  265:   1, 1                            /* OP_ANY, OP_ALLANY */
  266: };
  267: 
  268: 
  269: /* Structure for holding data about a particular state, which is in effect the
  270: current data for an active path through the match tree. It must consist
  271: entirely of ints because the working vector we are passed, and which we put
  272: these structures in, is a vector of ints. */
  273: 
  274: typedef struct stateblock {       /* One entry in the active or new state list; overlaid on the int workspace */
  275:   int offset;                     /* Offset to opcode; a negative value is a held-off state (negated offset) */
  276:   int count;                      /* Count for repeats */
  277:   int data;                       /* Some use extra data, e.g. characters still to skip while offset is negative */
  278: } stateblock;
  279: 
  280: #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))  /* Workspace ints taken by one state; used to turn wscount into a state capacity */
  281: 
  282: 
  283: #ifdef PCRE_DEBUG
  284: /*************************************************
  285: *             Print character string             *
  286: *************************************************/
  287: 
  288: /* Character string printing function for debugging.
  289: 
  290: Arguments:
  291:   p            points to string
  292:   length       number of bytes
  293:   f            where to print
  294: 
  295: Returns:       nothing
  296: */
  297: 
  298: static void                       /* Debug helper: only compiled when PCRE_DEBUG is defined */
  299: pchars(unsigned char *p, int length, FILE *f)  /* Writes length bytes starting at p to f */
  300: {
  301: int c;                            /* Current byte, held as int for isprint() and fprintf() */
  302: while (length-- > 0)              /* A zero or negative length prints nothing */
  303:   {
  304:   if (isprint(c = *(p++)))        /* Fetch the byte and advance, then classify it */
  305:     fprintf(f, "%c", c);          /* isprint() true: emit the byte unchanged */
  306:   else
  307:     fprintf(f, "\\x%02x", c);     /* Otherwise emit a backslash, 'x' and two lowercase hex digits */
  308:   }
  309: }
  310: #endif
  311: 
  312: 
  313: 
  314: /*************************************************
  315: *    Execute a Regular Expression - DFA engine   *
  316: *************************************************/
  317: 
  318: /* This internal function applies a compiled pattern to a subject string,
  319: starting at a given point, using a DFA engine. This function is called from the
  320: external one, possibly multiple times if the pattern is not anchored. The
  321: function calls itself recursively for some kinds of subpattern.
  322: 
  323: Arguments:
  324:   md                the match_data block with fixed information
  325:   this_start_code   the opening bracket of this subexpression's code
  326:   current_subject   where we currently are in the subject string
  327:   start_offset      start offset in the subject string
  328:   offsets           vector to contain the matching string offsets
  329:   offsetcount       size of same
  330:   workspace         vector of workspace
  331:   wscount           size of same
  332:   rlevel            function call recursion level
  333: 
  334: Returns:            > 0 => number of match offset pairs placed in offsets
  335:                     = 0 => offsets overflowed; longest matches are present
  336:                      -1 => failed to match
  337:                    < -1 => some kind of unexpected problem
  338: 
  339: The following macros are used for adding states to the two state vectors (one
  340: for the current character, one for the following character). */
  341: 
  342: #define ADD_ACTIVE(x,y) /* Append a state (offset x, count y) to the active list; the data field is not written */ \
  343:   if (active_count++ < wscount) /* Uses the caller's locals; the counter is incremented even when the list is full */ \
  344:     { \
  345:     next_active_state->offset = (x); \
  346:     next_active_state->count  = (y); \
  347:     next_active_state++; \
  348:     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); /* x and y appear a second time here */ \
  349:     } \
  350:   else return PCRE_ERROR_DFA_WSSIZE  /* List full: returns from the function that used the macro */
  351: 
  352: #define ADD_ACTIVE_DATA(x,y,z) /* Append a state (offset x, count y, data z) to the active list */ \
  353:   if (active_count++ < wscount) /* Uses the caller's locals; the counter is incremented even when the list is full */ \
  354:     { \
  355:     next_active_state->offset = (x); \
  356:     next_active_state->count  = (y); \
  357:     next_active_state->data   = (z); \
  358:     next_active_state++; \
  359:     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); /* arguments appear a second time here */ \
  360:     } \
  361:   else return PCRE_ERROR_DFA_WSSIZE  /* List full: returns from the function that used the macro */
  362: 
  363: #define ADD_NEW(x,y) /* Append a state (offset x, count y) to the new list, used for the following character; data is not written */ \
  364:   if (new_count++ < wscount) /* Uses the caller's locals; the counter is incremented even when the list is full */ \
  365:     { \
  366:     next_new_state->offset = (x); \
  367:     next_new_state->count  = (y); \
  368:     next_new_state++; \
  369:     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); /* x and y appear a second time here */ \
  370:     } \
  371:   else return PCRE_ERROR_DFA_WSSIZE  /* List full: returns from the function that used the macro */
  372: 
  373: #define ADD_NEW_DATA(x,y,z) /* Append a state (offset x, count y, data z) to the new list, used for the following character */ \
  374:   if (new_count++ < wscount) /* Uses the caller's locals; the counter is incremented even when the list is full */ \
  375:     { \
  376:     next_new_state->offset = (x); \
  377:     next_new_state->count  = (y); \
  378:     next_new_state->data   = (z); \
  379:     next_new_state++; \
  380:     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); /* arguments appear a second time here */ \
  381:     } \
  382:   else return PCRE_ERROR_DFA_WSSIZE  /* List full: returns from the function that used the macro */
  383: 
  384: /* And now, here is the code */
  385: 
  386: static int
  387: internal_dfa_exec(
  388:   dfa_match_data *md,
  389:   const uschar *this_start_code,
  390:   const uschar *current_subject,
  391:   int start_offset,
  392:   int *offsets,
  393:   int offsetcount,
  394:   int *workspace,
  395:   int wscount,
  396:   int  rlevel)
  397: {
  398: stateblock *active_states, *new_states, *temp_states;
  399: stateblock *next_active_state, *next_new_state;
  400: 
  401: const uschar *ctypes, *lcc, *fcc;
  402: const uschar *ptr;
  403: const uschar *end_code, *first_op;
  404: 
  405: dfa_recursion_info new_recursive;
  406: 
  407: int active_count, new_count, match_count;
  408: 
  409: /* Some fields in the md block are frequently referenced, so we load them into
  410: independent variables in the hope that this will perform better. */
  411: 
  412: const uschar *start_subject = md->start_subject;
  413: const uschar *end_subject = md->end_subject;
  414: const uschar *start_code = md->start_code;
  415: 
  416: #ifdef SUPPORT_UTF8
  417: BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
  418: #else
  419: BOOL utf8 = FALSE;
  420: #endif
  421: 
  422: rlevel++;
  423: offsetcount &= (-2);
  424: 
  425: wscount -= 2;
  426: wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
  427:           (2 * INTS_PER_STATEBLOCK);
  428: 
  429: DPRINTF(("\n%.*s---------------------\n"
  430:   "%.*sCall to internal_dfa_exec f=%d\n",
  431:   rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
  432: 
  433: ctypes = md->tables + ctypes_offset;
  434: lcc = md->tables + lcc_offset;
  435: fcc = md->tables + fcc_offset;
  436: 
  437: match_count = PCRE_ERROR_NOMATCH;   /* A negative number */
  438: 
  439: active_states = (stateblock *)(workspace + 2);
  440: next_new_state = new_states = active_states + wscount;
  441: new_count = 0;
  442: 
  443: first_op = this_start_code + 1 + LINK_SIZE +
  444:   ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
  445:     *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)? 2:0);
  446: 
  447: /* The first thing in any (sub) pattern is a bracket of some sort. Push all
  448: the alternative states onto the list, and find out where the end is. This
  449: makes it possible to use this function recursively, when we want to stop at a
  450: matching internal ket rather than at the end.
  451: 
  452: If the first opcode in the first alternative is OP_REVERSE, we are dealing with
  453: a backward assertion. In that case, we have to find out the maximum amount to
  454: move back, and set up each alternative appropriately. */
  455: 
  456: if (*first_op == OP_REVERSE)
  457:   {
  458:   int max_back = 0;
  459:   int gone_back;
  460: 
  461:   end_code = this_start_code;
  462:   do
  463:     {
  464:     int back = GET(end_code, 2+LINK_SIZE);
  465:     if (back > max_back) max_back = back;
  466:     end_code += GET(end_code, 1);
  467:     }
  468:   while (*end_code == OP_ALT);
  469: 
  470:   /* If we can't go back the amount required for the longest lookbehind
  471:   pattern, go back as far as we can; some alternatives may still be viable. */
  472: 
  473: #ifdef SUPPORT_UTF8
  474:   /* In character mode we have to step back character by character */
  475: 
  476:   if (utf8)
  477:     {
  478:     for (gone_back = 0; gone_back < max_back; gone_back++)
  479:       {
  480:       if (current_subject <= start_subject) break;
  481:       current_subject--;
  482:       while (current_subject > start_subject &&
  483:              (*current_subject & 0xc0) == 0x80)
  484:         current_subject--;
  485:       }
  486:     }
  487:   else
  488: #endif
  489: 
  490:   /* In byte-mode we can do this quickly. */
  491: 
  492:     {
  493:     gone_back = (current_subject - max_back < start_subject)?
  494:       (int)(current_subject - start_subject) : max_back;
  495:     current_subject -= gone_back;
  496:     }
  497: 
  498:   /* Save the earliest consulted character */
  499: 
  500:   if (current_subject < md->start_used_ptr)
  501:     md->start_used_ptr = current_subject;
  502: 
  503:   /* Now we can process the individual branches. */
  504: 
  505:   end_code = this_start_code;
  506:   do
  507:     {
  508:     int back = GET(end_code, 2+LINK_SIZE);
  509:     if (back <= gone_back)
  510:       {
  511:       int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
  512:       ADD_NEW_DATA(-bstate, 0, gone_back - back);
  513:       }
  514:     end_code += GET(end_code, 1);
  515:     }
  516:   while (*end_code == OP_ALT);
  517:  }
  518: 
  519: /* This is the code for a "normal" subpattern (not a backward assertion). The
  520: start of a whole pattern is always one of these. If we are at the top level,
  521: we may be asked to restart matching from the same point that we reached for a
  522: previous partial match. We still have to scan through the top-level branches to
  523: find the end state. */
  524: 
  525: else
  526:   {
  527:   end_code = this_start_code;
  528: 
  529:   /* Restarting */
  530: 
  531:   if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
  532:     {
  533:     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
  534:     new_count = workspace[1];
  535:     if (!workspace[0])
  536:       memcpy(new_states, active_states, new_count * sizeof(stateblock));
  537:     }
  538: 
  539:   /* Not restarting */
  540: 
  541:   else
  542:     {
  543:     int length = 1 + LINK_SIZE +
  544:       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
  545:         *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)?
  546:         2:0);
  547:     do
  548:       {
  549:       ADD_NEW((int)(end_code - start_code + length), 0);
  550:       end_code += GET(end_code, 1);
  551:       length = 1 + LINK_SIZE;
  552:       }
  553:     while (*end_code == OP_ALT);
  554:     }
  555:   }
  556: 
  557: workspace[0] = 0;    /* Bit indicating which vector is current */
  558: 
  559: DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
  560: 
  561: /* Loop for scanning the subject */
  562: 
  563: ptr = current_subject;
  564: for (;;)
  565:   {
  566:   int i, j;
  567:   int clen, dlen;
  568:   unsigned int c, d;
  569:   int forced_fail = 0;
  570:   BOOL could_continue = FALSE;
  571: 
  572:   /* Make the new state list into the active state list and empty the
  573:   new state list. */
  574: 
  575:   temp_states = active_states;
  576:   active_states = new_states;
  577:   new_states = temp_states;
  578:   active_count = new_count;
  579:   new_count = 0;
  580: 
  581:   workspace[0] ^= 1;              /* Remember for the restarting feature */
  582:   workspace[1] = active_count;
  583: 
  584: #ifdef PCRE_DEBUG
  585:   printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
  586:   pchars((uschar *)ptr, strlen((char *)ptr), stdout);
  587:   printf("\"\n");
  588: 
  589:   printf("%.*sActive states: ", rlevel*2-2, SP);
  590:   for (i = 0; i < active_count; i++)
  591:     printf("%d/%d ", active_states[i].offset, active_states[i].count);
  592:   printf("\n");
  593: #endif
  594: 
  595:   /* Set the pointers for adding new states */
  596: 
  597:   next_active_state = active_states + active_count;
  598:   next_new_state = new_states;
  599: 
  600:   /* Load the current character from the subject outside the loop, as many
  601:   different states may want to look at it, and we assume that at least one
  602:   will. */
  603: 
  604:   if (ptr < end_subject)
  605:     {
  606:     clen = 1;        /* Number of bytes in the character */
  607: #ifdef SUPPORT_UTF8
  608:     if (utf8) { GETCHARLEN(c, ptr, clen); } else
  609: #endif  /* SUPPORT_UTF8 */
  610:     c = *ptr;
  611:     }
  612:   else
  613:     {
  614:     clen = 0;        /* This indicates the end of the subject */
  615:     c = NOTACHAR;    /* This value should never actually be used */
  616:     }
  617: 
  618:   /* Scan up the active states and act on each one. The result of an action
  619:   may be to add more states to the currently active list (e.g. on hitting a
  620:   parenthesis) or it may be to put states on the new list, for considering
  621:   when we move the character pointer on. */
  622: 
  623:   for (i = 0; i < active_count; i++)
  624:     {
  625:     stateblock *current_state = active_states + i;
  626:     BOOL caseless = FALSE;
  627:     const uschar *code;
  628:     int state_offset = current_state->offset;
  629:     int count, codevalue, rrc;
  630: 
  631: #ifdef PCRE_DEBUG
  632:     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
  633:     if (clen == 0) printf("EOL\n");
  634:       else if (c > 32 && c < 127) printf("'%c'\n", c);
  635:         else printf("0x%02x\n", c);
  636: #endif
  637: 
  638:     /* A negative offset is a special case meaning "hold off going to this
  639:     (negated) state until the number of characters in the data field have
  640:     been skipped". */
  641: 
  642:     if (state_offset < 0)
  643:       {
  644:       if (current_state->data > 0)
  645:         {
  646:         DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
  647:         ADD_NEW_DATA(state_offset, current_state->count,
  648:           current_state->data - 1);
  649:         continue;
  650:         }
  651:       else
  652:         {
  653:         current_state->offset = state_offset = -state_offset;
  654:         }
  655:       }
  656: 
  657:     /* Check for a duplicate state with the same count, and skip if found.
  658:     See the note at the head of this module about the possibility of improving
  659:     performance here. */
  660: 
  661:     for (j = 0; j < i; j++)
  662:       {
  663:       if (active_states[j].offset == state_offset &&
  664:           active_states[j].count == current_state->count)
  665:         {
  666:         DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
  667:         goto NEXT_ACTIVE_STATE;
  668:         }
  669:       }
  670: 
  671:     /* The state offset is the offset to the opcode */
  672: 
  673:     code = start_code + state_offset;
  674:     codevalue = *code;
  675: 
  676:     /* If this opcode inspects a character, but we are at the end of the
  677:     subject, remember the fact for use when testing for a partial match. */
  678: 
  679:     if (clen == 0 && poptable[codevalue] != 0)
  680:       could_continue = TRUE;
  681: 
  682:     /* If this opcode is followed by an inline character, load it. It is
  683:     tempting to test for the presence of a subject character here, but that
  684:     is wrong, because sometimes zero repetitions of the subject are
  685:     permitted.
  686: 
  687:     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
  688:     argument that is not a data character - but is always one byte long. We
  689:     have to take special action to deal with \P, \p, \R, \H, \h, \V, \v and \X in
  690:     this case. To keep the other cases fast, convert these ones to new opcodes.
  691:     */
  692: 
  693:     if (coptable[codevalue] > 0)
  694:       {
  695:       dlen = 1;
  696: #ifdef SUPPORT_UTF8
  697:       if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
  698: #endif  /* SUPPORT_UTF8 */
  699:       d = code[coptable[codevalue]];
  700:       if (codevalue >= OP_TYPESTAR)
  701:         {
  702:         switch(d)
  703:           {
  704:           case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
  705:           case OP_NOTPROP:
  706:           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
  707:           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
  708:           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
  709:           case OP_NOT_HSPACE:
  710:           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
  711:           case OP_NOT_VSPACE:
  712:           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
  713:           default: break;
  714:           }
  715:         }
  716:       }
  717:     else
  718:       {
  719:       dlen = 0;         /* Not strictly necessary, but compilers moan */
  720:       d = NOTACHAR;     /* if these variables are not set. */
  721:       }
  722: 
  723: 
  724:     /* Now process the individual opcodes */
  725: 
  726:     switch (codevalue)
  727:       {
  728: /* ========================================================================== */
  729:       /* These cases are never obeyed. This is a fudge that causes a compile-
  730:       time error if the vectors coptable or poptable, which are indexed by
  731:       opcode, are not the correct length. It seems to be the only way to do
  732:       such a check at compile time, as the sizeof() operator does not work
  733:       in the C preprocessor. */
  734: 
  735:       case OP_TABLE_LENGTH:
  736:       case OP_TABLE_LENGTH +
  737:         ((sizeof(coptable) == OP_TABLE_LENGTH) &&
  738:          (sizeof(poptable) == OP_TABLE_LENGTH)):
  739:       break;
  740: 
  741: /* ========================================================================== */
  742:       /* Reached a closing bracket. If not at the end of the pattern, carry
  743:       on with the next opcode. For repeating opcodes, also add the repeat
  744:       state. Note that KETRPOS will always be encountered at the end of the
  745:       subpattern, because the possessive subpattern repeats are always handled
  746:       using recursive calls. Thus, it never adds any new states.
  747: 
  748:       At the end of the (sub)pattern, unless we have an empty string and
  749:       PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
  750:       start of the subject, save the match data, shifting up all previous
  751:       matches so we always have the longest first. */
  752: 
  753:       case OP_KET:
  754:       case OP_KETRMIN:
  755:       case OP_KETRMAX:
  756:       case OP_KETRPOS:
  757:       if (code != end_code)
  758:         {
  759:         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
  760:         if (codevalue != OP_KET)
  761:           {
  762:           ADD_ACTIVE(state_offset - GET(code, 1), 0);
  763:           }
  764:         }
  765:       else
  766:         {
  767:         if (ptr > current_subject ||
  768:             ((md->moptions & PCRE_NOTEMPTY) == 0 &&
  769:               ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
  770:                 current_subject > start_subject + md->start_offset)))
  771:           {
  772:           if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
  773:             else if (match_count > 0 && ++match_count * 2 > offsetcount)
  774:               match_count = 0;
  775:           count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
  776:           if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
  777:           if (offsetcount >= 2)
  778:             {
  779:             offsets[0] = (int)(current_subject - start_subject);
  780:             offsets[1] = (int)(ptr - start_subject);
  781:             DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
  782:               offsets[1] - offsets[0], current_subject));
  783:             }
  784:           if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
  785:             {
  786:             DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
  787:               "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
  788:               match_count, rlevel*2-2, SP));
  789:             return match_count;
  790:             }
  791:           }
  792:         }
  793:       break;
  794: 
  795: /* ========================================================================== */
  796:       /* These opcodes add to the current list of states without looking
  797:       at the current character. */
  798: 
  799:       /*-----------------------------------------------------------------*/
  800:       case OP_ALT:
  801:       do { code += GET(code, 1); } while (*code == OP_ALT);
  802:       ADD_ACTIVE((int)(code - start_code), 0);
  803:       break;
  804: 
  805:       /*-----------------------------------------------------------------*/
  806:       case OP_BRA:
  807:       case OP_SBRA:
  808:       do
  809:         {
  810:         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
  811:         code += GET(code, 1);
  812:         }
  813:       while (*code == OP_ALT);
  814:       break;
  815: 
  816:       /*-----------------------------------------------------------------*/
  817:       case OP_CBRA:
  818:       case OP_SCBRA:
  819:       ADD_ACTIVE((int)(code - start_code + 3 + LINK_SIZE),  0);
  820:       code += GET(code, 1);
  821:       while (*code == OP_ALT)
  822:         {
  823:         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
  824:         code += GET(code, 1);
  825:         }
  826:       break;
  827: 
  828:       /*-----------------------------------------------------------------*/
  829:       case OP_BRAZERO:
  830:       case OP_BRAMINZERO:
  831:       ADD_ACTIVE(state_offset + 1, 0);
  832:       code += 1 + GET(code, 2);
  833:       while (*code == OP_ALT) code += GET(code, 1);
  834:       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
  835:       break;
  836: 
  837:       /*-----------------------------------------------------------------*/
  838:       case OP_SKIPZERO:
  839:       code += 1 + GET(code, 2);
  840:       while (*code == OP_ALT) code += GET(code, 1);
  841:       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
  842:       break;
  843: 
  844:       /*-----------------------------------------------------------------*/
  845:       case OP_CIRC:
  846:       if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
  847:         { ADD_ACTIVE(state_offset + 1, 0); }
  848:       break;
  849: 
  850:       /*-----------------------------------------------------------------*/
  851:       case OP_CIRCM:
  852:       if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
  853:           (ptr != end_subject && WAS_NEWLINE(ptr)))
  854:         { ADD_ACTIVE(state_offset + 1, 0); }
  855:       break;
  856: 
  857:       /*-----------------------------------------------------------------*/
  858:       case OP_EOD:
  859:       if (ptr >= end_subject)
  860:         {
  861:         if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
  862:           could_continue = TRUE;
  863:         else { ADD_ACTIVE(state_offset + 1, 0); }
  864:         }
  865:       break;
  866: 
  867:       /*-----------------------------------------------------------------*/
  868:       case OP_SOD:
  869:       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
  870:       break;
  871: 
  872:       /*-----------------------------------------------------------------*/
  873:       case OP_SOM:
  874:       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
  875:       break;
  876: 
  877: 
  878: /* ========================================================================== */
  879:       /* These opcodes inspect the next subject character, and sometimes
  880:       the previous one as well, but do not have an argument. The variable
  881:       clen contains the length of the current character and is zero if we are
  882:       at the end of the subject. */
  883: 
  884:       /*-----------------------------------------------------------------*/
  885:       case OP_ANY:
  886:       if (clen > 0 && !IS_NEWLINE(ptr))
  887:         { ADD_NEW(state_offset + 1, 0); }
  888:       break;
  889: 
  890:       /*-----------------------------------------------------------------*/
  891:       case OP_ALLANY:
  892:       if (clen > 0)
  893:         { ADD_NEW(state_offset + 1, 0); }
  894:       break;
  895: 
  896:       /*-----------------------------------------------------------------*/
  897:       case OP_EODN:
  898:       if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
  899:         could_continue = TRUE;
  900:       else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
  901:         { ADD_ACTIVE(state_offset + 1, 0); }
  902:       break;
  903: 
  904:       /*-----------------------------------------------------------------*/
  905:       case OP_DOLL:
  906:       if ((md->moptions & PCRE_NOTEOL) == 0)
  907:         {
  908:         if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
  909:           could_continue = TRUE;
  910:         else if (clen == 0 ||
  911:             ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
  912:                (ptr == end_subject - md->nllen)
  913:             ))
  914:           { ADD_ACTIVE(state_offset + 1, 0); }
  915:         }
  916:       break;
  917: 
  918:       /*-----------------------------------------------------------------*/
  919:       case OP_DOLLM:
  920:       if ((md->moptions & PCRE_NOTEOL) == 0)
  921:         {
  922:         if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
  923:           could_continue = TRUE;
  924:         else if (clen == 0 ||
  925:             ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
  926:           { ADD_ACTIVE(state_offset + 1, 0); }
  927:         }
  928:       else if (IS_NEWLINE(ptr))
  929:         { ADD_ACTIVE(state_offset + 1, 0); }
  930:       break;
  931: 
  932:       /*-----------------------------------------------------------------*/
  933: 
  934:       case OP_DIGIT:
  935:       case OP_WHITESPACE:
  936:       case OP_WORDCHAR:
  937:       if (clen > 0 && c < 256 &&
  938:             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
  939:         { ADD_NEW(state_offset + 1, 0); }
  940:       break;
  941: 
  942:       /*-----------------------------------------------------------------*/
  943:       case OP_NOT_DIGIT:
  944:       case OP_NOT_WHITESPACE:
  945:       case OP_NOT_WORDCHAR:
  946:       if (clen > 0 && (c >= 256 ||
  947:             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
  948:         { ADD_NEW(state_offset + 1, 0); }
  949:       break;
  950: 
  951:       /*-----------------------------------------------------------------*/
  952:       case OP_WORD_BOUNDARY:
  953:       case OP_NOT_WORD_BOUNDARY:
  954:         {
  955:         int left_word, right_word;
  956: 
  957:         if (ptr > start_subject)
  958:           {
  959:           const uschar *temp = ptr - 1;
  960:           if (temp < md->start_used_ptr) md->start_used_ptr = temp;
  961: #ifdef SUPPORT_UTF8
  962:           if (utf8) BACKCHAR(temp);
  963: #endif
  964:           GETCHARTEST(d, temp);
  965: #ifdef SUPPORT_UCP
  966:           if ((md->poptions & PCRE_UCP) != 0)
  967:             {
  968:             if (d == '_') left_word = TRUE; else
  969:               {
  970:               int cat = UCD_CATEGORY(d);
  971:               left_word = (cat == ucp_L || cat == ucp_N);
  972:               }
  973:             }
  974:           else
  975: #endif
  976:           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
  977:           }
  978:         else left_word = FALSE;
  979: 
  980:         if (clen > 0)
  981:           {
  982: #ifdef SUPPORT_UCP
  983:           if ((md->poptions & PCRE_UCP) != 0)
  984:             {
  985:             if (c == '_') right_word = TRUE; else
  986:               {
  987:               int cat = UCD_CATEGORY(c);
  988:               right_word = (cat == ucp_L || cat == ucp_N);
  989:               }
  990:             }
  991:           else
  992: #endif
  993:           right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
  994:           }
  995:         else right_word = FALSE;
  996: 
  997:         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
  998:           { ADD_ACTIVE(state_offset + 1, 0); }
  999:         }
 1000:       break;
 1001: 
 1002: 
 1003:       /*-----------------------------------------------------------------*/
 1004:       /* Check the next character by Unicode property. We will get here only
 1005:       if the support is in the binary; otherwise a compile-time error occurs.
 1006:       */
 1007: 
 1008: #ifdef SUPPORT_UCP
 1009:       case OP_PROP:
 1010:       case OP_NOTPROP:
 1011:       if (clen > 0)
 1012:         {
 1013:         BOOL OK;
 1014:         const ucd_record * prop = GET_UCD(c);
 1015:         switch(code[1])
 1016:           {
 1017:           case PT_ANY:
 1018:           OK = TRUE;
 1019:           break;
 1020: 
 1021:           case PT_LAMP:
 1022:           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
 1023:                prop->chartype == ucp_Lt;
 1024:           break;
 1025: 
 1026:           case PT_GC:
 1027:           OK = _pcre_ucp_gentype[prop->chartype] == code[2];
 1028:           break;
 1029: 
 1030:           case PT_PC:
 1031:           OK = prop->chartype == code[2];
 1032:           break;
 1033: 
 1034:           case PT_SC:
 1035:           OK = prop->script == code[2];
 1036:           break;
 1037: 
 1038:           /* These are specials for combination cases. */
 1039: 
 1040:           case PT_ALNUM:
 1041:           OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
 1042:                _pcre_ucp_gentype[prop->chartype] == ucp_N;
 1043:           break;
 1044: 
 1045:           case PT_SPACE:    /* Perl space */
 1046:           OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
 1047:                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
 1048:           break;
 1049: 
 1050:           case PT_PXSPACE:  /* POSIX space */
 1051:           OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
 1052:                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
 1053:                c == CHAR_FF || c == CHAR_CR;
 1054:           break;
 1055: 
 1056:           case PT_WORD:
 1057:           OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
 1058:                _pcre_ucp_gentype[prop->chartype] == ucp_N ||
 1059:                c == CHAR_UNDERSCORE;
 1060:           break;
 1061: 
 1062:           /* Should never occur, but keep compilers from grumbling. */
 1063: 
 1064:           default:
 1065:           OK = codevalue != OP_PROP;
 1066:           break;
 1067:           }
 1068: 
 1069:         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
 1070:         }
 1071:       break;
 1072: #endif
 1073: 
 1074: 
 1075: 
 1076: /* ========================================================================== */
 1077:       /* These opcodes likewise inspect the subject character, but have an
 1078:       argument that is not a data character. It is one of these opcodes:
 1079:       OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
 1080:       OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
 1081: 
 1082:       case OP_TYPEPLUS:
 1083:       case OP_TYPEMINPLUS:
 1084:       case OP_TYPEPOSPLUS:
 1085:       count = current_state->count;  /* Already matched */
 1086:       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
 1087:       if (clen > 0)
 1088:         {
 1089:         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 1090:             (c < 256 &&
 1091:               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
 1092:               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 1093:           {
 1094:           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
 1095:             {
 1096:             active_count--;            /* Remove non-match possibility */
 1097:             next_active_state--;
 1098:             }
 1099:           count++;
 1100:           ADD_NEW(state_offset, count);
 1101:           }
 1102:         }
 1103:       break;
 1104: 
 1105:       /*-----------------------------------------------------------------*/
 1106:       case OP_TYPEQUERY:
 1107:       case OP_TYPEMINQUERY:
 1108:       case OP_TYPEPOSQUERY:
 1109:       ADD_ACTIVE(state_offset + 2, 0);
 1110:       if (clen > 0)
 1111:         {
 1112:         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 1113:             (c < 256 &&
 1114:               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
 1115:               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 1116:           {
 1117:           if (codevalue == OP_TYPEPOSQUERY)
 1118:             {
 1119:             active_count--;            /* Remove non-match possibility */
 1120:             next_active_state--;
 1121:             }
 1122:           ADD_NEW(state_offset + 2, 0);
 1123:           }
 1124:         }
 1125:       break;
 1126: 
 1127:       /*-----------------------------------------------------------------*/
 1128:       case OP_TYPESTAR:
 1129:       case OP_TYPEMINSTAR:
 1130:       case OP_TYPEPOSSTAR:
 1131:       ADD_ACTIVE(state_offset + 2, 0);
 1132:       if (clen > 0)
 1133:         {
 1134:         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 1135:             (c < 256 &&
 1136:               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
 1137:               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 1138:           {
 1139:           if (codevalue == OP_TYPEPOSSTAR)
 1140:             {
 1141:             active_count--;            /* Remove non-match possibility */
 1142:             next_active_state--;
 1143:             }
 1144:           ADD_NEW(state_offset, 0);
 1145:           }
 1146:         }
 1147:       break;
 1148: 
 1149:       /*-----------------------------------------------------------------*/
 1150:       case OP_TYPEEXACT:
 1151:       count = current_state->count;  /* Number already matched */
 1152:       if (clen > 0)
 1153:         {
 1154:         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 1155:             (c < 256 &&
 1156:               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
 1157:               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 1158:           {
 1159:           if (++count >= GET2(code, 1))
 1160:             { ADD_NEW(state_offset + 4, 0); }
 1161:           else
 1162:             { ADD_NEW(state_offset, count); }
 1163:           }
 1164:         }
 1165:       break;
 1166: 
 1167:       /*-----------------------------------------------------------------*/
 1168:       case OP_TYPEUPTO:
 1169:       case OP_TYPEMINUPTO:
 1170:       case OP_TYPEPOSUPTO:
 1171:       ADD_ACTIVE(state_offset + 4, 0);
 1172:       count = current_state->count;  /* Number already matched */
 1173:       if (clen > 0)
 1174:         {
 1175:         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 1176:             (c < 256 &&
 1177:               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
 1178:               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 1179:           {
 1180:           if (codevalue == OP_TYPEPOSUPTO)
 1181:             {
 1182:             active_count--;           /* Remove non-match possibility */
 1183:             next_active_state--;
 1184:             }
 1185:           if (++count >= GET2(code, 1))
 1186:             { ADD_NEW(state_offset + 4, 0); }
 1187:           else
 1188:             { ADD_NEW(state_offset, count); }
 1189:           }
 1190:         }
 1191:       break;
 1192: 
 1193: /* ========================================================================== */
 1194:       /* These are virtual opcodes that are used when something like
 1195:       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_HSPACE,
 1196:       OP_NOT_HSPACE, OP_VSPACE, or OP_NOT_VSPACE as its argument. It keeps the
 1197:       code above fast for the other cases. The argument is in the d variable. */
 1198: 
 1199: #ifdef SUPPORT_UCP
 1200:       case OP_PROP_EXTRA + OP_TYPEPLUS:
 1201:       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
 1202:       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
 1203:       count = current_state->count;           /* Already matched */
 1204:       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
 1205:       if (clen > 0)
 1206:         {
 1207:         BOOL OK;
 1208:         const ucd_record * prop = GET_UCD(c);
 1209:         switch(code[2])
 1210:           {
 1211:           case PT_ANY:
 1212:           OK = TRUE;
 1213:           break;
 1214: 
 1215:           case PT_LAMP:
 1216:           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
 1217:             prop->chartype == ucp_Lt;
 1218:           break;
 1219: 
 1220:           case PT_GC:
 1221:           OK = _pcre_ucp_gentype[prop->chartype] == code[3];
 1222:           break;
 1223: 
 1224:           case PT_PC:
 1225:           OK = prop->chartype == code[3];
 1226:           break;
 1227: 
 1228:           case PT_SC:
 1229:           OK = prop->script == code[3];
 1230:           break;
 1231: 
 1232:           /* These are specials for combination cases. */
 1233: 
 1234:           case PT_ALNUM:
 1235:           OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
 1236:                _pcre_ucp_gentype[prop->chartype] == ucp_N;
 1237:           break;
 1238: 
 1239:           case PT_SPACE:    /* Perl space */
 1240:           OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
 1241:                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
 1242:           break;
 1243: 
 1244:           case PT_PXSPACE:  /* POSIX space */
 1245:           OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
 1246:                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
 1247:                c == CHAR_FF || c == CHAR_CR;
 1248:           break;
 1249: 
 1250:           case PT_WORD:
 1251:           OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
 1252:                _pcre_ucp_gentype[prop->chartype] == ucp_N ||
 1253:                c == CHAR_UNDERSCORE;
 1254:           break;
 1255: 
 1256:           /* Should never occur, but keep compilers from grumbling. */
 1257: 
 1258:           default:
 1259:           OK = codevalue != OP_PROP;
 1260:           break;
 1261:           }
 1262: 
 1263:         if (OK == (d == OP_PROP))
 1264:           {
 1265:           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
 1266:             {
 1267:             active_count--;           /* Remove non-match possibility */
 1268:             next_active_state--;
 1269:             }
 1270:           count++;
 1271:           ADD_NEW(state_offset, count);
 1272:           }
 1273:         }
 1274:       break;
 1275: 
 1276:       /*-----------------------------------------------------------------*/
 1277:       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
 1278:       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
 1279:       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
 1280:       count = current_state->count;  /* Already matched */
 1281:       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
 1282:       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
 1283:         {
 1284:         const uschar *nptr = ptr + clen;
 1285:         int ncount = 0;
 1286:         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
 1287:           {
 1288:           active_count--;           /* Remove non-match possibility */
 1289:           next_active_state--;
 1290:           }
 1291:         while (nptr < end_subject)
 1292:           {
 1293:           int nd;
 1294:           int ndlen = 1;
 1295:           GETCHARLEN(nd, nptr, ndlen);
 1296:           if (UCD_CATEGORY(nd) != ucp_M) break;
 1297:           ncount++;
 1298:           nptr += ndlen;
 1299:           }
 1300:         count++;
 1301:         ADD_NEW_DATA(-state_offset, count, ncount);
 1302:         }
 1303:       break;
 1304: #endif
 1305: 
 1306:       /*-----------------------------------------------------------------*/
 1307:       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
 1308:       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
 1309:       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
 1310:       count = current_state->count;  /* Already matched */
 1311:       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
 1312:       if (clen > 0)
 1313:         {
 1314:         int ncount = 0;
 1315:         switch (c)
 1316:           {
 1317:           case 0x000b:
 1318:           case 0x000c:
 1319:           case 0x0085:
 1320:           case 0x2028:
 1321:           case 0x2029:
 1322:           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
 1323:           goto ANYNL01;
 1324: 
 1325:           case 0x000d:
 1326:           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
 1327:           /* Fall through */
 1328: 
 1329:           ANYNL01:
 1330:           case 0x000a:
 1331:           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
 1332:             {
 1333:             active_count--;           /* Remove non-match possibility */
 1334:             next_active_state--;
 1335:             }
 1336:           count++;
 1337:           ADD_NEW_DATA(-state_offset, count, ncount);
 1338:           break;
 1339: 
 1340:           default:
 1341:           break;
 1342:           }
 1343:         }
 1344:       break;
 1345: 
 1346:       /*-----------------------------------------------------------------*/
 1347:       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
 1348:       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
 1349:       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
 1350:       count = current_state->count;  /* Already matched */
 1351:       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
 1352:       if (clen > 0)
 1353:         {
 1354:         BOOL OK;
 1355:         switch (c)
 1356:           {
 1357:           case 0x000a:
 1358:           case 0x000b:
 1359:           case 0x000c:
 1360:           case 0x000d:
 1361:           case 0x0085:
 1362:           case 0x2028:
 1363:           case 0x2029:
 1364:           OK = TRUE;
 1365:           break;
 1366: 
 1367:           default:
 1368:           OK = FALSE;
 1369:           break;
 1370:           }
 1371: 
 1372:         if (OK == (d == OP_VSPACE))
 1373:           {
 1374:           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
 1375:             {
 1376:             active_count--;           /* Remove non-match possibility */
 1377:             next_active_state--;
 1378:             }
 1379:           count++;
 1380:           ADD_NEW_DATA(-state_offset, count, 0);
 1381:           }
 1382:         }
 1383:       break;
 1384: 
 1385:       /*-----------------------------------------------------------------*/
 1386:       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
 1387:       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
 1388:       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
 1389:       count = current_state->count;  /* Already matched */
 1390:       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
 1391:       if (clen > 0)
 1392:         {
 1393:         BOOL OK;
 1394:         switch (c)
 1395:           {
 1396:           case 0x09:      /* HT */
 1397:           case 0x20:      /* SPACE */
 1398:           case 0xa0:      /* NBSP */
 1399:           case 0x1680:    /* OGHAM SPACE MARK */
 1400:           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
 1401:           case 0x2000:    /* EN QUAD */
 1402:           case 0x2001:    /* EM QUAD */
 1403:           case 0x2002:    /* EN SPACE */
 1404:           case 0x2003:    /* EM SPACE */
 1405:           case 0x2004:    /* THREE-PER-EM SPACE */
 1406:           case 0x2005:    /* FOUR-PER-EM SPACE */
 1407:           case 0x2006:    /* SIX-PER-EM SPACE */
 1408:           case 0x2007:    /* FIGURE SPACE */
 1409:           case 0x2008:    /* PUNCTUATION SPACE */
 1410:           case 0x2009:    /* THIN SPACE */
 1411:           case 0x200A:    /* HAIR SPACE */
 1412:           case 0x202f:    /* NARROW NO-BREAK SPACE */
 1413:           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
 1414:           case 0x3000:    /* IDEOGRAPHIC SPACE */
 1415:           OK = TRUE;
 1416:           break;
 1417: 
 1418:           default:
 1419:           OK = FALSE;
 1420:           break;
 1421:           }
 1422: 
 1423:         if (OK == (d == OP_HSPACE))
 1424:           {
 1425:           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
 1426:             {
 1427:             active_count--;           /* Remove non-match possibility */
 1428:             next_active_state--;
 1429:             }
 1430:           count++;
 1431:           ADD_NEW_DATA(-state_offset, count, 0);
 1432:           }
 1433:         }
 1434:       break;
 1435: 
 1436:       /*-----------------------------------------------------------------*/
 1437: #ifdef SUPPORT_UCP
 1438:       case OP_PROP_EXTRA + OP_TYPEQUERY:
 1439:       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
 1440:       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
 1441:       count = 4;
 1442:       goto QS1;
 1443: 
 1444:       case OP_PROP_EXTRA + OP_TYPESTAR:
 1445:       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
 1446:       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
 1447:       count = 0;
 1448: 
 1449:       QS1:
 1450: 
 1451:       ADD_ACTIVE(state_offset + 4, 0);
 1452:       if (clen > 0)
 1453:         {
 1454:         BOOL OK;
 1455:         const ucd_record * prop = GET_UCD(c);
 1456:         switch(code[2])
 1457:           {
 1458:           case PT_ANY:
 1459:           OK = TRUE;
 1460:           break;
 1461: 
 1462:           case PT_LAMP:
 1463:           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
 1464:             prop->chartype == ucp_Lt;
 1465:           break;
 1466: 
 1467:           case PT_GC:
 1468:           OK = _pcre_ucp_gentype[prop->chartype] == code[3];
 1469:           break;
 1470: 
 1471:           case PT_PC:
 1472:           OK = prop->chartype == code[3];
 1473:           break;
 1474: 
 1475:           case PT_SC:
 1476:           OK = prop->script == code[3];
 1477:           break;
 1478: 
 1479:           /* These are specials for combination cases. */
 1480: 
 1481:           case PT_ALNUM:
 1482:           OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
 1483:                _pcre_ucp_gentype[prop->chartype] == ucp_N;
 1484:           break;
 1485: 
 1486:           case PT_SPACE:    /* Perl space */
 1487:           OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
 1488:                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
 1489:           break;
 1490: 
 1491:           case PT_PXSPACE:  /* POSIX space */
 1492:           OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
 1493:                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
 1494:                c == CHAR_FF || c == CHAR_CR;
 1495:           break;
 1496: 
 1497:           case PT_WORD:
 1498:           OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
 1499:                _pcre_ucp_gentype[prop->chartype] == ucp_N ||
 1500:                c == CHAR_UNDERSCORE;
 1501:           break;
 1502: 
 1503:           /* Should never occur, but keep compilers from grumbling. */
 1504: 
 1505:           default:
 1506:           OK = codevalue != OP_PROP;
 1507:           break;
 1508:           }
 1509: 
 1510:         if (OK == (d == OP_PROP))
 1511:           {
 1512:           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
 1513:               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
 1514:             {
 1515:             active_count--;           /* Remove non-match possibility */
 1516:             next_active_state--;
 1517:             }
 1518:           ADD_NEW(state_offset + count, 0);
 1519:           }
 1520:         }
 1521:       break;
 1522: 
 1523:       /*-----------------------------------------------------------------*/
 1524:       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
 1525:       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
 1526:       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
 1527:       count = 2;
 1528:       goto QS2;
 1529: 
 1530:       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
 1531:       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
 1532:       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
 1533:       count = 0;
 1534: 
 1535:       QS2:
 1536: 
 1537:       ADD_ACTIVE(state_offset + 2, 0);
 1538:       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
 1539:         {
 1540:         const uschar *nptr = ptr + clen;
 1541:         int ncount = 0;
 1542:         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
 1543:             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
 1544:           {
 1545:           active_count--;           /* Remove non-match possibility */
 1546:           next_active_state--;
 1547:           }
 1548:         while (nptr < end_subject)
 1549:           {
 1550:           int nd;
 1551:           int ndlen = 1;
 1552:           GETCHARLEN(nd, nptr, ndlen);
 1553:           if (UCD_CATEGORY(nd) != ucp_M) break;
 1554:           ncount++;
 1555:           nptr += ndlen;
 1556:           }
 1557:         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
 1558:         }
 1559:       break;
 1560: #endif
 1561: 
 1562:       /*-----------------------------------------------------------------*/
 1563:       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
 1564:       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
 1565:       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
 1566:       count = 2;
 1567:       goto QS3;
 1568: 
 1569:       case OP_ANYNL_EXTRA + OP_TYPESTAR:
 1570:       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
 1571:       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
 1572:       count = 0;
 1573: 
 1574:       QS3:
 1575:       ADD_ACTIVE(state_offset + 2, 0);
 1576:       if (clen > 0)
 1577:         {
 1578:         int ncount = 0;
 1579:         switch (c)
 1580:           {
 1581:           case 0x000b:
 1582:           case 0x000c:
 1583:           case 0x0085:
 1584:           case 0x2028:
 1585:           case 0x2029:
 1586:           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
 1587:           goto ANYNL02;
 1588: 
 1589:           case 0x000d:
 1590:           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
 1591:           /* Fall through */
 1592: 
 1593:           ANYNL02:
 1594:           case 0x000a:
 1595:           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
 1596:               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
 1597:             {
 1598:             active_count--;           /* Remove non-match possibility */
 1599:             next_active_state--;
 1600:             }
 1601:           ADD_NEW_DATA(-(state_offset + count), 0, ncount);
 1602:           break;
 1603: 
 1604:           default:
 1605:           break;
 1606:           }
 1607:         }
 1608:       break;
 1609: 
 1610:       /*-----------------------------------------------------------------*/
 1611:       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
 1612:       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
 1613:       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
 1614:       count = 2;
 1615:       goto QS4;
 1616: 
 1617:       case OP_VSPACE_EXTRA + OP_TYPESTAR:
 1618:       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
 1619:       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
 1620:       count = 0;
 1621: 
 1622:       QS4:
 1623:       ADD_ACTIVE(state_offset + 2, 0);
 1624:       if (clen > 0)
 1625:         {
 1626:         BOOL OK;
 1627:         switch (c)
 1628:           {
 1629:           case 0x000a:
 1630:           case 0x000b:
 1631:           case 0x000c:
 1632:           case 0x000d:
 1633:           case 0x0085:
 1634:           case 0x2028:
 1635:           case 0x2029:
 1636:           OK = TRUE;
 1637:           break;
 1638: 
 1639:           default:
 1640:           OK = FALSE;
 1641:           break;
 1642:           }
 1643:         if (OK == (d == OP_VSPACE))
 1644:           {
 1645:           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
 1646:               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
 1647:             {
 1648:             active_count--;           /* Remove non-match possibility */
 1649:             next_active_state--;
 1650:             }
 1651:           ADD_NEW_DATA(-(state_offset + count), 0, 0);
 1652:           }
 1653:         }
 1654:       break;
 1655: 
 1656:       /*-----------------------------------------------------------------*/
 1657:       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
 1658:       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
 1659:       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
 1660:       count = 2;
 1661:       goto QS5;
 1662: 
 1663:       case OP_HSPACE_EXTRA + OP_TYPESTAR:
 1664:       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
 1665:       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
 1666:       count = 0;
 1667: 
 1668:       QS5:
 1669:       ADD_ACTIVE(state_offset + 2, 0);
 1670:       if (clen > 0)
 1671:         {
 1672:         BOOL OK;
 1673:         switch (c)
 1674:           {
 1675:           case 0x09:      /* HT */
 1676:           case 0x20:      /* SPACE */
 1677:           case 0xa0:      /* NBSP */
 1678:           case 0x1680:    /* OGHAM SPACE MARK */
 1679:           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
 1680:           case 0x2000:    /* EN QUAD */
 1681:           case 0x2001:    /* EM QUAD */
 1682:           case 0x2002:    /* EN SPACE */
 1683:           case 0x2003:    /* EM SPACE */
 1684:           case 0x2004:    /* THREE-PER-EM SPACE */
 1685:           case 0x2005:    /* FOUR-PER-EM SPACE */
 1686:           case 0x2006:    /* SIX-PER-EM SPACE */
 1687:           case 0x2007:    /* FIGURE SPACE */
 1688:           case 0x2008:    /* PUNCTUATION SPACE */
 1689:           case 0x2009:    /* THIN SPACE */
 1690:           case 0x200A:    /* HAIR SPACE */
 1691:           case 0x202f:    /* NARROW NO-BREAK SPACE */
 1692:           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
 1693:           case 0x3000:    /* IDEOGRAPHIC SPACE */
 1694:           OK = TRUE;
 1695:           break;
 1696: 
 1697:           default:
 1698:           OK = FALSE;
 1699:           break;
 1700:           }
 1701: 
 1702:         if (OK == (d == OP_HSPACE))
 1703:           {
 1704:           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
 1705:               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
 1706:             {
 1707:             active_count--;           /* Remove non-match possibility */
 1708:             next_active_state--;
 1709:             }
 1710:           ADD_NEW_DATA(-(state_offset + count), 0, 0);
 1711:           }
 1712:         }
 1713:       break;
 1714: 
 1715:       /*-----------------------------------------------------------------*/
 1716: #ifdef SUPPORT_UCP
 1717:       case OP_PROP_EXTRA + OP_TYPEEXACT:
 1718:       case OP_PROP_EXTRA + OP_TYPEUPTO:
 1719:       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
 1720:       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
 1721:       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
 1722:         { ADD_ACTIVE(state_offset + 6, 0); }
 1723:       count = current_state->count;  /* Number already matched */
 1724:       if (clen > 0)
 1725:         {
 1726:         BOOL OK;
 1727:         const ucd_record * prop = GET_UCD(c);
 1728:         switch(code[4])
 1729:           {
 1730:           case PT_ANY:
 1731:           OK = TRUE;
 1732:           break;
 1733: 
 1734:           case PT_LAMP:
 1735:           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
 1736:             prop->chartype == ucp_Lt;
 1737:           break;
 1738: 
 1739:           case PT_GC:
 1740:           OK = _pcre_ucp_gentype[prop->chartype] == code[5];
 1741:           break;
 1742: 
 1743:           case PT_PC:
 1744:           OK = prop->chartype == code[5];
 1745:           break;
 1746: 
 1747:           case PT_SC:
 1748:           OK = prop->script == code[5];
 1749:           break;
 1750: 
 1751:           /* These are specials for combination cases. */
 1752: 
 1753:           case PT_ALNUM:
 1754:           OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
 1755:                _pcre_ucp_gentype[prop->chartype] == ucp_N;
 1756:           break;
 1757: 
 1758:           case PT_SPACE:    /* Perl space */
 1759:           OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
 1760:                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
 1761:           break;
 1762: 
 1763:           case PT_PXSPACE:  /* POSIX space */
 1764:           OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
 1765:                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
 1766:                c == CHAR_FF || c == CHAR_CR;
 1767:           break;
 1768: 
 1769:           case PT_WORD:
 1770:           OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
 1771:                _pcre_ucp_gentype[prop->chartype] == ucp_N ||
 1772:                c == CHAR_UNDERSCORE;
 1773:           break;
 1774: 
 1775:           /* Should never occur, but keep compilers from grumbling. */
 1776: 
 1777:           default:
 1778:           OK = codevalue != OP_PROP;
 1779:           break;
 1780:           }
 1781: 
 1782:         if (OK == (d == OP_PROP))
 1783:           {
 1784:           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
 1785:             {
 1786:             active_count--;           /* Remove non-match possibility */
 1787:             next_active_state--;
 1788:             }
 1789:           if (++count >= GET2(code, 1))
 1790:             { ADD_NEW(state_offset + 6, 0); }
 1791:           else
 1792:             { ADD_NEW(state_offset, count); }
 1793:           }
 1794:         }
 1795:       break;
 1796: 
 1797:       /*-----------------------------------------------------------------*/
 1798:       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
 1799:       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
 1800:       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
 1801:       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
 1802:       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
 1803:         { ADD_ACTIVE(state_offset + 4, 0); }
 1804:       count = current_state->count;  /* Number already matched */
 1805:       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
 1806:         {
 1807:         const uschar *nptr = ptr + clen;
 1808:         int ncount = 0;
 1809:         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
 1810:           {
 1811:           active_count--;           /* Remove non-match possibility */
 1812:           next_active_state--;
 1813:           }
 1814:         while (nptr < end_subject)
 1815:           {
 1816:           int nd;
 1817:           int ndlen = 1;
 1818:           GETCHARLEN(nd, nptr, ndlen);
 1819:           if (UCD_CATEGORY(nd) != ucp_M) break;
 1820:           ncount++;
 1821:           nptr += ndlen;
 1822:           }
 1823:         if (++count >= GET2(code, 1))
 1824:           { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
 1825:         else
 1826:           { ADD_NEW_DATA(-state_offset, count, ncount); }
 1827:         }
 1828:       break;
 1829: #endif
 1830: 
 1831:       /*-----------------------------------------------------------------*/
 1832:       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
 1833:       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
 1834:       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
 1835:       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
 1836:       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
 1837:         { ADD_ACTIVE(state_offset + 4, 0); }
 1838:       count = current_state->count;  /* Number already matched */
 1839:       if (clen > 0)
 1840:         {
 1841:         int ncount = 0;
 1842:         switch (c)
 1843:           {
 1844:           case 0x000b:
 1845:           case 0x000c:
 1846:           case 0x0085:
 1847:           case 0x2028:
 1848:           case 0x2029:
 1849:           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
 1850:           goto ANYNL03;
 1851: 
 1852:           case 0x000d:
 1853:           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
 1854:           /* Fall through */
 1855: 
 1856:           ANYNL03:
 1857:           case 0x000a:
 1858:           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
 1859:             {
 1860:             active_count--;           /* Remove non-match possibility */
 1861:             next_active_state--;
 1862:             }
 1863:           if (++count >= GET2(code, 1))
 1864:             { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
 1865:           else
 1866:             { ADD_NEW_DATA(-state_offset, count, ncount); }
 1867:           break;
 1868: 
 1869:           default:
 1870:           break;
 1871:           }
 1872:         }
 1873:       break;
 1874: 
 1875:       /*-----------------------------------------------------------------*/
 1876:       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
 1877:       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
 1878:       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
 1879:       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
 1880:       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
 1881:         { ADD_ACTIVE(state_offset + 4, 0); }
 1882:       count = current_state->count;  /* Number already matched */
 1883:       if (clen > 0)
 1884:         {
 1885:         BOOL OK;
 1886:         switch (c)
 1887:           {
 1888:           case 0x000a:
 1889:           case 0x000b:
 1890:           case 0x000c:
 1891:           case 0x000d:
 1892:           case 0x0085:
 1893:           case 0x2028:
 1894:           case 0x2029:
 1895:           OK = TRUE;
 1896:           break;
 1897: 
 1898:           default:
 1899:           OK = FALSE;
 1900:           }
 1901: 
 1902:         if (OK == (d == OP_VSPACE))
 1903:           {
 1904:           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
 1905:             {
 1906:             active_count--;           /* Remove non-match possibility */
 1907:             next_active_state--;
 1908:             }
 1909:           if (++count >= GET2(code, 1))
 1910:             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
 1911:           else
 1912:             { ADD_NEW_DATA(-state_offset, count, 0); }
 1913:           }
 1914:         }
 1915:       break;
 1916: 
 1917:       /*-----------------------------------------------------------------*/
 1918:       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
 1919:       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
 1920:       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
 1921:       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
 1922:       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
 1923:         { ADD_ACTIVE(state_offset + 4, 0); }
 1924:       count = current_state->count;  /* Number already matched */
 1925:       if (clen > 0)
 1926:         {
 1927:         BOOL OK;
 1928:         switch (c)
 1929:           {
 1930:           case 0x09:      /* HT */
 1931:           case 0x20:      /* SPACE */
 1932:           case 0xa0:      /* NBSP */
 1933:           case 0x1680:    /* OGHAM SPACE MARK */
 1934:           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
 1935:           case 0x2000:    /* EN QUAD */
 1936:           case 0x2001:    /* EM QUAD */
 1937:           case 0x2002:    /* EN SPACE */
 1938:           case 0x2003:    /* EM SPACE */
 1939:           case 0x2004:    /* THREE-PER-EM SPACE */
 1940:           case 0x2005:    /* FOUR-PER-EM SPACE */
 1941:           case 0x2006:    /* SIX-PER-EM SPACE */
 1942:           case 0x2007:    /* FIGURE SPACE */
 1943:           case 0x2008:    /* PUNCTUATION SPACE */
 1944:           case 0x2009:    /* THIN SPACE */
 1945:           case 0x200A:    /* HAIR SPACE */
 1946:           case 0x202f:    /* NARROW NO-BREAK SPACE */
 1947:           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
 1948:           case 0x3000:    /* IDEOGRAPHIC SPACE */
 1949:           OK = TRUE;
 1950:           break;
 1951: 
 1952:           default:
 1953:           OK = FALSE;
 1954:           break;
 1955:           }
 1956: 
 1957:         if (OK == (d == OP_HSPACE))
 1958:           {
 1959:           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
 1960:             {
 1961:             active_count--;           /* Remove non-match possibility */
 1962:             next_active_state--;
 1963:             }
 1964:           if (++count >= GET2(code, 1))
 1965:             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
 1966:           else
 1967:             { ADD_NEW_DATA(-state_offset, count, 0); }
 1968:           }
 1969:         }
 1970:       break;
 1971: 
 1972: /* ========================================================================== */
 1973:       /* These opcodes are followed by a character that is usually compared
 1974:       to the current subject character; it is loaded into d. We still get
 1975:       here even if there is no subject character, because in some cases zero
 1976:       repetitions are permitted. */
 1977: 
 1978:       /*-----------------------------------------------------------------*/
 1979:       case OP_CHAR:
 1980:       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
 1981:       break;
 1982: 
 1983:       /*-----------------------------------------------------------------*/
 1984:       case OP_CHARI:
 1985:       if (clen == 0) break;
 1986: 
 1987: #ifdef SUPPORT_UTF8
 1988:       if (utf8)
 1989:         {
 1990:         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
 1991:           {
 1992:           unsigned int othercase;
 1993:           if (c < 128) othercase = fcc[c]; else
 1994: 
 1995:           /* If we have Unicode property support, we can use it to test the
 1996:           other case of the character. */
 1997: 
 1998: #ifdef SUPPORT_UCP
 1999:           othercase = UCD_OTHERCASE(c);
 2000: #else
 2001:           othercase = NOTACHAR;
 2002: #endif
 2003: 
 2004:           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
 2005:           }
 2006:         }
 2007:       else
 2008: #endif  /* SUPPORT_UTF8 */
 2009: 
 2010:       /* Non-UTF-8 mode */
 2011:         {
 2012:         if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
 2013:         }
 2014:       break;
 2015: 
 2016: 
 2017: #ifdef SUPPORT_UCP
 2018:       /*-----------------------------------------------------------------*/
 2019:       /* This is a tricky one because it can match more than one character.
 2020:       Find out how many characters to skip, and then set up a negative state
 2021:       to wait for them to pass before continuing. */
 2022: 
 2023:       case OP_EXTUNI:
 2024:       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
 2025:         {
 2026:         const uschar *nptr = ptr + clen;
 2027:         int ncount = 0;
 2028:         while (nptr < end_subject)
 2029:           {
 2030:           int nclen = 1;
 2031:           GETCHARLEN(c, nptr, nclen);
 2032:           if (UCD_CATEGORY(c) != ucp_M) break;
 2033:           ncount++;
 2034:           nptr += nclen;
 2035:           }
 2036:         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
 2037:         }
 2038:       break;
 2039: #endif
 2040: 
 2041:       /*-----------------------------------------------------------------*/
 2042:       /* This is tricky like EXTUNI because it too can match more than one
 2043:       character (when CR is followed by LF). In this case, set up a negative
 2044:       state to wait for one character to pass before continuing. */
 2045: 
 2046:       case OP_ANYNL:
 2047:       if (clen > 0) switch(c)
 2048:         {
 2049:         case 0x000b:
 2050:         case 0x000c:
 2051:         case 0x0085:
 2052:         case 0x2028:
 2053:         case 0x2029:
 2054:         if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
 2055: 
 2056:         case 0x000a:
 2057:         ADD_NEW(state_offset + 1, 0);
 2058:         break;
 2059: 
 2060:         case 0x000d:
 2061:         if (ptr + 1 < end_subject && ptr[1] == 0x0a)
 2062:           {
 2063:           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
 2064:           }
 2065:         else
 2066:           {
 2067:           ADD_NEW(state_offset + 1, 0);
 2068:           }
 2069:         break;
 2070:         }
 2071:       break;
 2072: 
 2073:       /*-----------------------------------------------------------------*/
 2074:       case OP_NOT_VSPACE:
 2075:       if (clen > 0) switch(c)
 2076:         {
 2077:         case 0x000a:
 2078:         case 0x000b:
 2079:         case 0x000c:
 2080:         case 0x000d:
 2081:         case 0x0085:
 2082:         case 0x2028:
 2083:         case 0x2029:
 2084:         break;
 2085: 
 2086:         default:
 2087:         ADD_NEW(state_offset + 1, 0);
 2088:         break;
 2089:         }
 2090:       break;
 2091: 
 2092:       /*-----------------------------------------------------------------*/
 2093:       case OP_VSPACE:
 2094:       if (clen > 0) switch(c)
 2095:         {
 2096:         case 0x000a:
 2097:         case 0x000b:
 2098:         case 0x000c:
 2099:         case 0x000d:
 2100:         case 0x0085:
 2101:         case 0x2028:
 2102:         case 0x2029:
 2103:         ADD_NEW(state_offset + 1, 0);
 2104:         break;
 2105: 
 2106:         default: break;
 2107:         }
 2108:       break;
 2109: 
 2110:       /*-----------------------------------------------------------------*/
 2111:       case OP_NOT_HSPACE:
 2112:       if (clen > 0) switch(c)
 2113:         {
 2114:         case 0x09:      /* HT */
 2115:         case 0x20:      /* SPACE */
 2116:         case 0xa0:      /* NBSP */
 2117:         case 0x1680:    /* OGHAM SPACE MARK */
 2118:         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
 2119:         case 0x2000:    /* EN QUAD */
 2120:         case 0x2001:    /* EM QUAD */
 2121:         case 0x2002:    /* EN SPACE */
 2122:         case 0x2003:    /* EM SPACE */
 2123:         case 0x2004:    /* THREE-PER-EM SPACE */
 2124:         case 0x2005:    /* FOUR-PER-EM SPACE */
 2125:         case 0x2006:    /* SIX-PER-EM SPACE */
 2126:         case 0x2007:    /* FIGURE SPACE */
 2127:         case 0x2008:    /* PUNCTUATION SPACE */
 2128:         case 0x2009:    /* THIN SPACE */
 2129:         case 0x200A:    /* HAIR SPACE */
 2130:         case 0x202f:    /* NARROW NO-BREAK SPACE */
 2131:         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
 2132:         case 0x3000:    /* IDEOGRAPHIC SPACE */
 2133:         break;
 2134: 
 2135:         default:
 2136:         ADD_NEW(state_offset + 1, 0);
 2137:         break;
 2138:         }
 2139:       break;
 2140: 
 2141:       /*-----------------------------------------------------------------*/
 2142:       case OP_HSPACE:
 2143:       if (clen > 0) switch(c)
 2144:         {
 2145:         case 0x09:      /* HT */
 2146:         case 0x20:      /* SPACE */
 2147:         case 0xa0:      /* NBSP */
 2148:         case 0x1680:    /* OGHAM SPACE MARK */
 2149:         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
 2150:         case 0x2000:    /* EN QUAD */
 2151:         case 0x2001:    /* EM QUAD */
 2152:         case 0x2002:    /* EN SPACE */
 2153:         case 0x2003:    /* EM SPACE */
 2154:         case 0x2004:    /* THREE-PER-EM SPACE */
 2155:         case 0x2005:    /* FOUR-PER-EM SPACE */
 2156:         case 0x2006:    /* SIX-PER-EM SPACE */
 2157:         case 0x2007:    /* FIGURE SPACE */
 2158:         case 0x2008:    /* PUNCTUATION SPACE */
 2159:         case 0x2009:    /* THIN SPACE */
 2160:         case 0x200A:    /* HAIR SPACE */
 2161:         case 0x202f:    /* NARROW NO-BREAK SPACE */
 2162:         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
 2163:         case 0x3000:    /* IDEOGRAPHIC SPACE */
 2164:         ADD_NEW(state_offset + 1, 0);
 2165:         break;
 2166:         }
 2167:       break;
 2168: 
 2169:       /*-----------------------------------------------------------------*/
 2170:       /* Match a negated single character casefully. This is only used for
 2171:       one-byte characters, that is, we know that d < 256. The character we are
 2172:       checking (c) can be multibyte. */
 2173: 
 2174:       case OP_NOT:
 2175:       if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
 2176:       break;
 2177: 
 2178:       /*-----------------------------------------------------------------*/
 2179:       /* Match a negated single character caselessly. This is only used for
 2180:       one-byte characters, that is, we know that d < 256. The character we are
 2181:       checking (c) can be multibyte. */
 2182: 
 2183:       case OP_NOTI:
 2184:       if (clen > 0 && c != d && c != fcc[d])
 2185:         { ADD_NEW(state_offset + dlen + 1, 0); }
 2186:       break;
 2187: 
 2188:       /*-----------------------------------------------------------------*/
 2189:       case OP_PLUSI:
 2190:       case OP_MINPLUSI:
 2191:       case OP_POSPLUSI:
 2192:       case OP_NOTPLUSI:
 2193:       case OP_NOTMINPLUSI:
 2194:       case OP_NOTPOSPLUSI:
 2195:       caseless = TRUE;
 2196:       codevalue -= OP_STARI - OP_STAR;
 2197: 
 2198:       /* Fall through */
 2199:       case OP_PLUS:
 2200:       case OP_MINPLUS:
 2201:       case OP_POSPLUS:
 2202:       case OP_NOTPLUS:
 2203:       case OP_NOTMINPLUS:
 2204:       case OP_NOTPOSPLUS:
 2205:       count = current_state->count;  /* Already matched */
 2206:       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
 2207:       if (clen > 0)
 2208:         {
 2209:         unsigned int otherd = NOTACHAR;
 2210:         if (caseless)
 2211:           {
 2212: #ifdef SUPPORT_UTF8
 2213:           if (utf8 && d >= 128)
 2214:             {
 2215: #ifdef SUPPORT_UCP
 2216:             otherd = UCD_OTHERCASE(d);
 2217: #endif  /* SUPPORT_UCP */
 2218:             }
 2219:           else
 2220: #endif  /* SUPPORT_UTF8 */
 2221:           otherd = fcc[d];
 2222:           }
 2223:         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
 2224:           {
 2225:           if (count > 0 &&
 2226:               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
 2227:             {
 2228:             active_count--;             /* Remove non-match possibility */
 2229:             next_active_state--;
 2230:             }
 2231:           count++;
 2232:           ADD_NEW(state_offset, count);
 2233:           }
 2234:         }
 2235:       break;
 2236: 
 2237:       /*-----------------------------------------------------------------*/
 2238:       case OP_QUERYI:
 2239:       case OP_MINQUERYI:
 2240:       case OP_POSQUERYI:
 2241:       case OP_NOTQUERYI:
 2242:       case OP_NOTMINQUERYI:
 2243:       case OP_NOTPOSQUERYI:
 2244:       caseless = TRUE;
 2245:       codevalue -= OP_STARI - OP_STAR;
 2246:       /* Fall through */
 2247:       case OP_QUERY:
 2248:       case OP_MINQUERY:
 2249:       case OP_POSQUERY:
 2250:       case OP_NOTQUERY:
 2251:       case OP_NOTMINQUERY:
 2252:       case OP_NOTPOSQUERY:
 2253:       ADD_ACTIVE(state_offset + dlen + 1, 0);
 2254:       if (clen > 0)
 2255:         {
 2256:         unsigned int otherd = NOTACHAR;
 2257:         if (caseless)
 2258:           {
 2259: #ifdef SUPPORT_UTF8
 2260:           if (utf8 && d >= 128)
 2261:             {
 2262: #ifdef SUPPORT_UCP
 2263:             otherd = UCD_OTHERCASE(d);
 2264: #endif  /* SUPPORT_UCP */
 2265:             }
 2266:           else
 2267: #endif  /* SUPPORT_UTF8 */
 2268:           otherd = fcc[d];
 2269:           }
 2270:         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
 2271:           {
 2272:           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
 2273:             {
 2274:             active_count--;            /* Remove non-match possibility */
 2275:             next_active_state--;
 2276:             }
 2277:           ADD_NEW(state_offset + dlen + 1, 0);
 2278:           }
 2279:         }
 2280:       break;
 2281: 
 2282:       /*-----------------------------------------------------------------*/
 2283:       case OP_STARI:
 2284:       case OP_MINSTARI:
 2285:       case OP_POSSTARI:
 2286:       case OP_NOTSTARI:
 2287:       case OP_NOTMINSTARI:
 2288:       case OP_NOTPOSSTARI:
 2289:       caseless = TRUE;
 2290:       codevalue -= OP_STARI - OP_STAR;
 2291:       /* Fall through */
 2292:       case OP_STAR:
 2293:       case OP_MINSTAR:
 2294:       case OP_POSSTAR:
 2295:       case OP_NOTSTAR:
 2296:       case OP_NOTMINSTAR:
 2297:       case OP_NOTPOSSTAR:
 2298:       ADD_ACTIVE(state_offset + dlen + 1, 0);
 2299:       if (clen > 0)
 2300:         {
 2301:         unsigned int otherd = NOTACHAR;
 2302:         if (caseless)
 2303:           {
 2304: #ifdef SUPPORT_UTF8
 2305:           if (utf8 && d >= 128)
 2306:             {
 2307: #ifdef SUPPORT_UCP
 2308:             otherd = UCD_OTHERCASE(d);
 2309: #endif  /* SUPPORT_UCP */
 2310:             }
 2311:           else
 2312: #endif  /* SUPPORT_UTF8 */
 2313:           otherd = fcc[d];
 2314:           }
 2315:         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
 2316:           {
 2317:           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
 2318:             {
 2319:             active_count--;            /* Remove non-match possibility */
 2320:             next_active_state--;
 2321:             }
 2322:           ADD_NEW(state_offset, 0);
 2323:           }
 2324:         }
 2325:       break;
 2326: 
 2327:       /*-----------------------------------------------------------------*/
 2328:       case OP_EXACTI:
 2329:       case OP_NOTEXACTI:
 2330:       caseless = TRUE;
 2331:       codevalue -= OP_STARI - OP_STAR;
 2332:       /* Fall through */
 2333:       case OP_EXACT:
 2334:       case OP_NOTEXACT:
 2335:       count = current_state->count;  /* Number already matched */
 2336:       if (clen > 0)
 2337:         {
 2338:         unsigned int otherd = NOTACHAR;
 2339:         if (caseless)
 2340:           {
 2341: #ifdef SUPPORT_UTF8
 2342:           if (utf8 && d >= 128)
 2343:             {
 2344: #ifdef SUPPORT_UCP
 2345:             otherd = UCD_OTHERCASE(d);
 2346: #endif  /* SUPPORT_UCP */
 2347:             }
 2348:           else
 2349: #endif  /* SUPPORT_UTF8 */
 2350:           otherd = fcc[d];
 2351:           }
 2352:         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
 2353:           {
 2354:           if (++count >= GET2(code, 1))
 2355:             { ADD_NEW(state_offset + dlen + 3, 0); }
 2356:           else
 2357:             { ADD_NEW(state_offset, count); }
 2358:           }
 2359:         }
 2360:       break;
 2361: 
 2362:       /*-----------------------------------------------------------------*/
 2363:       case OP_UPTOI:
 2364:       case OP_MINUPTOI:
 2365:       case OP_POSUPTOI:
 2366:       case OP_NOTUPTOI:
 2367:       case OP_NOTMINUPTOI:
 2368:       case OP_NOTPOSUPTOI:
 2369:       caseless = TRUE;
 2370:       codevalue -= OP_STARI - OP_STAR;
 2371:       /* Fall through */
 2372:       case OP_UPTO:
 2373:       case OP_MINUPTO:
 2374:       case OP_POSUPTO:
 2375:       case OP_NOTUPTO:
 2376:       case OP_NOTMINUPTO:
 2377:       case OP_NOTPOSUPTO:
 2378:       ADD_ACTIVE(state_offset + dlen + 3, 0);
 2379:       count = current_state->count;  /* Number already matched */
 2380:       if (clen > 0)
 2381:         {
 2382:         unsigned int otherd = NOTACHAR;
 2383:         if (caseless)
 2384:           {
 2385: #ifdef SUPPORT_UTF8
 2386:           if (utf8 && d >= 128)
 2387:             {
 2388: #ifdef SUPPORT_UCP
 2389:             otherd = UCD_OTHERCASE(d);
 2390: #endif  /* SUPPORT_UCP */
 2391:             }
 2392:           else
 2393: #endif  /* SUPPORT_UTF8 */
 2394:           otherd = fcc[d];
 2395:           }
 2396:         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
 2397:           {
 2398:           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
 2399:             {
 2400:             active_count--;             /* Remove non-match possibility */
 2401:             next_active_state--;
 2402:             }
 2403:           if (++count >= GET2(code, 1))
 2404:             { ADD_NEW(state_offset + dlen + 3, 0); }
 2405:           else
 2406:             { ADD_NEW(state_offset, count); }
 2407:           }
 2408:         }
 2409:       break;
 2410: 
 2411: 
 2412: /* ========================================================================== */
 2413:       /* These are the class-handling opcodes */
 2414: 
 2415:       case OP_CLASS:
 2416:       case OP_NCLASS:
 2417:       case OP_XCLASS:
 2418:         {
 2419:         BOOL isinclass = FALSE;
 2420:         int next_state_offset;
 2421:         const uschar *ecode;
 2422: 
 2423:         /* For a simple class, there is always just a 32-byte table, and we
 2424:         can set isinclass from it. */
 2425: 
 2426:         if (codevalue != OP_XCLASS)
 2427:           {
 2428:           ecode = code + 33;
 2429:           if (clen > 0)
 2430:             {
 2431:             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
 2432:               ((code[1 + c/8] & (1 << (c&7))) != 0);
 2433:             }
 2434:           }
 2435: 
 2436:         /* An extended class may have a table or a list of single characters,
 2437:         ranges, or both, and it may be positive or negative. There's a
 2438:         function that sorts all this out. */
 2439: 
 2440:         else
 2441:          {
 2442:          ecode = code + GET(code, 1);
 2443:          if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
 2444:          }
 2445: 
 2446:         /* At this point, isinclass is set for all kinds of class, and ecode
 2447:         points to the byte after the end of the class. If there is a
 2448:         quantifier, this is where it will be. */
 2449: 
 2450:         next_state_offset = (int)(ecode - start_code);
 2451: 
 2452:         switch (*ecode)
 2453:           {
 2454:           case OP_CRSTAR:
 2455:           case OP_CRMINSTAR:
 2456:           ADD_ACTIVE(next_state_offset + 1, 0);
 2457:           if (isinclass) { ADD_NEW(state_offset, 0); }
 2458:           break;
 2459: 
 2460:           case OP_CRPLUS:
 2461:           case OP_CRMINPLUS:
 2462:           count = current_state->count;  /* Already matched */
 2463:           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
 2464:           if (isinclass) { count++; ADD_NEW(state_offset, count); }
 2465:           break;
 2466: 
 2467:           case OP_CRQUERY:
 2468:           case OP_CRMINQUERY:
 2469:           ADD_ACTIVE(next_state_offset + 1, 0);
 2470:           if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
 2471:           break;
 2472: 
 2473:           case OP_CRRANGE:
 2474:           case OP_CRMINRANGE:
 2475:           count = current_state->count;  /* Already matched */
 2476:           if (count >= GET2(ecode, 1))
 2477:             { ADD_ACTIVE(next_state_offset + 5, 0); }
 2478:           if (isinclass)
 2479:             {
 2480:             int max = GET2(ecode, 3);
 2481:             if (++count >= max && max != 0)   /* Max 0 => no limit */
 2482:               { ADD_NEW(next_state_offset + 5, 0); }
 2483:             else
 2484:               { ADD_NEW(state_offset, count); }
 2485:             }
 2486:           break;
 2487: 
 2488:           default:
 2489:           if (isinclass) { ADD_NEW(next_state_offset, 0); }
 2490:           break;
 2491:           }
 2492:         }
 2493:       break;
 2494: 
 2495: /* ========================================================================== */
 2496:       /* These are the opcodes for fancy brackets of various kinds. We have
 2497:       to use recursion in order to handle them. The "always failing" assertion
 2498:       (?!) is optimised to OP_FAIL when compiling, so we have to support that,
 2499:       though the other "backtracking verbs" are not supported. */
 2500: 
 2501:       case OP_FAIL:
 2502:       forced_fail++;    /* Count FAILs for multiple states */
 2503:       break;
 2504: 
 2505:       case OP_ASSERT:
 2506:       case OP_ASSERT_NOT:
 2507:       case OP_ASSERTBACK:
 2508:       case OP_ASSERTBACK_NOT:
 2509:         {
 2510:         int rc;
 2511:         int local_offsets[2];
 2512:         int local_workspace[1000];
 2513:         const uschar *endasscode = code + GET(code, 1);
 2514: 
 2515:         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
 2516: 
 2517:         rc = internal_dfa_exec(
 2518:           md,                                   /* static match data */
 2519:           code,                                 /* this subexpression's code */
 2520:           ptr,                                  /* where we currently are */
 2521:           (int)(ptr - start_subject),           /* start offset */
 2522:           local_offsets,                        /* offset vector */
 2523:           sizeof(local_offsets)/sizeof(int),    /* size of same */
 2524:           local_workspace,                      /* workspace vector */
 2525:           sizeof(local_workspace)/sizeof(int),  /* size of same */
 2526:           rlevel);                              /* function recursion level */
 2527: 
 2528:         if (rc == PCRE_ERROR_DFA_UITEM) return rc;
 2529:         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
 2530:             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
 2531:         }
 2532:       break;
 2533: 
 2534:       /*-----------------------------------------------------------------*/
 2535:       case OP_COND:
 2536:       case OP_SCOND:
 2537:         {
 2538:         int local_offsets[1000];
 2539:         int local_workspace[1000];
 2540:         int codelink = GET(code, 1);
 2541:         int condcode;
 2542: 
 2543:         /* Because of the way auto-callout works during compile, a callout item
 2544:         is inserted between OP_COND and an assertion condition. This does not
 2545:         happen for the other conditions. */
 2546: 
 2547:         if (code[LINK_SIZE+1] == OP_CALLOUT)
 2548:           {
 2549:           rrc = 0;
 2550:           if (pcre_callout != NULL)
 2551:             {
 2552:             pcre_callout_block cb;
 2553:             cb.version          = 1;   /* Version 1 of the callout block */
 2554:             cb.callout_number   = code[LINK_SIZE+2];
 2555:             cb.offset_vector    = offsets;
 2556:             cb.subject          = (PCRE_SPTR)start_subject;
 2557:             cb.subject_length   = (int)(end_subject - start_subject);
 2558:             cb.start_match      = (int)(current_subject - start_subject);
 2559:             cb.current_position = (int)(ptr - start_subject);
 2560:             cb.pattern_position = GET(code, LINK_SIZE + 3);
 2561:             cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
 2562:             cb.capture_top      = 1;
 2563:             cb.capture_last     = -1;
 2564:             cb.callout_data     = md->callout_data;
 2565:             cb.mark             = NULL;   /* No (*MARK) support */
 2566:             if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
 2567:             }
 2568:           if (rrc > 0) break;                      /* Fail this thread */
 2569:           code += _pcre_OP_lengths[OP_CALLOUT];    /* Skip callout data */
 2570:           }
 2571: 
 2572:         condcode = code[LINK_SIZE+1];
 2573: 
 2574:         /* Back reference conditions are not supported */
 2575: 
 2576:         if (condcode == OP_CREF || condcode == OP_NCREF)
 2577:           return PCRE_ERROR_DFA_UCOND;
 2578: 
 2579:         /* The DEFINE condition is always false */
 2580: 
 2581:         if (condcode == OP_DEF)
 2582:           { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
 2583: 
 2584:         /* The only supported version of OP_RREF is for the value RREF_ANY,
 2585:         which means "test if in any recursion". We can't test for specifically
 2586:         recursed groups. */
 2587: 
 2588:         else if (condcode == OP_RREF || condcode == OP_NRREF)
 2589:           {
 2590:           int value = GET2(code, LINK_SIZE+2);
 2591:           if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
 2592:           if (md->recursive != NULL)
 2593:             { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
 2594:           else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
 2595:           }
 2596: 
 2597:         /* Otherwise, the condition is an assertion */
 2598: 
 2599:         else
 2600:           {
 2601:           int rc;
 2602:           const uschar *asscode = code + LINK_SIZE + 1;
 2603:           const uschar *endasscode = asscode + GET(asscode, 1);
 2604: 
 2605:           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
 2606: 
 2607:           rc = internal_dfa_exec(
 2608:             md,                                   /* fixed match data */
 2609:             asscode,                              /* this subexpression's code */
 2610:             ptr,                                  /* where we currently are */
 2611:             (int)(ptr - start_subject),           /* start offset */
 2612:             local_offsets,                        /* offset vector */
 2613:             sizeof(local_offsets)/sizeof(int),    /* size of same */
 2614:             local_workspace,                      /* workspace vector */
 2615:             sizeof(local_workspace)/sizeof(int),  /* size of same */
 2616:             rlevel);                              /* function recursion level */
 2617: 
 2618:           if (rc == PCRE_ERROR_DFA_UITEM) return rc;
 2619:           if ((rc >= 0) ==
 2620:                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
 2621:             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
 2622:           else
 2623:             { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
 2624:           }
 2625:         }
 2626:       break;
 2627: 
 2628:       /*-----------------------------------------------------------------*/
 2629:       case OP_RECURSE:
 2630:         {
 2631:         dfa_recursion_info *ri;
 2632:         int local_offsets[1000];
 2633:         int local_workspace[1000];
 2634:         const uschar *callpat = start_code + GET(code, 1);
 2635:         int recno = (callpat == md->start_code)? 0 :
 2636:           GET2(callpat, 1 + LINK_SIZE);
 2637:         int rc;
 2638: 
 2639:         DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
 2640: 
 2641:         /* Check for repeating a recursion without advancing the subject
 2642:         pointer. This should catch convoluted mutual recursions. (Some simple
 2643:         cases are caught at compile time.) */
 2644: 
 2645:         for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
 2646:           if (recno == ri->group_num && ptr == ri->subject_position)
 2647:             return PCRE_ERROR_RECURSELOOP;
 2648: 
 2649:         /* Remember this recursion and where we started it so as to
 2650:         catch infinite loops. */
 2651: 
 2652:         new_recursive.group_num = recno;
 2653:         new_recursive.subject_position = ptr;
 2654:         new_recursive.prevrec = md->recursive;
 2655:         md->recursive = &new_recursive;
 2656: 
 2657:         rc = internal_dfa_exec(
 2658:           md,                                   /* fixed match data */
 2659:           callpat,                              /* this subexpression's code */
 2660:           ptr,                                  /* where we currently are */
 2661:           (int)(ptr - start_subject),           /* start offset */
 2662:           local_offsets,                        /* offset vector */
 2663:           sizeof(local_offsets)/sizeof(int),    /* size of same */
 2664:           local_workspace,                      /* workspace vector */
 2665:           sizeof(local_workspace)/sizeof(int),  /* size of same */
 2666:           rlevel);                              /* function recursion level */
 2667: 
 2668:         md->recursive = new_recursive.prevrec;  /* Done this recursion */
 2669: 
 2670:         DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
 2671:           rc));
 2672: 
 2673:         /* Ran out of internal offsets */
 2674: 
 2675:         if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
 2676: 
 2677:         /* For each successful matched substring, set up the next state with a
 2678:         count of characters to skip before trying it. Note that the count is in
 2679:         characters, not bytes. */
 2680: 
 2681:         if (rc > 0)
 2682:           {
 2683:           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
 2684:             {
 2685:             const uschar *p = start_subject + local_offsets[rc];
 2686:             const uschar *pp = start_subject + local_offsets[rc+1];
 2687:             int charcount = local_offsets[rc+1] - local_offsets[rc];
 2688:             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
 2689:             if (charcount > 0)
 2690:               {
 2691:               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
 2692:               }
 2693:             else
 2694:               {
 2695:               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
 2696:               }
 2697:             }
 2698:           }
 2699:         else if (rc != PCRE_ERROR_NOMATCH) return rc;
 2700:         }
 2701:       break;
 2702: 
 2703:       /*-----------------------------------------------------------------*/
 2704:       case OP_BRAPOS:
 2705:       case OP_SBRAPOS:
 2706:       case OP_CBRAPOS:
 2707:       case OP_SCBRAPOS:
 2708:       case OP_BRAPOSZERO:
 2709:         {
 2710:         int charcount, matched_count;
 2711:         const uschar *local_ptr = ptr;
 2712:         BOOL allow_zero;
 2713: 
 2714:         if (codevalue == OP_BRAPOSZERO)
 2715:           {
 2716:           allow_zero = TRUE;
 2717:           codevalue = *(++code);  /* Codevalue will be one of above BRAs */
 2718:           }
 2719:         else allow_zero = FALSE;
 2720: 
 2721:         /* Loop to match the subpattern as many times as possible as if it were
 2722:         a complete pattern. */
 2723: 
 2724:         for (matched_count = 0;; matched_count++)
 2725:           {
 2726:           int local_offsets[2];
 2727:           int local_workspace[1000];
 2728: 
 2729:           int rc = internal_dfa_exec(
 2730:             md,                                   /* fixed match data */
 2731:             code,                                 /* this subexpression's code */
 2732:             local_ptr,                            /* where we currently are */
 2733:             (int)(ptr - start_subject),           /* start offset */
 2734:             local_offsets,                        /* offset vector */
 2735:             sizeof(local_offsets)/sizeof(int),    /* size of same */
 2736:             local_workspace,                      /* workspace vector */
 2737:             sizeof(local_workspace)/sizeof(int),  /* size of same */
 2738:             rlevel);                              /* function recursion level */
 2739: 
 2740:           /* Failed to match */
 2741: 
 2742:           if (rc < 0)
 2743:             {
 2744:             if (rc != PCRE_ERROR_NOMATCH) return rc;
 2745:             break;
 2746:             }
 2747: 
 2748:           /* Matched: break the loop if zero characters matched. */
 2749: 
 2750:           charcount = local_offsets[1] - local_offsets[0];
 2751:           if (charcount == 0) break;
 2752:           local_ptr += charcount;    /* Advance temporary position ptr */
 2753:           }
 2754: 
 2755:         /* At this point we have matched the subpattern matched_count
 2756:         times, and local_ptr is pointing to the character after the end of the
 2757:         last match. */
 2758: 
 2759:         if (matched_count > 0 || allow_zero)
 2760:           {
 2761:           const uschar *end_subpattern = code;
 2762:           int next_state_offset;
 2763: 
 2764:           do { end_subpattern += GET(end_subpattern, 1); }
 2765:             while (*end_subpattern == OP_ALT);
 2766:           next_state_offset =
 2767:             (int)(end_subpattern - start_code + LINK_SIZE + 1);
 2768: 
 2769:           /* Optimization: if there are no more active states, and there
 2770:           are no new states yet set up, then skip over the subject string
 2771:           right here, to save looping. Otherwise, set up the new state to swing
 2772:           into action when the end of the matched substring is reached. */
 2773: 
 2774:           if (i + 1 >= active_count && new_count == 0)
 2775:             {
 2776:             ptr = local_ptr;
 2777:             clen = 0;
 2778:             ADD_NEW(next_state_offset, 0);
 2779:             }
 2780:           else
 2781:             {
 2782:             const uschar *p = ptr;
 2783:             const uschar *pp = local_ptr;
 2784:             charcount = (int)(pp - p);
 2785:             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
 2786:             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
 2787:             }
 2788:           }
 2789:         }
 2790:       break;
 2791: 
 2792:       /*-----------------------------------------------------------------*/
 2793:       case OP_ONCE:
 2794:       case OP_ONCE_NC:
 2795:         {
 2796:         int local_offsets[2];
 2797:         int local_workspace[1000];
 2798: 
 2799:         int rc = internal_dfa_exec(
 2800:           md,                                   /* fixed match data */
 2801:           code,                                 /* this subexpression's code */
 2802:           ptr,                                  /* where we currently are */
 2803:           (int)(ptr - start_subject),           /* start offset */
 2804:           local_offsets,                        /* offset vector */
 2805:           sizeof(local_offsets)/sizeof(int),    /* size of same */
 2806:           local_workspace,                      /* workspace vector */
 2807:           sizeof(local_workspace)/sizeof(int),  /* size of same */
 2808:           rlevel);                              /* function recursion level */
 2809: 
 2810:         if (rc >= 0)
 2811:           {
 2812:           const uschar *end_subpattern = code;
 2813:           int charcount = local_offsets[1] - local_offsets[0];
 2814:           int next_state_offset, repeat_state_offset;
 2815: 
 2816:           do { end_subpattern += GET(end_subpattern, 1); }
 2817:             while (*end_subpattern == OP_ALT);
 2818:           next_state_offset =
 2819:             (int)(end_subpattern - start_code + LINK_SIZE + 1);
 2820: 
 2821:           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
 2822:           arrange for the repeat state also to be added to the relevant list.
 2823:           Calculate the offset, or set -1 for no repeat. */
 2824: 
 2825:           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
 2826:                                  *end_subpattern == OP_KETRMIN)?
 2827:             (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
 2828: 
 2829:           /* If we have matched an empty string, add the next state at the
 2830:           current character pointer. This is important so that the duplicate
 2831:           checking kicks in, which is what breaks infinite loops that match an
 2832:           empty string. */
 2833: 
 2834:           if (charcount == 0)
 2835:             {
 2836:             ADD_ACTIVE(next_state_offset, 0);
 2837:             }
 2838: 
 2839:           /* Optimization: if there are no more active states, and there
 2840:           are no new states yet set up, then skip over the subject string
 2841:           right here, to save looping. Otherwise, set up the new state to swing
 2842:           into action when the end of the matched substring is reached. */
 2843: 
 2844:           else if (i + 1 >= active_count && new_count == 0)
 2845:             {
 2846:             ptr += charcount;
 2847:             clen = 0;
 2848:             ADD_NEW(next_state_offset, 0);
 2849: 
 2850:             /* If we are adding a repeat state at the new character position,
 2851:             we must fudge things so that it is the only current state.
 2852:             Otherwise, it might be a duplicate of one we processed before, and
 2853:             that would cause it to be skipped. */
 2854: 
 2855:             if (repeat_state_offset >= 0)
 2856:               {
 2857:               next_active_state = active_states;
 2858:               active_count = 0;
 2859:               i = -1;
 2860:               ADD_ACTIVE(repeat_state_offset, 0);
 2861:               }
 2862:             }
 2863:           else
 2864:             {
 2865:             const uschar *p = start_subject + local_offsets[0];
 2866:             const uschar *pp = start_subject + local_offsets[1];
 2867:             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
 2868:             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
 2869:             if (repeat_state_offset >= 0)
 2870:               { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
 2871:             }
 2872:           }
 2873:         else if (rc != PCRE_ERROR_NOMATCH) return rc;
 2874:         }
 2875:       break;
 2876: 
 2877: 
 2878: /* ========================================================================== */
 2879:       /* Handle callouts */
 2880: 
 2881:       case OP_CALLOUT:
 2882:       rrc = 0;
 2883:       if (pcre_callout != NULL)
 2884:         {
 2885:         pcre_callout_block cb;
 2886:         cb.version          = 1;   /* Version 1 of the callout block */
 2887:         cb.callout_number   = code[1];
 2888:         cb.offset_vector    = offsets;
 2889:         cb.subject          = (PCRE_SPTR)start_subject;
 2890:         cb.subject_length   = (int)(end_subject - start_subject);
 2891:         cb.start_match      = (int)(current_subject - start_subject);
 2892:         cb.current_position = (int)(ptr - start_subject);
 2893:         cb.pattern_position = GET(code, 2);
 2894:         cb.next_item_length = GET(code, 2 + LINK_SIZE);
 2895:         cb.capture_top      = 1;
 2896:         cb.capture_last     = -1;
 2897:         cb.callout_data     = md->callout_data;
 2898:         cb.mark             = NULL;   /* No (*MARK) support */
 2899:         if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
 2900:         }
 2901:       if (rrc == 0)
 2902:         { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
 2903:       break;
 2904: 
 2905: 
 2906: /* ========================================================================== */
 2907:       default:        /* Unsupported opcode */
 2908:       return PCRE_ERROR_DFA_UITEM;
 2909:       }
 2910: 
 2911:     NEXT_ACTIVE_STATE: continue;
 2912: 
 2913:     }      /* End of loop scanning active states */
 2914: 
 2915:   /* We have finished the processing at the current subject character. If no
 2916:   new states have been set for the next character, we have found all the
 2917:   matches that we are going to find. If we are at the top level and partial
 2918:   matching has been requested, check for appropriate conditions.
 2919: 
 2920:   The "forced_fail" variable counts the number of (*F) encountered for the
 2921:   character. If it is equal to the original active_count (saved in
 2922:   workspace[1]) it means that (*F) was found on every active state. In this
 2923:   case we don't want to give a partial match.
 2924: 
 2925:   The "could_continue" variable is true if a state could have continued but
 2926:   for the fact that the end of the subject was reached. */
 2927: 
 2928:   if (new_count <= 0)
 2929:     {
 2930:     if (rlevel == 1 &&                               /* Top level, and */
 2931:         could_continue &&                            /* Some could go on */
 2932:         forced_fail != workspace[1] &&               /* Not all forced fail & */
 2933:         (                                            /* either... */
 2934:         (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
 2935:         ||                                           /* or... */
 2936:         ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
 2937:          match_count < 0)                            /* no matches */
 2938:         ) &&                                         /* And... */
 2939:         ptr >= end_subject &&                  /* Reached end of subject */
 2940:         ptr > md->start_used_ptr)              /* Inspected non-empty string */
 2941:       {
 2942:       if (offsetcount >= 2)
 2943:         {
 2944:         offsets[0] = (int)(md->start_used_ptr - start_subject);
 2945:         offsets[1] = (int)(end_subject - start_subject);
 2946:         }
 2947:       match_count = PCRE_ERROR_PARTIAL;
 2948:       }
 2949: 
 2950:     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
 2951:       "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
 2952:       rlevel*2-2, SP));
 2953:     break;        /* In effect, "return", but see the comment below */
 2954:     }
 2955: 
 2956:   /* One or more states are active for the next character. */
 2957: 
 2958:   ptr += clen;    /* Advance to next subject character */
 2959:   }               /* Loop to move along the subject string */
 2960: 
 2961: /* Control gets here from "break" a few lines above. We do it this way because
 2962: if we use "return" above, we have compiler trouble. Some compilers warn if
 2963: there's nothing here because they think the function doesn't return a value. On
 2964: the other hand, if we put a dummy statement here, some more clever compilers
 2965: complain that it can't be reached. Sigh. */
 2966: 
 2967: return match_count;
 2968: }
 2969: 
 2970: 
 2971: 
 2972: 
 2973: /*************************************************
 2974: *    Execute a Regular Expression - DFA engine   *
 2975: *************************************************/
 2976: 
 2977: /* This external function applies a compiled re to a subject string using a DFA
 2978: engine. This function calls the internal function multiple times if the pattern
 2979: is not anchored.
 2980: 
 2981: Arguments:
 2982:   argument_re     points to the compiled expression
 2983:   extra_data      points to extra data or is NULL
 2984:   subject         points to the subject string
 2985:   length          length of subject string (may contain binary zeros)
 2986:   start_offset    where to start in the subject string
 2987:   options         option bits
 2988:   offsets         vector of match offsets
 2989:   offsetcount     size of same
 2990:   workspace       workspace vector
 2991:   wscount         size of same
 2992: 
 2993: Returns:          > 0 => number of match offset pairs placed in offsets
 2994:                   = 0 => offsets overflowed; longest matches are present
 2995:                    -1 => failed to match
 2996:                  < -1 => some kind of unexpected problem
 2997: */
 2998: 
 2999: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
 3000: pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
 3001:   const char *subject, int length, int start_offset, int options, int *offsets,
 3002:   int offsetcount, int *workspace, int wscount)
 3003: {
 3004: real_pcre *re = (real_pcre *)argument_re;
 3005: dfa_match_data match_block;
 3006: dfa_match_data *md = &match_block;
 3007: BOOL utf8, anchored, startline, firstline;
 3008: const uschar *current_subject, *end_subject, *lcc;
 3009: 
 3010: pcre_study_data internal_study;
 3011: const pcre_study_data *study = NULL;
 3012: real_pcre internal_re;
 3013: 
 3014: const uschar *req_byte_ptr;
 3015: const uschar *start_bits = NULL;
 3016: BOOL first_byte_caseless = FALSE;
 3017: BOOL req_byte_caseless = FALSE;
 3018: int first_byte = -1;
 3019: int req_byte = -1;
 3020: int req_byte2 = -1;
 3021: int newline;
 3022: 
 3023: /* Plausibility checks */
 3024: 
 3025: if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
 3026: if (re == NULL || subject == NULL || workspace == NULL ||
 3027:    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
 3028: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
 3029: if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
 3030: if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
 3031: 
 3032: /* We need to find the pointer to any study data before we test for byte
 3033: flipping, so we scan the extra_data block first. This may set two fields in the
 3034: match block, so we must initialize them beforehand. However, the other fields
 3035: in the match block must not be set until after the byte flipping. */
 3036: 
 3037: md->tables = re->tables;
 3038: md->callout_data = NULL;
 3039: 
 3040: if (extra_data != NULL)
 3041:   {
 3042:   unsigned int flags = extra_data->flags;
 3043:   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
 3044:     study = (const pcre_study_data *)extra_data->study_data;
 3045:   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
 3046:   if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
 3047:     return PCRE_ERROR_DFA_UMLIMIT;
 3048:   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
 3049:     md->callout_data = extra_data->callout_data;
 3050:   if ((flags & PCRE_EXTRA_TABLES) != 0)
 3051:     md->tables = extra_data->tables;
 3052:   }
 3053: 
 3054: /* Check that the first field in the block is the magic number. If it is not,
 3055: test for a regex that was compiled on a host of opposite endianness. If this is
 3056: the case, flipped values are put in internal_re and internal_study if there was
 3057: study data too. */
 3058: 
 3059: if (re->magic_number != MAGIC_NUMBER)
 3060:   {
 3061:   re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
 3062:   if (re == NULL) return PCRE_ERROR_BADMAGIC;
 3063:   if (study != NULL) study = &internal_study;
 3064:   }
 3065: 
 3066: /* Set some local values */
 3067: 
 3068: current_subject = (const unsigned char *)subject + start_offset;
 3069: end_subject = (const unsigned char *)subject + length;
 3070: req_byte_ptr = current_subject - 1;
 3071: 
 3072: #ifdef SUPPORT_UTF8
 3073: utf8 = (re->options & PCRE_UTF8) != 0;
 3074: #else
 3075: utf8 = FALSE;
 3076: #endif
 3077: 
 3078: anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
 3079:   (re->options & PCRE_ANCHORED) != 0;
 3080: 
 3081: /* The remaining fixed data for passing around. */
 3082: 
 3083: md->start_code = (const uschar *)argument_re +
 3084:     re->name_table_offset + re->name_count * re->name_entry_size;
 3085: md->start_subject = (const unsigned char *)subject;
 3086: md->end_subject = end_subject;
 3087: md->start_offset = start_offset;
 3088: md->moptions = options;
 3089: md->poptions = re->options;
 3090: 
 3091: /* If the BSR option is not set at match time, copy what was set
 3092: at compile time. */
 3093: 
 3094: if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
 3095:   {
 3096:   if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
 3097:     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
 3098: #ifdef BSR_ANYCRLF
 3099:   else md->moptions |= PCRE_BSR_ANYCRLF;
 3100: #endif
 3101:   }
 3102: 
 3103: /* Handle different types of newline. The three bits give eight cases. If
 3104: nothing is set at run time, whatever was used at compile time applies. */
 3105: 
 3106: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
 3107:          PCRE_NEWLINE_BITS)
 3108:   {
 3109:   case 0: newline = NEWLINE; break;   /* Compile-time default */
 3110:   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
 3111:   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
 3112:   case PCRE_NEWLINE_CR+
 3113:        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
 3114:   case PCRE_NEWLINE_ANY: newline = -1; break;
 3115:   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
 3116:   default: return PCRE_ERROR_BADNEWLINE;
 3117:   }
 3118: 
 3119: if (newline == -2)
 3120:   {
 3121:   md->nltype = NLTYPE_ANYCRLF;
 3122:   }
 3123: else if (newline < 0)
 3124:   {
 3125:   md->nltype = NLTYPE_ANY;
 3126:   }
 3127: else
 3128:   {
 3129:   md->nltype = NLTYPE_FIXED;
 3130:   if (newline > 255)
 3131:     {
 3132:     md->nllen = 2;
 3133:     md->nl[0] = (newline >> 8) & 255;
 3134:     md->nl[1] = newline & 255;
 3135:     }
 3136:   else
 3137:     {
 3138:     md->nllen = 1;
 3139:     md->nl[0] = newline;
 3140:     }
 3141:   }
 3142: 
 3143: /* Check a UTF-8 string if required. If it is invalid and offsetcount >= 2, the
 3144: error offset and error code are passed back in offsets[0] and offsets[1]. */
 3145: 
 3146: #ifdef SUPPORT_UTF8
 3147: if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
 3148:   {
 3149:   int erroroffset;
 3150:   int errorcode = _pcre_valid_utf8((uschar *)subject, length, &erroroffset);
 3151:   if (errorcode != 0)
 3152:     {
 3153:     if (offsetcount >= 2)
 3154:       {
 3155:       offsets[0] = erroroffset;
 3156:       offsets[1] = errorcode;
 3157:       }
 3158:     return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
 3159:       PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
 3160:     }
 3161:   if (start_offset > 0 && start_offset < length &&
 3162:         (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
 3163:     return PCRE_ERROR_BADUTF8_OFFSET;
 3164:   }
 3165: #endif
 3166: 
 3167: /* If the exec call supplied NULL for tables, use the inbuilt ones. This
 3168: is a feature that makes it possible to save compiled regex and re-use them
 3169: in other programs later. */
 3170: 
 3171: if (md->tables == NULL) md->tables = _pcre_default_tables;
 3172: 
 3173: /* The lower casing table and the "must be at the start of a line" flag are
 3174: used in a loop when finding where to start. */
 3175: 
 3176: lcc = md->tables + lcc_offset;
 3177: startline = (re->flags & PCRE_STARTLINE) != 0;
 3178: firstline = (re->options & PCRE_FIRSTLINE) != 0;
 3179: 
 3180: /* Set up the first character to match, if available. The first_byte value is
 3181: never set for an anchored regular expression, but the anchoring may be forced
 3182: at run time, so we have to test for anchoring. The first char may be unset for
 3183: an unanchored pattern, of course. If there's no first char and the pattern was
 3184: studied, there may be a bitmap of possible first characters. */
 3185: 
 3186: if (!anchored)
 3187:   {
 3188:   if ((re->flags & PCRE_FIRSTSET) != 0)
 3189:     {
 3190:     first_byte = re->first_byte & 255;
 3191:     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
 3192:       first_byte = lcc[first_byte];
 3193:     }
 3194:   else
 3195:     {
 3196:     if (!startline && study != NULL &&
 3197:          (study->flags & PCRE_STUDY_MAPPED) != 0)
 3198:       start_bits = study->start_bits;
 3199:     }
 3200:   }
 3201: 
 3202: /* For anchored or unanchored matches, there may be a "last known required
 3203: character" set. */
 3204: 
 3205: if ((re->flags & PCRE_REQCHSET) != 0)
 3206:   {
 3207:   req_byte = re->req_byte & 255;
 3208:   req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
 3209:   req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */
 3210:   }
 3211: 
 3212: /* Call the main matching function, looping for a non-anchored regex after a
 3213: failed match. If not restarting, perform certain optimizations at the start of
 3214: a match. */
 3215: 
 3216: for (;;)
 3217:   {
 3218:   int rc;
 3219: 
 3220:   if ((options & PCRE_DFA_RESTART) == 0)
 3221:     {
 3222:     const uschar *save_end_subject = end_subject;
 3223: 
 3224:     /* If firstline is TRUE, the start of the match is constrained to the first
 3225:     line of a multiline string. Implement this by temporarily adjusting
 3226:     end_subject so that we stop scanning at a newline. If the match fails at
 3227:     the newline, later code breaks this loop. */
 3228: 
 3229:     if (firstline)
 3230:       {
 3231:       USPTR t = current_subject;
 3232: #ifdef SUPPORT_UTF8
 3233:       if (utf8)
 3234:         {
 3235:         while (t < md->end_subject && !IS_NEWLINE(t))
 3236:           {
 3237:           t++;
 3238:           while (t < end_subject && (*t & 0xc0) == 0x80) t++;
 3239:           }
 3240:         }
 3241:       else
 3242: #endif
 3243:       while (t < md->end_subject && !IS_NEWLINE(t)) t++;
 3244:       end_subject = t;
 3245:       }
 3246: 
 3247:     /* There are some optimizations that avoid running the match if a known
 3248:     starting point is not found. However, there is an option that disables
 3249:     these, for testing and for ensuring that all callouts do actually occur.
 3250:     The option can be set in the regex by (*NO_START_OPT) or passed in
 3251:     match-time options. */
 3252: 
 3253:     if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
 3254:       {
 3255:       /* Advance to a known first byte. */
 3256: 
 3257:       if (first_byte >= 0)
 3258:         {
 3259:         if (first_byte_caseless)
 3260:           while (current_subject < end_subject &&
 3261:                  lcc[*current_subject] != first_byte)
 3262:             current_subject++;
 3263:         else
 3264:           while (current_subject < end_subject &&
 3265:                  *current_subject != first_byte)
 3266:             current_subject++;
 3267:         }
 3268: 
 3269:       /* Or to just after a linebreak for a multiline match if possible */
 3270: 
 3271:       else if (startline)
 3272:         {
 3273:         if (current_subject > md->start_subject + start_offset)
 3274:           {
 3275: #ifdef SUPPORT_UTF8
 3276:           if (utf8)
 3277:             {
 3278:             while (current_subject < end_subject &&
 3279:                    !WAS_NEWLINE(current_subject))
 3280:               {
 3281:               current_subject++;
 3282:               while(current_subject < end_subject &&
 3283:                     (*current_subject & 0xc0) == 0x80)
 3284:                 current_subject++;
 3285:               }
 3286:             }
 3287:           else
 3288: #endif
 3289:           while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
 3290:             current_subject++;
 3291: 
 3292:           /* If we have just passed a CR and the newline option is ANY or
 3293:           ANYCRLF, and we are now at a LF, advance the match position by one
 3294:           more character. */
 3295: 
 3296:           if (current_subject[-1] == CHAR_CR &&
 3297:                (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
 3298:                current_subject < end_subject &&
 3299:                *current_subject == CHAR_NL)
 3300:             current_subject++;
 3301:           }
 3302:         }
 3303: 
 3304:       /* Or to a non-unique first char after study */
 3305: 
 3306:       else if (start_bits != NULL)
 3307:         {
 3308:         while (current_subject < end_subject)
 3309:           {
 3310:           register unsigned int c = *current_subject;
 3311:           if ((start_bits[c/8] & (1 << (c&7))) == 0)
 3312:             {
 3313:             current_subject++;
 3314: #ifdef SUPPORT_UTF8
 3315:             if (utf8)
 3316:               while(current_subject < end_subject &&
 3317:                     (*current_subject & 0xc0) == 0x80) current_subject++;
 3318: #endif
 3319:             }
 3320:           else break;
 3321:           }
 3322:         }
 3323:       }
 3324: 
 3325:     /* Restore fudged end_subject */
 3326: 
 3327:     end_subject = save_end_subject;
 3328: 
 3329:     /* The following two optimizations are disabled for partial matching or if
 3330:     disabling is explicitly requested (and of course, by the test above, this
 3331:     code is not obeyed when restarting after a partial match). */
 3332: 
 3333:     if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
 3334:         (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
 3335:       {
 3336:       /* If the pattern was studied, a minimum subject length may be set. This
 3337:       is a lower bound; no actual string of that length may actually match the
 3338:       pattern. Although the value is, strictly, in characters, we treat it as
 3339:       bytes to avoid spending too much time in this optimization. */
 3340: 
 3341:       if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
 3342:           (pcre_uint32)(end_subject - current_subject) < study->minlength)
 3343:         return PCRE_ERROR_NOMATCH;
 3344: 
 3345:       /* If req_byte is set, we know that that character must appear in the
 3346:       subject for the match to succeed. If the first character is set, req_byte
 3347:       must be later in the subject; otherwise the test starts at the match
 3348:       point. This optimization can save a huge amount of work in patterns with
 3349:       nested unlimited repeats that aren't going to match. Writing separate
 3350:       code for cased/caseless versions makes it go faster, as does using an
 3351:       autoincrement and backing off on a match.
 3352: 
 3353:       HOWEVER: when the subject string is very, very long, searching to its end
 3354:       can take a long time, and give bad performance on quite ordinary
 3355:       patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
 3356:       string... so we don't do this when the string is sufficiently long. */
 3357: 
 3358:       if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
 3359:         {
 3360:         register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
 3361: 
 3362:         /* We don't need to repeat the search if we haven't yet reached the
 3363:         place we found it at last time. */
 3364: 
 3365:         if (p > req_byte_ptr)
 3366:           {
 3367:           if (req_byte_caseless)
 3368:             {
 3369:             while (p < end_subject)
 3370:               {
 3371:               register int pp = *p++;
 3372:               if (pp == req_byte || pp == req_byte2) { p--; break; }
 3373:               }
 3374:             }
 3375:           else
 3376:             {
 3377:             while (p < end_subject)
 3378:               {
 3379:               if (*p++ == req_byte) { p--; break; }
 3380:               }
 3381:             }
 3382: 
 3383:           /* If we can't find the required character, break the matching loop,
 3384:           which will cause a return of PCRE_ERROR_NOMATCH. */
 3385: 
 3386:           if (p >= end_subject) break;
 3387: 
 3388:           /* If we have found the required character, save the point where we
 3389:           found it, so that we don't search again next time round the loop if
 3390:           the start hasn't passed this character yet. */
 3391: 
 3392:           req_byte_ptr = p;
 3393:           }
 3394:         }
 3395:       }
 3396:     }   /* End of optimizations that are done when not restarting */
 3397: 
 3398:   /* OK, now we can do the business */
 3399: 
 3400:   md->start_used_ptr = current_subject;
 3401:   md->recursive = NULL;
 3402: 
 3403:   rc = internal_dfa_exec(
 3404:     md,                                /* fixed match data */
 3405:     md->start_code,                    /* this subexpression's code */
 3406:     current_subject,                   /* where we currently are */
 3407:     start_offset,                      /* start offset in subject */
 3408:     offsets,                           /* offset vector */
 3409:     offsetcount,                       /* size of same */
 3410:     workspace,                         /* workspace vector */
 3411:     wscount,                           /* size of same */
 3412:     0);                                /* function recurse level */
 3413: 
 3414:   /* Anything other than "no match" means we are done, always; otherwise, carry
 3415:   on only if not anchored. */
 3416: 
 3417:   if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
 3418: 
 3419:   /* Advance to the next subject character unless we are at the end of a line
 3420:   and firstline is set. */
 3421: 
 3422:   if (firstline && IS_NEWLINE(current_subject)) break;
 3423:   current_subject++;
 3424:   if (utf8)
 3425:     {
 3426:     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
 3427:       current_subject++;
 3428:     }
 3429:   if (current_subject > end_subject) break;
 3430: 
 3431:   /* If we have just passed a CR and we are now at a LF, and the pattern does
 3432:   not contain any explicit matches for \r or \n, and the newline option is CRLF
 3433:   or ANY or ANYCRLF, advance the match position by one more character. */
 3434: 
 3435:   if (current_subject[-1] == CHAR_CR &&
 3436:       current_subject < end_subject &&
 3437:       *current_subject == CHAR_NL &&
 3438:       (re->flags & PCRE_HASCRORLF) == 0 &&
 3439:         (md->nltype == NLTYPE_ANY ||
 3440:          md->nltype == NLTYPE_ANYCRLF ||
 3441:          md->nllen == 2))
 3442:     current_subject++;
 3443: 
 3444:   }   /* "Bumpalong" loop */
 3445: 
 3446: return PCRE_ERROR_NOMATCH;
 3447: }
 3448: 
 3449: /* End of pcre_dfa_exec.c */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>