File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / pcre / pcre_dfa_exec.c
Revision 1.1.1.4 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Mon Jul 22 08:25:55 2013 UTC (11 years ago) by misho
Branches: pcre, MAIN
CVS tags: v8_33, HEAD
8.33

    1: /*************************************************
    2: *      Perl-Compatible Regular Expressions       *
    3: *************************************************/
    4: 
    5: /* PCRE is a library of functions to support regular expressions whose syntax
    6: and semantics are as close as possible to those of the Perl 5 language (but see
    7: below for why this module is different).
    8: 
    9:                        Written by Philip Hazel
   10:            Copyright (c) 1997-2013 University of Cambridge
   11: 
   12: -----------------------------------------------------------------------------
   13: Redistribution and use in source and binary forms, with or without
   14: modification, are permitted provided that the following conditions are met:
   15: 
   16:     * Redistributions of source code must retain the above copyright notice,
   17:       this list of conditions and the following disclaimer.
   18: 
   19:     * Redistributions in binary form must reproduce the above copyright
   20:       notice, this list of conditions and the following disclaimer in the
   21:       documentation and/or other materials provided with the distribution.
   22: 
   23:     * Neither the name of the University of Cambridge nor the names of its
   24:       contributors may be used to endorse or promote products derived from
   25:       this software without specific prior written permission.
   26: 
   27: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   28: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   29: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   30: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
   31: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   32: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   33: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   34: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   35: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   36: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   37: POSSIBILITY OF SUCH DAMAGE.
   38: -----------------------------------------------------------------------------
   39: */
   40: 
   41: /* This module contains the external function pcre_dfa_exec(), which is an
   42: alternative matching function that uses a sort of DFA algorithm (not a true
   43: FSM). This is NOT Perl-compatible, but it has advantages in certain
   44: applications. */
   45: 
   46: 
   47: /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
   48: the performance of his patterns greatly. I could not use it as it stood, as it
   49: was not thread safe, and made assumptions about pattern sizes. Also, it caused
   50: test 7 to loop, and test 9 to crash with a segfault.
   51: 
   52: The issue is the check for duplicate states, which is done by a simple linear
   53: search up the state list. (Grep for "duplicate" below to find the code.) For
   54: many patterns, there will never be many states active at one time, so a simple
   55: linear search is fine. In patterns that have many active states, it might be a
   56: bottleneck. The suggested code used an indexing scheme to remember which states
   57: had previously been used for each character, and avoided the linear search when
   58: it knew there was no chance of a duplicate. This was implemented when adding
   59: states to the state lists.
   60: 
   61: I wrote some thread-safe, not-limited code to try something similar at the time
   62: of checking for duplicates (instead of when adding states), using index vectors
   63: on the stack. It did give a 13% improvement with one specially constructed
   64: pattern for certain subject strings, but on other strings and on many of the
   65: simpler patterns in the test suite it did worse. The major problem, I think,
   66: was the extra time to initialize the index. This had to be done for each call
   67: of internal_dfa_exec(). (The supplied patch used a static vector, initialized
   68: only once - I suspect this was the cause of the problems with the tests.)
   69: 
   70: Overall, I concluded that the gains in some cases did not outweigh the losses
   71: in others, so I abandoned this code. */
   72: 
   73: 
   74: 
   75: #ifdef HAVE_CONFIG_H
   76: #include "config.h"
   77: #endif
   78: 
   79: #define NLBLOCK md             /* Block containing newline information */
   80: #define PSSTART start_subject  /* Field containing processed string start */
   81: #define PSEND   end_subject    /* Field containing processed string end */
       /* "md" is the name internal_dfa_exec() gives its dfa_match_data argument, and
       start_subject / end_subject are fields it reads from that block. The three
       names are fixed here, ahead of the pcre_internal.h include. NOTE(review):
       presumably helper macros in that header expand NLBLOCK, PSSTART and PSEND -
       confirm there. */
   82: 
   83: #include "pcre_internal.h"
   84: 
   85: 
   86: /* For use to indent debugging output */
   87: 
   88: #define SP "                   "
       /* Debug messages print this with "%.*s" and a precision of rlevel*2-2, so
       each recursion level indents two more columns, up to the length of SP. */
   89: 
   90: 
   91: /*************************************************
   92: *      Code parameters and static tables         *
   93: *************************************************/
   94: 
   95: /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
   96: into others, under special conditions. A gap of 20 between the blocks should be
   97: enough. The resulting opcodes don't have to be less than 256 because they are
   98: never stored, so we push them well clear of the normal opcodes. */
   99: 
  100: #define OP_PROP_EXTRA       300
  101: #define OP_EXTUNI_EXTRA     320
  102: #define OP_ANYNL_EXTRA      340
  103: #define OP_HSPACE_EXTRA     360
  104: #define OP_VSPACE_EXTRA     380
       /* The bases start at 300 and step by 20, so all five lie above 255.
       NOTE(review): judging by the names, the five blocks are for the property
       (\P, \p), \X, \R, horizontal-space and vertical-space item types
       respectively - confirm where the offsets are applied. */
  105: 
  106: 
  107: /* This table identifies those opcodes that are followed immediately by a
  108: character that is to be tested in some way. This makes it possible to
  109: centralize the loading of these characters. In the case of Type * etc, the
  110: "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
  111: small value. Non-zero values in the table are the offsets from the opcode where
  112: the character is to be found. ***NOTE*** If the start of this table is
  113: modified, the three tables that follow must also be modified. */
  114: 
  115: static const pcre_uint8 coptable[] = {
         /* Rows are laid out in opcode order; the comment at the right of each
         row names the opcodes it covers. NOTE(review): presumably indexed by
         the opcode value, as poptable is - confirm at the use site. An entry
         of 0 means the opcode has no inline character to pre-load. A non-zero
         entry is the offset from the opcode at which that character is found:
         1 when it comes straight after the opcode, 1+IMM2_SIZE when an
         IMM2_SIZE-wide field comes first (NOTE(review): presumably the repeat
         count of the upto/minupto/exact forms - confirm against the compiler). */
  116:   0,                             /* End                                    */
  117:   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
  118:   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
  119:   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
  120:   0, 0,                          /* \P, \p                                 */
  121:   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
  122:   0,                             /* \X                                     */
  123:   0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
  124:   1,                             /* Char                                   */
  125:   1,                             /* Chari                                  */
  126:   1,                             /* not                                    */
  127:   1,                             /* noti                                   */
  128:   /* Positive single-char repeats                                          */
  129:   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  130:   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
  131:   1+IMM2_SIZE,                   /* exact                                  */
  132:   1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
  133:   1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
  134:   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
  135:   1+IMM2_SIZE,                   /* exact I                                */
  136:   1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
  137:   /* Negative single-char repeats - only for chars < 256                   */
  138:   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  139:   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
  140:   1+IMM2_SIZE,                   /* NOT exact                              */
  141:   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
  142:   1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
  143:   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
  144:   1+IMM2_SIZE,                   /* NOT exact I                            */
  145:   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
  146:   /* Positive type repeats                                                 */
  147:   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  148:   1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
  149:   1+IMM2_SIZE,                   /* Type exact                             */
  150:   1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
  151:   /* Character class & ref repeats                                         */
  152:   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
  153:   0, 0,                          /* CRRANGE, CRMINRANGE                    */
  154:   0,                             /* CLASS                                  */
  155:   0,                             /* NCLASS                                 */
  156:   0,                             /* XCLASS - variable length               */
  157:   0,                             /* REF                                    */
  158:   0,                             /* REFI                                   */
  159:   0,                             /* RECURSE                                */
  160:   0,                             /* CALLOUT                                */
  161:   0,                             /* Alt                                    */
  162:   0,                             /* Ket                                    */
  163:   0,                             /* KetRmax                                */
  164:   0,                             /* KetRmin                                */
  165:   0,                             /* KetRpos                                */
  166:   0,                             /* Reverse                                */
  167:   0,                             /* Assert                                 */
  168:   0,                             /* Assert not                             */
  169:   0,                             /* Assert behind                          */
  170:   0,                             /* Assert behind not                      */
  171:   0, 0,                          /* ONCE, ONCE_NC                          */
  172:   0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  173:   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  174:   0, 0,                          /* CREF, NCREF                            */
  175:   0, 0,                          /* RREF, NRREF                            */
  176:   0,                             /* DEF                                    */
  177:   0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
  178:   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
  179:   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
  180:   0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
  181:   0, 0                           /* CLOSE, SKIPZERO                        */
  182: };
  183: 
  184: /* This table identifies those opcodes that inspect a character. It is used to
  185: remember the fact that a character could have been inspected when the end of
  186: the subject is reached. ***NOTE*** If the start of this table is modified, the
  187: two tables that follow must also be modified. */
  188: 
  189: static const pcre_uint8 poptable[] = {
         /* Looked up with the opcode value of the state being processed. When
         the end of the subject has been reached (there is no current
         character) and the entry is non-zero, internal_dfa_exec() sets
         could_continue, recording that this state wanted to inspect a
         character that was not there. Rows are in opcode order; the comment at
         the right of each row names the opcodes it covers. */
  190:   0,                             /* End                                    */
  191:   0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
  192:   1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
  193:   1, 1, 1,                       /* Any, AllAny, Anybyte                   */
  194:   1, 1,                          /* \P, \p                                 */
  195:   1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
  196:   1,                             /* \X                                     */
  197:   0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
  198:   1,                             /* Char                                   */
  199:   1,                             /* Chari                                  */
  200:   1,                             /* not                                    */
  201:   1,                             /* noti                                   */
  202:   /* Positive single-char repeats                                          */
  203:   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  204:   1, 1, 1,                       /* upto, minupto, exact                   */
  205:   1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
  206:   1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
  207:   1, 1, 1,                       /* upto I, minupto I, exact I             */
  208:   1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
  209:   /* Negative single-char repeats - only for chars < 256                   */
  210:   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  211:   1, 1, 1,                       /* NOT upto, minupto, exact               */
  212:   1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
  213:   1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
  214:   1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
  215:   1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
  216:   /* Positive type repeats                                                 */
  217:   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  218:   1, 1, 1,                       /* Type upto, minupto, exact              */
  219:   1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
  220:   /* Character class & ref repeats                                         */
  221:   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  222:   1, 1,                          /* CRRANGE, CRMINRANGE                    */
  223:   1,                             /* CLASS                                  */
  224:   1,                             /* NCLASS                                 */
  225:   1,                             /* XCLASS - variable length               */
  226:   0,                             /* REF                                    */
  227:   0,                             /* REFI                                   */
  228:   0,                             /* RECURSE                                */
  229:   0,                             /* CALLOUT                                */
  230:   0,                             /* Alt                                    */
  231:   0,                             /* Ket                                    */
  232:   0,                             /* KetRmax                                */
  233:   0,                             /* KetRmin                                */
  234:   0,                             /* KetRpos                                */
  235:   0,                             /* Reverse                                */
  236:   0,                             /* Assert                                 */
  237:   0,                             /* Assert not                             */
  238:   0,                             /* Assert behind                          */
  239:   0,                             /* Assert behind not                      */
  240:   0, 0,                          /* ONCE, ONCE_NC                          */
  241:   0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  242:   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  243:   0, 0,                          /* CREF, NCREF                            */
  244:   0, 0,                          /* RREF, NRREF                            */
  245:   0,                             /* DEF                                    */
  246:   0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
  247:   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
  248:   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
  249:   0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
  250:   0, 0                           /* CLOSE, SKIPZERO                        */
  251: };
  252: 
  253: /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
  254: and \w */
  255: 
  256: static const pcre_uint8 toptable1[] = {
         /* Positions follow the opcode order used by the tables above: six zero
         entries, then \D, \d, \S, \s, \W, \w, then OP_ANY and OP_ALLANY. Each
         \D..\w entry holds the ctype bit for its class, the same bit for the
         negated and the plain form. NOTE(review): this looks like the mask
         half of a test of the shape (ctypes[c] & toptable1[op]) ^ toptable2[op]
         - confirm at the use sites. */
  257:   0, 0, 0, 0, 0, 0,
  258:   ctype_digit, ctype_digit,
  259:   ctype_space, ctype_space,
  260:   ctype_word,  ctype_word,
  261:   0, 0                            /* OP_ANY, OP_ALLANY */
  262: };
  263: 
  264: static const pcre_uint8 toptable2[] = {
         /* Companion to toptable1, with the same layout. The first entry of each
         pair (\D, \S, \W in the opcode order of the tables above) repeats the
         ctype bit and the second (\d, \s, \w) is 0; OP_ANY and OP_ALLANY hold 1.
         NOTE(review): presumably XORed with the masked ctype value so that the
         negated classes invert the test and the "any" opcodes always pass -
         confirm at the use sites. */
  265:   0, 0, 0, 0, 0, 0,
  266:   ctype_digit, 0,
  267:   ctype_space, 0,
  268:   ctype_word,  0,
  269:   1, 1                            /* OP_ANY, OP_ALLANY */
  270: };
  271: 
  272: 
  273: /* Structure for holding data about a particular state, which is in effect the
  274: current data for an active path through the match tree. It must consist
  275: entirely of ints because the working vector we are passed, and which we put
  276: these structures in, is a vector of ints. */
  277: 
  278: typedef struct stateblock {
         /* offset: added to start_code to locate the opcode for this state. A
         negative value marks a state that is being held back: it is re-queued
         with data reduced by one for each subject character until data is no
         longer positive, and then the offset is negated and the state is
         processed normally. */
  279:   int offset;                     /* Offset to opcode */
         /* count: repeat counter. Two active states are treated as duplicates
         only when both their offset and their count are equal. */
  280:   int count;                      /* Count for repeats */
         /* data: extra per-state value. For held-back (negative offset) states
         it is the number of characters still to be skipped. */
  281:   int data;                       /* Some use extra data */
  282: } stateblock;
  283: 
         /* Size of one stateblock measured in ints. internal_dfa_exec() uses it to
         work out how many states fit in each of the two state lists that share
         the int workspace vector. */
  284: #define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
  285: 
  286: 
  287: #ifdef PCRE_DEBUG
  288: /*************************************************
  289: *             Print character string             *
  290: *************************************************/
  291: 
  292: /* Character string printing function for debugging. Each code unit that is
  293: below 256 and printable is written as itself; any other is written as \x{hh}.
  294: Arguments:
  295:   p            points to string
  296:   length       number of code units (pcre_uchar items) to print
  297:   f            where to print
  298: 
  299: Returns:       nothing
  300: */
  301: 
  302: static void
  303: pchars(const pcre_uchar *p, int length, FILE *f)
  304: {
  305: pcre_uint32 c;
  306: while (length-- > 0)
  307:   {
  308:   if ((c = *(p++)) < 256 && isprint((int)c))  /* isprint() is undefined above UCHAR_MAX */
  309:     fprintf(f, "%c", (int)c);                 /* %c takes an int argument */
  310:   else
  311:     fprintf(f, "\\x{%02x}", (unsigned int)c); /* %x takes an unsigned int argument */
  312:   }
  313: }
  314: #endif
  315: 
  316: 
  317: 
  318: /*************************************************
  319: *    Execute a Regular Expression - DFA engine   *
  320: *************************************************/
  321: 
  322: /* This internal function applies a compiled pattern to a subject string,
  323: starting at a given point, using a DFA engine. This function is called from the
  324: external one, possibly multiple times if the pattern is not anchored. The
  325: function calls itself recursively for some kinds of subpattern.
  326: 
  327: Arguments:
  328:   md                the match_data block with fixed information
  329:   this_start_code   the opening bracket of this subexpression's code
  330:   current_subject   where we currently are in the subject string
  331:   start_offset      start offset in the subject string
  332:   offsets           vector to contain the matching string offsets
  333:   offsetcount       size of same
  334:   workspace         vector of workspace
  335:   wscount           size of same
  336:   rlevel            function call recursion level
  337: 
  338: Returns:            > 0 => number of match offset pairs placed in offsets
  339:                     = 0 => offsets overflowed; longest matches are present
  340:                      -1 => failed to match
  341:                    < -1 => some kind of unexpected problem
  342: 
  343: The following macros are used for adding states to the two state vectors (one
  344: for the current character, one for the following character). */
  345: 
  346: #define ADD_ACTIVE(x,y) \
  347:   if (active_count++ < wscount) \
  348:     { \
  349:     next_active_state->offset = (x); \
  350:     next_active_state->count  = (y); \
  351:     next_active_state++; \
  352:     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
  353:     } \
  354:   else return PCRE_ERROR_DFA_WSSIZE
  355: 
  356: #define ADD_ACTIVE_DATA(x,y,z) \
  357:   if (active_count++ < wscount) \
  358:     { \
  359:     next_active_state->offset = (x); \
  360:     next_active_state->count  = (y); \
  361:     next_active_state->data   = (z); \
  362:     next_active_state++; \
  363:     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
  364:     } \
  365:   else return PCRE_ERROR_DFA_WSSIZE
  366: 
  367: #define ADD_NEW(x,y) \
  368:   if (new_count++ < wscount) \
  369:     { \
  370:     next_new_state->offset = (x); \
  371:     next_new_state->count  = (y); \
  372:     next_new_state++; \
  373:     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
  374:     } \
  375:   else return PCRE_ERROR_DFA_WSSIZE
  376: 
  377: #define ADD_NEW_DATA(x,y,z) \
  378:   if (new_count++ < wscount) \
  379:     { \
  380:     next_new_state->offset = (x); \
  381:     next_new_state->count  = (y); \
  382:     next_new_state->data   = (z); \
  383:     next_new_state++; \
  384:     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
  385:       (x), (y), (z), __LINE__)); \
  386:     } \
  387:   else return PCRE_ERROR_DFA_WSSIZE
  388: 
  389: /* And now, here is the code */
  390: 
  391: static int
  392: internal_dfa_exec(
  393:   dfa_match_data *md,
  394:   const pcre_uchar *this_start_code,
  395:   const pcre_uchar *current_subject,
  396:   int start_offset,
  397:   int *offsets,
  398:   int offsetcount,
  399:   int *workspace,
  400:   int wscount,
  401:   int  rlevel)
  402: {
  403: stateblock *active_states, *new_states, *temp_states;
  404: stateblock *next_active_state, *next_new_state;
  405: 
  406: const pcre_uint8 *ctypes, *lcc, *fcc;
  407: const pcre_uchar *ptr;
  408: const pcre_uchar *end_code, *first_op;
  409: 
  410: dfa_recursion_info new_recursive;
  411: 
  412: int active_count, new_count, match_count;
  413: 
  414: /* Some fields in the md block are frequently referenced, so we load them into
  415: independent variables in the hope that this will perform better. */
  416: 
  417: const pcre_uchar *start_subject = md->start_subject;
  418: const pcre_uchar *end_subject = md->end_subject;
  419: const pcre_uchar *start_code = md->start_code;
  420: 
  421: #ifdef SUPPORT_UTF
  422: BOOL utf = (md->poptions & PCRE_UTF8) != 0;
  423: #else
  424: BOOL utf = FALSE;
  425: #endif
  426: 
  427: BOOL reset_could_continue = FALSE;
  428: 
  429: rlevel++;
  430: offsetcount &= (-2);
  431: 
  432: wscount -= 2;
  433: wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
  434:           (2 * INTS_PER_STATEBLOCK);
  435: 
  436: DPRINTF(("\n%.*s---------------------\n"
  437:   "%.*sCall to internal_dfa_exec f=%d\n",
  438:   rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
  439: 
  440: ctypes = md->tables + ctypes_offset;
  441: lcc = md->tables + lcc_offset;
  442: fcc = md->tables + fcc_offset;
  443: 
  444: match_count = PCRE_ERROR_NOMATCH;   /* A negative number */
  445: 
  446: active_states = (stateblock *)(workspace + 2);
  447: next_new_state = new_states = active_states + wscount;
  448: new_count = 0;
  449: 
  450: first_op = this_start_code + 1 + LINK_SIZE +
  451:   ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
  452:     *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
  453:     ? IMM2_SIZE:0);
  454: 
  455: /* The first thing in any (sub) pattern is a bracket of some sort. Push all
  456: the alternative states onto the list, and find out where the end is. This
  457: makes it possible to use this function recursively, when we want to stop at a
  458: matching internal ket rather than at the end.
  459: 
  460: If the first opcode in the first alternative is OP_REVERSE, we are dealing with
  461: a backward assertion. In that case, we have to find out the maximum amount to
  462: move back, and set up each alternative appropriately. */
  463: 
  464: if (*first_op == OP_REVERSE)
  465:   {
  466:   int max_back = 0;
  467:   int gone_back;
  468: 
  469:   end_code = this_start_code;
  470:   do
  471:     {
  472:     int back = GET(end_code, 2+LINK_SIZE);
  473:     if (back > max_back) max_back = back;
  474:     end_code += GET(end_code, 1);
  475:     }
  476:   while (*end_code == OP_ALT);
  477: 
  478:   /* If we can't go back the amount required for the longest lookbehind
  479:   pattern, go back as far as we can; some alternatives may still be viable. */
  480: 
  481: #ifdef SUPPORT_UTF
  482:   /* In character mode we have to step back character by character */
  483: 
  484:   if (utf)
  485:     {
  486:     for (gone_back = 0; gone_back < max_back; gone_back++)
  487:       {
  488:       if (current_subject <= start_subject) break;
  489:       current_subject--;
  490:       ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
  491:       }
  492:     }
  493:   else
  494: #endif
  495: 
  496:   /* In byte-mode we can do this quickly. */
  497: 
  498:     {
  499:     gone_back = (current_subject - max_back < start_subject)?
  500:       (int)(current_subject - start_subject) : max_back;
  501:     current_subject -= gone_back;
  502:     }
  503: 
  504:   /* Save the earliest consulted character */
  505: 
  506:   if (current_subject < md->start_used_ptr)
  507:     md->start_used_ptr = current_subject;
  508: 
  509:   /* Now we can process the individual branches. */
  510: 
  511:   end_code = this_start_code;
  512:   do
  513:     {
  514:     int back = GET(end_code, 2+LINK_SIZE);
  515:     if (back <= gone_back)
  516:       {
  517:       int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
  518:       ADD_NEW_DATA(-bstate, 0, gone_back - back);
  519:       }
  520:     end_code += GET(end_code, 1);
  521:     }
  522:   while (*end_code == OP_ALT);
  523:  }
  524: 
  525: /* This is the code for a "normal" subpattern (not a backward assertion). The
  526: start of a whole pattern is always one of these. If we are at the top level,
  527: we may be asked to restart matching from the same point that we reached for a
  528: previous partial match. We still have to scan through the top-level branches to
  529: find the end state. */
  530: 
  531: else
  532:   {
  533:   end_code = this_start_code;
  534: 
  535:   /* Restarting */
  536: 
  537:   if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
  538:     {
  539:     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
  540:     new_count = workspace[1];
  541:     if (!workspace[0])
  542:       memcpy(new_states, active_states, new_count * sizeof(stateblock));
  543:     }
  544: 
  545:   /* Not restarting */
  546: 
  547:   else
  548:     {
  549:     int length = 1 + LINK_SIZE +
  550:       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
  551:         *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
  552:         ? IMM2_SIZE:0);
  553:     do
  554:       {
  555:       ADD_NEW((int)(end_code - start_code + length), 0);
  556:       end_code += GET(end_code, 1);
  557:       length = 1 + LINK_SIZE;
  558:       }
  559:     while (*end_code == OP_ALT);
  560:     }
  561:   }
  562: 
  563: workspace[0] = 0;    /* Bit indicating which vector is current */
  564: 
  565: DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
  566: 
  567: /* Loop for scanning the subject */
  568: 
  569: ptr = current_subject;
  570: for (;;)
  571:   {
  572:   int i, j;
  573:   int clen, dlen;
  574:   pcre_uint32 c, d;
  575:   int forced_fail = 0;
  576:   BOOL partial_newline = FALSE;
  577:   BOOL could_continue = reset_could_continue;
  578:   reset_could_continue = FALSE;
  579: 
  580:   /* Make the new state list into the active state list and empty the
  581:   new state list. */
  582: 
  583:   temp_states = active_states;
  584:   active_states = new_states;
  585:   new_states = temp_states;
  586:   active_count = new_count;
  587:   new_count = 0;
  588: 
  589:   workspace[0] ^= 1;              /* Remember for the restarting feature */
  590:   workspace[1] = active_count;
  591: 
  592: #ifdef PCRE_DEBUG
  593:   printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
  594:   pchars(ptr, STRLEN_UC(ptr), stdout);
  595:   printf("\"\n");
  596: 
  597:   printf("%.*sActive states: ", rlevel*2-2, SP);
  598:   for (i = 0; i < active_count; i++)
  599:     printf("%d/%d ", active_states[i].offset, active_states[i].count);
  600:   printf("\n");
  601: #endif
  602: 
  603:   /* Set the pointers for adding new states */
  604: 
  605:   next_active_state = active_states + active_count;
  606:   next_new_state = new_states;
  607: 
  608:   /* Load the current character from the subject outside the loop, as many
  609:   different states may want to look at it, and we assume that at least one
  610:   will. */
  611: 
  612:   if (ptr < end_subject)
  613:     {
  614:     clen = 1;        /* Number of data items in the character */
  615: #ifdef SUPPORT_UTF
  616:     GETCHARLENTEST(c, ptr, clen);
  617: #else
  618:     c = *ptr;
  619: #endif  /* SUPPORT_UTF */
  620:     }
  621:   else
  622:     {
  623:     clen = 0;        /* This indicates the end of the subject */
  624:     c = NOTACHAR;    /* This value should never actually be used */
  625:     }
  626: 
  627:   /* Scan up the active states and act on each one. The result of an action
  628:   may be to add more states to the currently active list (e.g. on hitting a
  629:   parenthesis) or it may be to put states on the new list, for considering
  630:   when we move the character pointer on. */
  631: 
  632:   for (i = 0; i < active_count; i++)
  633:     {
  634:     stateblock *current_state = active_states + i;
  635:     BOOL caseless = FALSE;
  636:     const pcre_uchar *code;
  637:     int state_offset = current_state->offset;
  638:     int codevalue, rrc;
  639:     int count;
  640: 
  641: #ifdef PCRE_DEBUG
  642:     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
  643:     if (clen == 0) printf("EOL\n");
  644:       else if (c > 32 && c < 127) printf("'%c'\n", c);
  645:         else printf("0x%02x\n", c);
  646: #endif
  647: 
  648:     /* A negative offset is a special case meaning "hold off going to this
  649:     (negated) state until the number of characters in the data field have
  650:     been skipped". If the could_continue flag was passed over from a previous
  651:     state, arrange for it to be passed on. */
  652: 
  653:     if (state_offset < 0)
  654:       {
  655:       if (current_state->data > 0)
  656:         {
  657:         DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
  658:         ADD_NEW_DATA(state_offset, current_state->count,
  659:           current_state->data - 1);
  660:         if (could_continue) reset_could_continue = TRUE;
  661:         continue;
  662:         }
  663:       else
  664:         {
  665:         current_state->offset = state_offset = -state_offset;
  666:         }
  667:       }
  668: 
  669:     /* Check for a duplicate state with the same count, and skip if found.
  670:     See the note at the head of this module about the possibility of improving
  671:     performance here. */
  672: 
  673:     for (j = 0; j < i; j++)
  674:       {
  675:       if (active_states[j].offset == state_offset &&
  676:           active_states[j].count == current_state->count)
  677:         {
  678:         DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
  679:         goto NEXT_ACTIVE_STATE;
  680:         }
  681:       }
  682: 
  683:     /* The state offset is the offset to the opcode */
  684: 
  685:     code = start_code + state_offset;
  686:     codevalue = *code;
  687: 
  688:     /* If this opcode inspects a character, but we are at the end of the
  689:     subject, remember the fact for use when testing for a partial match. */
  690: 
  691:     if (clen == 0 && poptable[codevalue] != 0)
  692:       could_continue = TRUE;
  693: 
  694:     /* If this opcode is followed by an inline character, load it. It is
  695:     tempting to test for the presence of a subject character here, but that
  696:     is wrong, because sometimes zero repetitions of the subject are
  697:     permitted.
  698: 
  699:     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
  700:     argument that is not a data character - but is always one byte long because
  701:     the values are small. We have to take special action to deal with \P, \p,
  702:     \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
  703:     these ones to new opcodes. */
  704: 
  705:     if (coptable[codevalue] > 0)
  706:       {
  707:       dlen = 1;
  708: #ifdef SUPPORT_UTF
  709:       if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
  710: #endif  /* SUPPORT_UTF */
  711:       d = code[coptable[codevalue]];
  712:       if (codevalue >= OP_TYPESTAR)
  713:         {
  714:         switch(d)
  715:           {
  716:           case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
  717:           case OP_NOTPROP:
  718:           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
  719:           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
  720:           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
  721:           case OP_NOT_HSPACE:
  722:           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
  723:           case OP_NOT_VSPACE:
  724:           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
  725:           default: break;
  726:           }
  727:         }
  728:       }
  729:     else
  730:       {
  731:       dlen = 0;         /* Not strictly necessary, but compilers moan */
  732:       d = NOTACHAR;     /* if these variables are not set. */
  733:       }
  734: 
  735: 
  736:     /* Now process the individual opcodes */
  737: 
  738:     switch (codevalue)
  739:       {
  740: /* ========================================================================== */
  741:       /* These cases are never obeyed. This is a fudge that causes a compile-
  742:       time error if the vectors coptable or poptable, which are indexed by
  743:       opcode, are not the correct length. It seems to be the only way to do
  744:       such a check at compile time, as the sizeof() operator does not work
  745:       in the C preprocessor. */
  746: 
  747:       case OP_TABLE_LENGTH:
  748:       case OP_TABLE_LENGTH +
  749:         ((sizeof(coptable) == OP_TABLE_LENGTH) &&
  750:          (sizeof(poptable) == OP_TABLE_LENGTH)):
  751:       break;
  752: 
  753: /* ========================================================================== */
  754:       /* Reached a closing bracket. If not at the end of the pattern, carry
  755:       on with the next opcode. For repeating opcodes, also add the repeat
  756:       state. Note that KETRPOS will always be encountered at the end of the
  757:       subpattern, because the possessive subpattern repeats are always handled
  758:       using recursive calls. Thus, it never adds any new states.
  759: 
  760:       At the end of the (sub)pattern, unless we have an empty string and
  761:       PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
  762:       start of the subject, save the match data, shifting up all previous
  763:       matches so we always have the longest first. */
  764: 
  765:       case OP_KET:
  766:       case OP_KETRMIN:
  767:       case OP_KETRMAX:
  768:       case OP_KETRPOS:
  769:       if (code != end_code)
  770:         {
  771:         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
  772:         if (codevalue != OP_KET)
  773:           {
  774:           ADD_ACTIVE(state_offset - GET(code, 1), 0);
  775:           }
  776:         }
  777:       else
  778:         {
  779:         if (ptr > current_subject ||
  780:             ((md->moptions & PCRE_NOTEMPTY) == 0 &&
  781:               ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
  782:                 current_subject > start_subject + md->start_offset)))
  783:           {
  784:           if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
  785:             else if (match_count > 0 && ++match_count * 2 > offsetcount)
  786:               match_count = 0;
  787:           count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
  788:           if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
  789:           if (offsetcount >= 2)
  790:             {
  791:             offsets[0] = (int)(current_subject - start_subject);
  792:             offsets[1] = (int)(ptr - start_subject);
  793:             DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
  794:               offsets[1] - offsets[0], (char *)current_subject));
  795:             }
  796:           if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
  797:             {
  798:             DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
  799:               "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
  800:               match_count, rlevel*2-2, SP));
  801:             return match_count;
  802:             }
  803:           }
  804:         }
  805:       break;
  806: 
  807: /* ========================================================================== */
  808:       /* These opcodes add to the current list of states without looking
  809:       at the current character. */
  810: 
  811:       /*-----------------------------------------------------------------*/
  812:       case OP_ALT:
  813:       do { code += GET(code, 1); } while (*code == OP_ALT);
  814:       ADD_ACTIVE((int)(code - start_code), 0);
  815:       break;
  816: 
  817:       /*-----------------------------------------------------------------*/
  818:       case OP_BRA:
  819:       case OP_SBRA:
  820:       do
  821:         {
  822:         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
  823:         code += GET(code, 1);
  824:         }
  825:       while (*code == OP_ALT);
  826:       break;
  827: 
  828:       /*-----------------------------------------------------------------*/
  829:       case OP_CBRA:
  830:       case OP_SCBRA:
  831:       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
  832:       code += GET(code, 1);
  833:       while (*code == OP_ALT)
  834:         {
  835:         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
  836:         code += GET(code, 1);
  837:         }
  838:       break;
  839: 
  840:       /*-----------------------------------------------------------------*/
  841:       case OP_BRAZERO:
  842:       case OP_BRAMINZERO:
  843:       ADD_ACTIVE(state_offset + 1, 0);
  844:       code += 1 + GET(code, 2);
  845:       while (*code == OP_ALT) code += GET(code, 1);
  846:       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
  847:       break;
  848: 
  849:       /*-----------------------------------------------------------------*/
  850:       case OP_SKIPZERO:
  851:       code += 1 + GET(code, 2);
  852:       while (*code == OP_ALT) code += GET(code, 1);
  853:       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
  854:       break;
  855: 
  856:       /*-----------------------------------------------------------------*/
  857:       case OP_CIRC:
  858:       if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
  859:         { ADD_ACTIVE(state_offset + 1, 0); }
  860:       break;
  861: 
  862:       /*-----------------------------------------------------------------*/
  863:       case OP_CIRCM:
  864:       if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
  865:           (ptr != end_subject && WAS_NEWLINE(ptr)))
  866:         { ADD_ACTIVE(state_offset + 1, 0); }
  867:       break;
  868: 
  869:       /*-----------------------------------------------------------------*/
  870:       case OP_EOD:
  871:       if (ptr >= end_subject)
  872:         {
  873:         if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
  874:           could_continue = TRUE;
  875:         else { ADD_ACTIVE(state_offset + 1, 0); }
  876:         }
  877:       break;
  878: 
  879:       /*-----------------------------------------------------------------*/
  880:       case OP_SOD:
  881:       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
  882:       break;
  883: 
  884:       /*-----------------------------------------------------------------*/
  885:       case OP_SOM:
  886:       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
  887:       break;
  888: 
  889: 
  890: /* ========================================================================== */
  891:       /* These opcodes inspect the next subject character, and sometimes
  892:       the previous one as well, but do not have an argument. The variable
  893:       clen contains the length of the current character and is zero if we are
  894:       at the end of the subject. */
  895: 
  896:       /*-----------------------------------------------------------------*/
  897:       case OP_ANY:
  898:       if (clen > 0 && !IS_NEWLINE(ptr))
  899:         {
  900:         if (ptr + 1 >= md->end_subject &&
  901:             (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
  902:             NLBLOCK->nltype == NLTYPE_FIXED &&
  903:             NLBLOCK->nllen == 2 &&
  904:             c == NLBLOCK->nl[0])
  905:           {
  906:           could_continue = partial_newline = TRUE;
  907:           }
  908:         else
  909:           {
  910:           ADD_NEW(state_offset + 1, 0);
  911:           }
  912:         }
  913:       break;
  914: 
  915:       /*-----------------------------------------------------------------*/
  916:       case OP_ALLANY:
  917:       if (clen > 0)
  918:         { ADD_NEW(state_offset + 1, 0); }
  919:       break;
  920: 
  921:       /*-----------------------------------------------------------------*/
  922:       case OP_EODN:
  923:       if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
  924:         could_continue = TRUE;
  925:       else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
  926:         { ADD_ACTIVE(state_offset + 1, 0); }
  927:       break;
  928: 
  929:       /*-----------------------------------------------------------------*/
  930:       case OP_DOLL:
  931:       if ((md->moptions & PCRE_NOTEOL) == 0)
  932:         {
  933:         if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
  934:           could_continue = TRUE;
  935:         else if (clen == 0 ||
  936:             ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
  937:                (ptr == end_subject - md->nllen)
  938:             ))
  939:           { ADD_ACTIVE(state_offset + 1, 0); }
  940:         else if (ptr + 1 >= md->end_subject &&
  941:                  (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
  942:                  NLBLOCK->nltype == NLTYPE_FIXED &&
  943:                  NLBLOCK->nllen == 2 &&
  944:                  c == NLBLOCK->nl[0])
  945:           {
  946:           if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
  947:             {
  948:             reset_could_continue = TRUE;
  949:             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
  950:             }
  951:           else could_continue = partial_newline = TRUE;
  952:           }
  953:         }
  954:       break;
  955: 
  956:       /*-----------------------------------------------------------------*/
  957:       case OP_DOLLM:
  958:       if ((md->moptions & PCRE_NOTEOL) == 0)
  959:         {
  960:         if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
  961:           could_continue = TRUE;
  962:         else if (clen == 0 ||
  963:             ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
  964:           { ADD_ACTIVE(state_offset + 1, 0); }
  965:         else if (ptr + 1 >= md->end_subject &&
  966:                  (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
  967:                  NLBLOCK->nltype == NLTYPE_FIXED &&
  968:                  NLBLOCK->nllen == 2 &&
  969:                  c == NLBLOCK->nl[0])
  970:           {
  971:           if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
  972:             {
  973:             reset_could_continue = TRUE;
  974:             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
  975:             }
  976:           else could_continue = partial_newline = TRUE;
  977:           }
  978:         }
  979:       else if (IS_NEWLINE(ptr))
  980:         { ADD_ACTIVE(state_offset + 1, 0); }
  981:       break;
  982: 
  983:       /*-----------------------------------------------------------------*/
  984: 
  985:       case OP_DIGIT:
  986:       case OP_WHITESPACE:
  987:       case OP_WORDCHAR:
  988:       if (clen > 0 && c < 256 &&
  989:             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
  990:         { ADD_NEW(state_offset + 1, 0); }
  991:       break;
  992: 
  993:       /*-----------------------------------------------------------------*/
  994:       case OP_NOT_DIGIT:
  995:       case OP_NOT_WHITESPACE:
  996:       case OP_NOT_WORDCHAR:
  997:       if (clen > 0 && (c >= 256 ||
  998:             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
  999:         { ADD_NEW(state_offset + 1, 0); }
 1000:       break;
 1001: 
 1002:       /*-----------------------------------------------------------------*/
 1003:       case OP_WORD_BOUNDARY:
 1004:       case OP_NOT_WORD_BOUNDARY:
 1005:         {
 1006:         int left_word, right_word;
 1007: 
 1008:         if (ptr > start_subject)
 1009:           {
 1010:           const pcre_uchar *temp = ptr - 1;
 1011:           if (temp < md->start_used_ptr) md->start_used_ptr = temp;
 1012: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
 1013:           if (utf) { BACKCHAR(temp); }
 1014: #endif
 1015:           GETCHARTEST(d, temp);
 1016: #ifdef SUPPORT_UCP
 1017:           if ((md->poptions & PCRE_UCP) != 0)
 1018:             {
 1019:             if (d == '_') left_word = TRUE; else
 1020:               {
 1021:               int cat = UCD_CATEGORY(d);
 1022:               left_word = (cat == ucp_L || cat == ucp_N);
 1023:               }
 1024:             }
 1025:           else
 1026: #endif
 1027:           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
 1028:           }
 1029:         else left_word = FALSE;
 1030: 
 1031:         if (clen > 0)
 1032:           {
 1033: #ifdef SUPPORT_UCP
 1034:           if ((md->poptions & PCRE_UCP) != 0)
 1035:             {
 1036:             if (c == '_') right_word = TRUE; else
 1037:               {
 1038:               int cat = UCD_CATEGORY(c);
 1039:               right_word = (cat == ucp_L || cat == ucp_N);
 1040:               }
 1041:             }
 1042:           else
 1043: #endif
 1044:           right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
 1045:           }
 1046:         else right_word = FALSE;
 1047: 
 1048:         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
 1049:           { ADD_ACTIVE(state_offset + 1, 0); }
 1050:         }
 1051:       break;
 1052: 
 1053: 
 1054:       /*-----------------------------------------------------------------*/
 1055:       /* Check the next character by Unicode property. We will get here only
 1056:       if the support is in the binary; otherwise a compile-time error occurs.
 1057:       */
 1058: 
 1059: #ifdef SUPPORT_UCP
 1060:       case OP_PROP:
 1061:       case OP_NOTPROP:
 1062:       if (clen > 0)
 1063:         {
 1064:         BOOL OK;
 1065:         const pcre_uint32 *cp;
 1066:         const ucd_record * prop = GET_UCD(c);
 1067:         switch(code[1])
 1068:           {
 1069:           case PT_ANY:
 1070:           OK = TRUE;
 1071:           break;
 1072: 
 1073:           case PT_LAMP:
 1074:           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
 1075:                prop->chartype == ucp_Lt;
 1076:           break;
 1077: 
 1078:           case PT_GC:
 1079:           OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
 1080:           break;
 1081: 
 1082:           case PT_PC:
 1083:           OK = prop->chartype == code[2];
 1084:           break;
 1085: 
 1086:           case PT_SC:
 1087:           OK = prop->script == code[2];
 1088:           break;
 1089: 
 1090:           /* These are specials for combination cases. */
 1091: 
 1092:           case PT_ALNUM:
 1093:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
 1094:                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
 1095:           break;
 1096: 
 1097:           case PT_SPACE:    /* Perl space */
 1098:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
 1099:                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
 1100:           break;
 1101: 
 1102:           case PT_PXSPACE:  /* POSIX space */
 1103:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
 1104:                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
 1105:                c == CHAR_FF || c == CHAR_CR;
 1106:           break;
 1107: 
 1108:           case PT_WORD:
 1109:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
 1110:                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
 1111:                c == CHAR_UNDERSCORE;
 1112:           break;
 1113: 
 1114:           case PT_CLIST:
 1115:           cp = PRIV(ucd_caseless_sets) + code[2];
 1116:           for (;;)
 1117:             {
 1118:             if (c < *cp) { OK = FALSE; break; }
 1119:             if (c == *cp++) { OK = TRUE; break; }
 1120:             }
 1121:           break;
 1122: 
 1123:           case PT_UCNC:
 1124:           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
 1125:                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
 1126:                c >= 0xe000;
 1127:           break;
 1128: 
 1129:           /* Should never occur, but keep compilers from grumbling. */
 1130: 
 1131:           default:
 1132:           OK = codevalue != OP_PROP;
 1133:           break;
 1134:           }
 1135: 
 1136:         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
 1137:         }
 1138:       break;
 1139: #endif
 1140: 
 1141: 
 1142: 
 1143: /* ========================================================================== */
 1144:       /* These opcodes likewise inspect the subject character, but have an
 1145:       argument that is not a data character. It is one of these opcodes:
 1146:       OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
 1147:       OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
 1148: 
 1149:       case OP_TYPEPLUS:
 1150:       case OP_TYPEMINPLUS:
 1151:       case OP_TYPEPOSPLUS:
 1152:       count = current_state->count;  /* Already matched */
 1153:       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
 1154:       if (clen > 0)
 1155:         {
 1156:         if (d == OP_ANY && ptr + 1 >= md->end_subject &&
 1157:             (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
 1158:             NLBLOCK->nltype == NLTYPE_FIXED &&
 1159:             NLBLOCK->nllen == 2 &&
 1160:             c == NLBLOCK->nl[0])
 1161:           {
 1162:           could_continue = partial_newline = TRUE;
 1163:           }
 1164:         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 1165:             (c < 256 &&
 1166:               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
 1167:               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 1168:           {
 1169:           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
 1170:             {
 1171:             active_count--;            /* Remove non-match possibility */
 1172:             next_active_state--;
 1173:             }
 1174:           count++;
 1175:           ADD_NEW(state_offset, count);
 1176:           }
 1177:         }
 1178:       break;
 1179: 
 1180:       /*-----------------------------------------------------------------*/
 1181:       case OP_TYPEQUERY:
 1182:       case OP_TYPEMINQUERY:
 1183:       case OP_TYPEPOSQUERY:
 1184:       ADD_ACTIVE(state_offset + 2, 0);
 1185:       if (clen > 0)
 1186:         {
 1187:         if (d == OP_ANY && ptr + 1 >= md->end_subject &&
 1188:             (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
 1189:             NLBLOCK->nltype == NLTYPE_FIXED &&
 1190:             NLBLOCK->nllen == 2 &&
 1191:             c == NLBLOCK->nl[0])
 1192:           {
 1193:           could_continue = partial_newline = TRUE;
 1194:           }
 1195:         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 1196:             (c < 256 &&
 1197:               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
 1198:               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 1199:           {
 1200:           if (codevalue == OP_TYPEPOSQUERY)
 1201:             {
 1202:             active_count--;            /* Remove non-match possibility */
 1203:             next_active_state--;
 1204:             }
 1205:           ADD_NEW(state_offset + 2, 0);
 1206:           }
 1207:         }
 1208:       break;
 1209: 
 1210:       /*-----------------------------------------------------------------*/
 1211:       case OP_TYPESTAR:
 1212:       case OP_TYPEMINSTAR:
 1213:       case OP_TYPEPOSSTAR:
 1214:       ADD_ACTIVE(state_offset + 2, 0);
 1215:       if (clen > 0)
 1216:         {
 1217:         if (d == OP_ANY && ptr + 1 >= md->end_subject &&
 1218:             (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
 1219:             NLBLOCK->nltype == NLTYPE_FIXED &&
 1220:             NLBLOCK->nllen == 2 &&
 1221:             c == NLBLOCK->nl[0])
 1222:           {
 1223:           could_continue = partial_newline = TRUE;
 1224:           }
 1225:         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 1226:             (c < 256 &&
 1227:               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
 1228:               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 1229:           {
 1230:           if (codevalue == OP_TYPEPOSSTAR)
 1231:             {
 1232:             active_count--;            /* Remove non-match possibility */
 1233:             next_active_state--;
 1234:             }
 1235:           ADD_NEW(state_offset, 0);
 1236:           }
 1237:         }
 1238:       break;
 1239: 
 1240:       /*-----------------------------------------------------------------*/
 1241:       case OP_TYPEEXACT:
 1242:       count = current_state->count;  /* Number already matched */
 1243:       if (clen > 0)
 1244:         {
 1245:         if (d == OP_ANY && ptr + 1 >= md->end_subject &&
 1246:             (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
 1247:             NLBLOCK->nltype == NLTYPE_FIXED &&
 1248:             NLBLOCK->nllen == 2 &&
 1249:             c == NLBLOCK->nl[0])
 1250:           {
 1251:           could_continue = partial_newline = TRUE;
 1252:           }
 1253:         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 1254:             (c < 256 &&
 1255:               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
 1256:               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 1257:           {
 1258:           if (++count >= (int)GET2(code, 1))
 1259:             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
 1260:           else
 1261:             { ADD_NEW(state_offset, count); }
 1262:           }
 1263:         }
 1264:       break;
 1265: 
 1266:       /*-----------------------------------------------------------------*/
 1267:       case OP_TYPEUPTO:
 1268:       case OP_TYPEMINUPTO:
 1269:       case OP_TYPEPOSUPTO:
 1270:       ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
 1271:       count = current_state->count;  /* Number already matched */
 1272:       if (clen > 0)
 1273:         {
 1274:         if (d == OP_ANY && ptr + 1 >= md->end_subject &&
 1275:             (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
 1276:             NLBLOCK->nltype == NLTYPE_FIXED &&
 1277:             NLBLOCK->nllen == 2 &&
 1278:             c == NLBLOCK->nl[0])
 1279:           {
 1280:           could_continue = partial_newline = TRUE;
 1281:           }
 1282:         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 1283:             (c < 256 &&
 1284:               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
 1285:               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 1286:           {
 1287:           if (codevalue == OP_TYPEPOSUPTO)
 1288:             {
 1289:             active_count--;           /* Remove non-match possibility */
 1290:             next_active_state--;
 1291:             }
 1292:           if (++count >= (int)GET2(code, 1))
 1293:             { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
 1294:           else
 1295:             { ADD_NEW(state_offset, count); }
 1296:           }
 1297:         }
 1298:       break;
 1299: 
 1300: /* ========================================================================== */
 1301:       /* These are virtual opcodes that are used when something like
 1302:       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
 1303:       argument. It keeps the code above fast for the other cases. The argument
 1304:       is in the d variable. */
 1305: 
 1306: #ifdef SUPPORT_UCP
 1307:       case OP_PROP_EXTRA + OP_TYPEPLUS:
 1308:       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
 1309:       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
 1310:       count = current_state->count;           /* Already matched */
 1311:       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
 1312:       if (clen > 0)
 1313:         {
 1314:         BOOL OK;
 1315:         const pcre_uint32 *cp;
 1316:         const ucd_record * prop = GET_UCD(c);
 1317:         switch(code[2])
 1318:           {
 1319:           case PT_ANY:
 1320:           OK = TRUE;
 1321:           break;
 1322: 
 1323:           case PT_LAMP:
 1324:           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
 1325:             prop->chartype == ucp_Lt;
 1326:           break;
 1327: 
 1328:           case PT_GC:
 1329:           OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
 1330:           break;
 1331: 
 1332:           case PT_PC:
 1333:           OK = prop->chartype == code[3];
 1334:           break;
 1335: 
 1336:           case PT_SC:
 1337:           OK = prop->script == code[3];
 1338:           break;
 1339: 
 1340:           /* These are specials for combination cases. */
 1341: 
 1342:           case PT_ALNUM:
 1343:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
 1344:                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
 1345:           break;
 1346: 
 1347:           case PT_SPACE:    /* Perl space */
 1348:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
 1349:                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
 1350:           break;
 1351: 
 1352:           case PT_PXSPACE:  /* POSIX space */
 1353:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
 1354:                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
 1355:                c == CHAR_FF || c == CHAR_CR;
 1356:           break;
 1357: 
 1358:           case PT_WORD:
 1359:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
 1360:                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
 1361:                c == CHAR_UNDERSCORE;
 1362:           break;
 1363: 
 1364:           case PT_CLIST:
 1365:           cp = PRIV(ucd_caseless_sets) + code[3];
 1366:           for (;;)
 1367:             {
 1368:             if (c < *cp) { OK = FALSE; break; }
 1369:             if (c == *cp++) { OK = TRUE; break; }
 1370:             }
 1371:           break;
 1372: 
 1373:           case PT_UCNC:
 1374:           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
 1375:                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
 1376:                c >= 0xe000;
 1377:           break;
 1378: 
 1379:           /* Should never occur, but keep compilers from grumbling. */
 1380: 
 1381:           default:
 1382:           OK = codevalue != OP_PROP;
 1383:           break;
 1384:           }
 1385: 
 1386:         if (OK == (d == OP_PROP))
 1387:           {
 1388:           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
 1389:             {
 1390:             active_count--;           /* Remove non-match possibility */
 1391:             next_active_state--;
 1392:             }
 1393:           count++;
 1394:           ADD_NEW(state_offset, count);
 1395:           }
 1396:         }
 1397:       break;
 1398: 
 1399:       /*-----------------------------------------------------------------*/
 1400:       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
 1401:       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
 1402:       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
 1403:       count = current_state->count;  /* Already matched */
 1404:       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
 1405:       if (clen > 0)
 1406:         {
 1407:         int lgb, rgb;
 1408:         const pcre_uchar *nptr = ptr + clen;
 1409:         int ncount = 0;
 1410:         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
 1411:           {
 1412:           active_count--;           /* Remove non-match possibility */
 1413:           next_active_state--;
 1414:           }
 1415:         lgb = UCD_GRAPHBREAK(c);
 1416:         while (nptr < end_subject)
 1417:           {
 1418:           dlen = 1;
 1419:           if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
 1420:           rgb = UCD_GRAPHBREAK(d);
 1421:           if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
 1422:           ncount++;
 1423:           lgb = rgb;
 1424:           nptr += dlen;
 1425:           }
 1426:         count++;
 1427:         ADD_NEW_DATA(-state_offset, count, ncount);
 1428:         }
 1429:       break;
 1430: #endif
 1431: 
 1432:       /*-----------------------------------------------------------------*/
 1433:       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
 1434:       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
 1435:       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
 1436:       count = current_state->count;  /* Already matched */
 1437:       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
 1438:       if (clen > 0)
 1439:         {
 1440:         int ncount = 0;
 1441:         switch (c)
 1442:           {
 1443:           case CHAR_VT:
 1444:           case CHAR_FF:
 1445:           case CHAR_NEL:
 1446: #ifndef EBCDIC
 1447:           case 0x2028:
 1448:           case 0x2029:
 1449: #endif  /* Not EBCDIC */
 1450:           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
 1451:           goto ANYNL01;
 1452: 
 1453:           case CHAR_CR:
 1454:           if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
 1455:           /* Fall through */
 1456: 
 1457:           ANYNL01:
 1458:           case CHAR_LF:
 1459:           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
 1460:             {
 1461:             active_count--;           /* Remove non-match possibility */
 1462:             next_active_state--;
 1463:             }
 1464:           count++;
 1465:           ADD_NEW_DATA(-state_offset, count, ncount);
 1466:           break;
 1467: 
 1468:           default:
 1469:           break;
 1470:           }
 1471:         }
 1472:       break;
 1473: 
 1474:       /*-----------------------------------------------------------------*/
 1475:       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
 1476:       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
 1477:       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
 1478:       count = current_state->count;  /* Already matched */
 1479:       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
 1480:       if (clen > 0)
 1481:         {
 1482:         BOOL OK;
 1483:         switch (c)
 1484:           {
 1485:           VSPACE_CASES:
 1486:           OK = TRUE;
 1487:           break;
 1488: 
 1489:           default:
 1490:           OK = FALSE;
 1491:           break;
 1492:           }
 1493: 
 1494:         if (OK == (d == OP_VSPACE))
 1495:           {
 1496:           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
 1497:             {
 1498:             active_count--;           /* Remove non-match possibility */
 1499:             next_active_state--;
 1500:             }
 1501:           count++;
 1502:           ADD_NEW_DATA(-state_offset, count, 0);
 1503:           }
 1504:         }
 1505:       break;
 1506: 
 1507:       /*-----------------------------------------------------------------*/
 1508:       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
 1509:       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
 1510:       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
 1511:       count = current_state->count;  /* Already matched */
 1512:       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
 1513:       if (clen > 0)
 1514:         {
 1515:         BOOL OK;
 1516:         switch (c)
 1517:           {
 1518:           HSPACE_CASES:
 1519:           OK = TRUE;
 1520:           break;
 1521: 
 1522:           default:
 1523:           OK = FALSE;
 1524:           break;
 1525:           }
 1526: 
 1527:         if (OK == (d == OP_HSPACE))
 1528:           {
 1529:           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
 1530:             {
 1531:             active_count--;           /* Remove non-match possibility */
 1532:             next_active_state--;
 1533:             }
 1534:           count++;
 1535:           ADD_NEW_DATA(-state_offset, count, 0);
 1536:           }
 1537:         }
 1538:       break;
 1539: 
 1540:       /*-----------------------------------------------------------------*/
 1541: #ifdef SUPPORT_UCP
 1542:       case OP_PROP_EXTRA + OP_TYPEQUERY:
 1543:       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
 1544:       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
 1545:       count = 4;
 1546:       goto QS1;
 1547: 
 1548:       case OP_PROP_EXTRA + OP_TYPESTAR:
 1549:       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
 1550:       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
 1551:       count = 0;
 1552: 
 1553:       QS1:
 1554: 
 1555:       ADD_ACTIVE(state_offset + 4, 0);
 1556:       if (clen > 0)
 1557:         {
 1558:         BOOL OK;
 1559:         const pcre_uint32 *cp;
 1560:         const ucd_record * prop = GET_UCD(c);
 1561:         switch(code[2])
 1562:           {
 1563:           case PT_ANY:
 1564:           OK = TRUE;
 1565:           break;
 1566: 
 1567:           case PT_LAMP:
 1568:           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
 1569:             prop->chartype == ucp_Lt;
 1570:           break;
 1571: 
 1572:           case PT_GC:
 1573:           OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
 1574:           break;
 1575: 
 1576:           case PT_PC:
 1577:           OK = prop->chartype == code[3];
 1578:           break;
 1579: 
 1580:           case PT_SC:
 1581:           OK = prop->script == code[3];
 1582:           break;
 1583: 
 1584:           /* These are specials for combination cases. */
 1585: 
 1586:           case PT_ALNUM:
 1587:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
 1588:                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
 1589:           break;
 1590: 
 1591:           case PT_SPACE:    /* Perl space */
 1592:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
 1593:                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
 1594:           break;
 1595: 
 1596:           case PT_PXSPACE:  /* POSIX space */
 1597:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
 1598:                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
 1599:                c == CHAR_FF || c == CHAR_CR;
 1600:           break;
 1601: 
 1602:           case PT_WORD:
 1603:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
 1604:                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
 1605:                c == CHAR_UNDERSCORE;
 1606:           break;
 1607: 
 1608:           case PT_CLIST:
 1609:           cp = PRIV(ucd_caseless_sets) + code[3];
 1610:           for (;;)
 1611:             {
 1612:             if (c < *cp) { OK = FALSE; break; }
 1613:             if (c == *cp++) { OK = TRUE; break; }
 1614:             }
 1615:           break;
 1616: 
 1617:           case PT_UCNC:
 1618:           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
 1619:                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
 1620:                c >= 0xe000;
 1621:           break;
 1622: 
 1623:           /* Should never occur, but keep compilers from grumbling. */
 1624: 
 1625:           default:
 1626:           OK = codevalue != OP_PROP;
 1627:           break;
 1628:           }
 1629: 
 1630:         if (OK == (d == OP_PROP))
 1631:           {
 1632:           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
 1633:               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
 1634:             {
 1635:             active_count--;           /* Remove non-match possibility */
 1636:             next_active_state--;
 1637:             }
 1638:           ADD_NEW(state_offset + count, 0);
 1639:           }
 1640:         }
 1641:       break;
 1642: 
 1643:       /*-----------------------------------------------------------------*/
 1644:       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
 1645:       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
 1646:       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
 1647:       count = 2;
 1648:       goto QS2;
 1649: 
 1650:       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
 1651:       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
 1652:       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
 1653:       count = 0;
 1654: 
 1655:       QS2:
 1656: 
 1657:       ADD_ACTIVE(state_offset + 2, 0);
 1658:       if (clen > 0)
 1659:         {
 1660:         int lgb, rgb;
 1661:         const pcre_uchar *nptr = ptr + clen;
 1662:         int ncount = 0;
 1663:         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
 1664:             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
 1665:           {
 1666:           active_count--;           /* Remove non-match possibility */
 1667:           next_active_state--;
 1668:           }
 1669:         lgb = UCD_GRAPHBREAK(c);
 1670:         while (nptr < end_subject)
 1671:           {
 1672:           dlen = 1;
 1673:           if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
 1674:           rgb = UCD_GRAPHBREAK(d);
 1675:           if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
 1676:           ncount++;
 1677:           lgb = rgb;
 1678:           nptr += dlen;
 1679:           }
 1680:         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
 1681:         }
 1682:       break;
 1683: #endif
 1684: 
 1685:       /*-----------------------------------------------------------------*/
 1686:       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
 1687:       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
 1688:       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
 1689:       count = 2;
 1690:       goto QS3;
 1691: 
 1692:       case OP_ANYNL_EXTRA + OP_TYPESTAR:
 1693:       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
 1694:       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
 1695:       count = 0;
 1696: 
 1697:       QS3:
 1698:       ADD_ACTIVE(state_offset + 2, 0);
 1699:       if (clen > 0)
 1700:         {
 1701:         int ncount = 0;
 1702:         switch (c)
 1703:           {
 1704:           case CHAR_VT:
 1705:           case CHAR_FF:
 1706:           case CHAR_NEL:
 1707: #ifndef EBCDIC
 1708:           case 0x2028:
 1709:           case 0x2029:
 1710: #endif  /* Not EBCDIC */
 1711:           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
 1712:           goto ANYNL02;
 1713: 
 1714:           case CHAR_CR:
 1715:           if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
 1716:           /* Fall through */
 1717: 
 1718:           ANYNL02:
 1719:           case CHAR_LF:
 1720:           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
 1721:               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
 1722:             {
 1723:             active_count--;           /* Remove non-match possibility */
 1724:             next_active_state--;
 1725:             }
 1726:           ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
 1727:           break;
 1728: 
 1729:           default:
 1730:           break;
 1731:           }
 1732:         }
 1733:       break;
 1734: 
 1735:       /*-----------------------------------------------------------------*/
 1736:       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
 1737:       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
 1738:       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
 1739:       count = 2;
 1740:       goto QS4;
 1741: 
 1742:       case OP_VSPACE_EXTRA + OP_TYPESTAR:
 1743:       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
 1744:       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
 1745:       count = 0;
 1746: 
 1747:       QS4:
 1748:       ADD_ACTIVE(state_offset + 2, 0);
 1749:       if (clen > 0)
 1750:         {
 1751:         BOOL OK;
 1752:         switch (c)
 1753:           {
 1754:           VSPACE_CASES:
 1755:           OK = TRUE;
 1756:           break;
 1757: 
 1758:           default:
 1759:           OK = FALSE;
 1760:           break;
 1761:           }
 1762:         if (OK == (d == OP_VSPACE))
 1763:           {
 1764:           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
 1765:               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
 1766:             {
 1767:             active_count--;           /* Remove non-match possibility */
 1768:             next_active_state--;
 1769:             }
 1770:           ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
 1771:           }
 1772:         }
 1773:       break;
 1774: 
 1775:       /*-----------------------------------------------------------------*/
 1776:       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
 1777:       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
 1778:       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
 1779:       count = 2;
 1780:       goto QS5;
 1781: 
 1782:       case OP_HSPACE_EXTRA + OP_TYPESTAR:
 1783:       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
 1784:       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
 1785:       count = 0;
 1786: 
 1787:       QS5:
 1788:       ADD_ACTIVE(state_offset + 2, 0);
 1789:       if (clen > 0)
 1790:         {
 1791:         BOOL OK;
 1792:         switch (c)
 1793:           {
 1794:           HSPACE_CASES:
 1795:           OK = TRUE;
 1796:           break;
 1797: 
 1798:           default:
 1799:           OK = FALSE;
 1800:           break;
 1801:           }
 1802: 
 1803:         if (OK == (d == OP_HSPACE))
 1804:           {
 1805:           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
 1806:               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
 1807:             {
 1808:             active_count--;           /* Remove non-match possibility */
 1809:             next_active_state--;
 1810:             }
 1811:           ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
 1812:           }
 1813:         }
 1814:       break;
 1815: 
 1816:       /*-----------------------------------------------------------------*/
 1817: #ifdef SUPPORT_UCP
 1818:       case OP_PROP_EXTRA + OP_TYPEEXACT:
 1819:       case OP_PROP_EXTRA + OP_TYPEUPTO:
 1820:       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
 1821:       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
 1822:       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
 1823:         { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
 1824:       count = current_state->count;  /* Number already matched */
 1825:       if (clen > 0)
 1826:         {
 1827:         BOOL OK;
 1828:         const pcre_uint32 *cp;
 1829:         const ucd_record * prop = GET_UCD(c);
 1830:         switch(code[1 + IMM2_SIZE + 1])
 1831:           {
 1832:           case PT_ANY:
 1833:           OK = TRUE;
 1834:           break;
 1835: 
 1836:           case PT_LAMP:
 1837:           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
 1838:             prop->chartype == ucp_Lt;
 1839:           break;
 1840: 
 1841:           case PT_GC:
 1842:           OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
 1843:           break;
 1844: 
 1845:           case PT_PC:
 1846:           OK = prop->chartype == code[1 + IMM2_SIZE + 2];
 1847:           break;
 1848: 
 1849:           case PT_SC:
 1850:           OK = prop->script == code[1 + IMM2_SIZE + 2];
 1851:           break;
 1852: 
 1853:           /* These are specials for combination cases. */
 1854: 
 1855:           case PT_ALNUM:
 1856:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
 1857:                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
 1858:           break;
 1859: 
 1860:           case PT_SPACE:    /* Perl space */
 1861:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
 1862:                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
 1863:           break;
 1864: 
 1865:           case PT_PXSPACE:  /* POSIX space */
 1866:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
 1867:                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
 1868:                c == CHAR_FF || c == CHAR_CR;
 1869:           break;
 1870: 
 1871:           case PT_WORD:
 1872:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
 1873:                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
 1874:                c == CHAR_UNDERSCORE;
 1875:           break;
 1876: 
 1877:           case PT_CLIST:
 1878:           cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
 1879:           for (;;)
 1880:             {
 1881:             if (c < *cp) { OK = FALSE; break; }
 1882:             if (c == *cp++) { OK = TRUE; break; }
 1883:             }
 1884:           break;
 1885: 
 1886:           case PT_UCNC:
 1887:           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
 1888:                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
 1889:                c >= 0xe000;
 1890:           break;
 1891: 
 1892:           /* Should never occur, but keep compilers from grumbling. */
 1893: 
 1894:           default:
 1895:           OK = codevalue != OP_PROP;
 1896:           break;
 1897:           }
 1898: 
 1899:         if (OK == (d == OP_PROP))
 1900:           {
 1901:           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
 1902:             {
 1903:             active_count--;           /* Remove non-match possibility */
 1904:             next_active_state--;
 1905:             }
 1906:           if (++count >= (int)GET2(code, 1))
 1907:             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
 1908:           else
 1909:             { ADD_NEW(state_offset, count); }
 1910:           }
 1911:         }
 1912:       break;
 1913: 
 1914:       /*-----------------------------------------------------------------*/
 1915:       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
 1916:       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
 1917:       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
 1918:       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
 1919:       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
 1920:         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
 1921:       count = current_state->count;  /* Number already matched */
 1922:       if (clen > 0)
 1923:         {
 1924:         int lgb, rgb;
 1925:         const pcre_uchar *nptr = ptr + clen;
 1926:         int ncount = 0;
 1927:         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
 1928:           {
 1929:           active_count--;           /* Remove non-match possibility */
 1930:           next_active_state--;
 1931:           }
 1932:         lgb = UCD_GRAPHBREAK(c);
 1933:         while (nptr < end_subject)
 1934:           {
 1935:           dlen = 1;
 1936:           if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
 1937:           rgb = UCD_GRAPHBREAK(d);
 1938:           if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
 1939:           ncount++;
 1940:           lgb = rgb;
 1941:           nptr += dlen;
 1942:           }
 1943:         if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
 1944:             reset_could_continue = TRUE;
 1945:         if (++count >= (int)GET2(code, 1))
 1946:           { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
 1947:         else
 1948:           { ADD_NEW_DATA(-state_offset, count, ncount); }
 1949:         }
 1950:       break;
 1951: #endif
 1952: 
 1953:       /*-----------------------------------------------------------------*/
 1954:       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
 1955:       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
 1956:       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
 1957:       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
 1958:       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
 1959:         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
 1960:       count = current_state->count;  /* Number already matched */
 1961:       if (clen > 0)
 1962:         {
 1963:         int ncount = 0;
 1964:         switch (c)
 1965:           {
 1966:           case CHAR_VT:
 1967:           case CHAR_FF:
 1968:           case CHAR_NEL:
 1969: #ifndef EBCDIC
 1970:           case 0x2028:
 1971:           case 0x2029:
 1972: #endif  /* Not EBCDIC */
 1973:           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
 1974:           goto ANYNL03;
 1975: 
 1976:           case CHAR_CR:
 1977:           if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
 1978:           /* Fall through */
 1979: 
 1980:           ANYNL03:
 1981:           case CHAR_LF:
 1982:           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
 1983:             {
 1984:             active_count--;           /* Remove non-match possibility */
 1985:             next_active_state--;
 1986:             }
 1987:           if (++count >= (int)GET2(code, 1))
 1988:             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
 1989:           else
 1990:             { ADD_NEW_DATA(-state_offset, count, ncount); }
 1991:           break;
 1992: 
 1993:           default:
 1994:           break;
 1995:           }
 1996:         }
 1997:       break;
 1998: 
 1999:       /*-----------------------------------------------------------------*/
 2000:       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
 2001:       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
 2002:       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
 2003:       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
 2004:       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
 2005:         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
 2006:       count = current_state->count;  /* Number already matched */
 2007:       if (clen > 0)
 2008:         {
 2009:         BOOL OK;
 2010:         switch (c)
 2011:           {
 2012:           VSPACE_CASES:
 2013:           OK = TRUE;
 2014:           break;
 2015: 
 2016:           default:
 2017:           OK = FALSE;
 2018:           }
 2019: 
 2020:         if (OK == (d == OP_VSPACE))
 2021:           {
 2022:           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
 2023:             {
 2024:             active_count--;           /* Remove non-match possibility */
 2025:             next_active_state--;
 2026:             }
 2027:           if (++count >= (int)GET2(code, 1))
 2028:             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
 2029:           else
 2030:             { ADD_NEW_DATA(-state_offset, count, 0); }
 2031:           }
 2032:         }
 2033:       break;
 2034: 
 2035:       /*-----------------------------------------------------------------*/
 2036:       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
 2037:       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
 2038:       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
 2039:       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
 2040:       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
 2041:         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
 2042:       count = current_state->count;  /* Number already matched */
 2043:       if (clen > 0)
 2044:         {
 2045:         BOOL OK;
 2046:         switch (c)
 2047:           {
 2048:           HSPACE_CASES:
 2049:           OK = TRUE;
 2050:           break;
 2051: 
 2052:           default:
 2053:           OK = FALSE;
 2054:           break;
 2055:           }
 2056: 
 2057:         if (OK == (d == OP_HSPACE))
 2058:           {
 2059:           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
 2060:             {
 2061:             active_count--;           /* Remove non-match possibility */
 2062:             next_active_state--;
 2063:             }
 2064:           if (++count >= (int)GET2(code, 1))
 2065:             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
 2066:           else
 2067:             { ADD_NEW_DATA(-state_offset, count, 0); }
 2068:           }
 2069:         }
 2070:       break;
 2071: 
 2072: /* ========================================================================== */
 2073:       /* These opcodes are followed by a character that is usually compared
 2074:       to the current subject character; it is loaded into d. We still get
 2075:       here even if there is no subject character, because in some cases zero
 2076:       repetitions are permitted. */
 2077: 
 2078:       /*-----------------------------------------------------------------*/
 2079:       case OP_CHAR:
 2080:       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
 2081:       break;
 2082: 
 2083:       /*-----------------------------------------------------------------*/
 2084:       case OP_CHARI:
 2085:       if (clen == 0) break;
 2086: 
 2087: #ifdef SUPPORT_UTF
 2088:       if (utf)
 2089:         {
 2090:         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
 2091:           {
 2092:           unsigned int othercase;
 2093:           if (c < 128)
 2094:             othercase = fcc[c];
 2095:           else
 2096:             /* If we have Unicode property support, we can use it to test the
 2097:             other case of the character. */
 2098: #ifdef SUPPORT_UCP
 2099:             othercase = UCD_OTHERCASE(c);
 2100: #else
 2101:             othercase = NOTACHAR;
 2102: #endif
 2103: 
 2104:           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
 2105:           }
 2106:         }
 2107:       else
 2108: #endif  /* SUPPORT_UTF */
 2109:       /* Not UTF mode */
 2110:         {
 2111:         if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
 2112:           { ADD_NEW(state_offset + 2, 0); }
 2113:         }
 2114:       break;
 2115: 
 2116: 
 2117: #ifdef SUPPORT_UCP
 2118:       /*-----------------------------------------------------------------*/
 2119:       /* This is a tricky one because it can match more than one character.
 2120:       Find out how many characters to skip, and then set up a negative state
 2121:       to wait for them to pass before continuing. */
 2122: 
 2123:       case OP_EXTUNI:
 2124:       if (clen > 0)
 2125:         {
 2126:         int lgb, rgb;
 2127:         const pcre_uchar *nptr = ptr + clen;
 2128:         int ncount = 0;
 2129:         lgb = UCD_GRAPHBREAK(c);
 2130:         while (nptr < end_subject)
 2131:           {
 2132:           dlen = 1;
 2133:           if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
 2134:           rgb = UCD_GRAPHBREAK(d);
 2135:           if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
 2136:           ncount++;
 2137:           lgb = rgb;
 2138:           nptr += dlen;
 2139:           }
 2140:         if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
 2141:             reset_could_continue = TRUE;
 2142:         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
 2143:         }
 2144:       break;
 2145: #endif
 2146: 
 2147:       /*-----------------------------------------------------------------*/
 2148:       /* This is tricky, like EXTUNI, because it too can match more than one
 2149:       character (when CR is followed by LF). In this case, set up a negative
 2150:       state to wait for one character to pass before continuing. */
 2151: 
 2152:       case OP_ANYNL:
 2153:       if (clen > 0) switch(c)
 2154:         {
 2155:         case CHAR_VT:
 2156:         case CHAR_FF:
 2157:         case CHAR_NEL:
 2158: #ifndef EBCDIC
 2159:         case 0x2028:
 2160:         case 0x2029:
 2161: #endif  /* Not EBCDIC */
 2162:         if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
 2163: 
 2164:         case CHAR_LF:
 2165:         ADD_NEW(state_offset + 1, 0);
 2166:         break;
 2167: 
 2168:         case CHAR_CR:
 2169:         if (ptr + 1 >= end_subject)
 2170:           {
 2171:           ADD_NEW(state_offset + 1, 0);
 2172:           if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
 2173:             reset_could_continue = TRUE;
 2174:           }
 2175:         else if (RAWUCHARTEST(ptr + 1) == CHAR_LF)
 2176:           {
 2177:           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
 2178:           }
 2179:         else
 2180:           {
 2181:           ADD_NEW(state_offset + 1, 0);
 2182:           }
 2183:         break;
 2184:         }
 2185:       break;
 2186: 
 2187:       /*-----------------------------------------------------------------*/
 2188:       case OP_NOT_VSPACE:
 2189:       if (clen > 0) switch(c)
 2190:         {
 2191:         VSPACE_CASES:
 2192:         break;
 2193: 
 2194:         default:
 2195:         ADD_NEW(state_offset + 1, 0);
 2196:         break;
 2197:         }
 2198:       break;
 2199: 
 2200:       /*-----------------------------------------------------------------*/
 2201:       case OP_VSPACE:
 2202:       if (clen > 0) switch(c)
 2203:         {
 2204:         VSPACE_CASES:
 2205:         ADD_NEW(state_offset + 1, 0);
 2206:         break;
 2207: 
 2208:         default:
 2209:         break;
 2210:         }
 2211:       break;
 2212: 
 2213:       /*-----------------------------------------------------------------*/
 2214:       case OP_NOT_HSPACE:
 2215:       if (clen > 0) switch(c)
 2216:         {
 2217:         HSPACE_CASES:
 2218:         break;
 2219: 
 2220:         default:
 2221:         ADD_NEW(state_offset + 1, 0);
 2222:         break;
 2223:         }
 2224:       break;
 2225: 
 2226:       /*-----------------------------------------------------------------*/
 2227:       case OP_HSPACE:
 2228:       if (clen > 0) switch(c)
 2229:         {
 2230:         HSPACE_CASES:
 2231:         ADD_NEW(state_offset + 1, 0);
 2232:         break;
 2233: 
 2234:         default:
 2235:         break;
 2236:         }
 2237:       break;
 2238: 
 2239:       /*-----------------------------------------------------------------*/
 2240:       /* Match a negated single character casefully. */
 2241: 
 2242:       case OP_NOT:
 2243:       if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
 2244:       break;
 2245: 
 2246:       /*-----------------------------------------------------------------*/
 2247:       /* Match a negated single character caselessly. */
 2248: 
 2249:       case OP_NOTI:
 2250:       if (clen > 0)
 2251:         {
 2252:         unsigned int otherd;
 2253: #ifdef SUPPORT_UTF
 2254:         if (utf && d >= 128)
 2255:           {
 2256: #ifdef SUPPORT_UCP
 2257:           otherd = UCD_OTHERCASE(d);
 2258: #endif  /* SUPPORT_UCP */
 2259:           }
 2260:         else
 2261: #endif  /* SUPPORT_UTF */
 2262:         otherd = TABLE_GET(d, fcc, d);
 2263:         if (c != d && c != otherd)
 2264:           { ADD_NEW(state_offset + dlen + 1, 0); }
 2265:         }
 2266:       break;
 2267: 
 2268:       /*-----------------------------------------------------------------*/
 2269:       case OP_PLUSI:
 2270:       case OP_MINPLUSI:
 2271:       case OP_POSPLUSI:
 2272:       case OP_NOTPLUSI:
 2273:       case OP_NOTMINPLUSI:
 2274:       case OP_NOTPOSPLUSI:
 2275:       caseless = TRUE;
 2276:       codevalue -= OP_STARI - OP_STAR;
 2277: 
 2278:       /* Fall through */
 2279:       case OP_PLUS:
 2280:       case OP_MINPLUS:
 2281:       case OP_POSPLUS:
 2282:       case OP_NOTPLUS:
 2283:       case OP_NOTMINPLUS:
 2284:       case OP_NOTPOSPLUS:
 2285:       count = current_state->count;  /* Already matched */
 2286:       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
 2287:       if (clen > 0)
 2288:         {
 2289:         pcre_uint32 otherd = NOTACHAR;
 2290:         if (caseless)
 2291:           {
 2292: #ifdef SUPPORT_UTF
 2293:           if (utf && d >= 128)
 2294:             {
 2295: #ifdef SUPPORT_UCP
 2296:             otherd = UCD_OTHERCASE(d);
 2297: #endif  /* SUPPORT_UCP */
 2298:             }
 2299:           else
 2300: #endif  /* SUPPORT_UTF */
 2301:           otherd = TABLE_GET(d, fcc, d);
 2302:           }
 2303:         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
 2304:           {
 2305:           if (count > 0 &&
 2306:               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
 2307:             {
 2308:             active_count--;             /* Remove non-match possibility */
 2309:             next_active_state--;
 2310:             }
 2311:           count++;
 2312:           ADD_NEW(state_offset, count);
 2313:           }
 2314:         }
 2315:       break;
 2316: 
 2317:       /*-----------------------------------------------------------------*/
 2318:       case OP_QUERYI:
 2319:       case OP_MINQUERYI:
 2320:       case OP_POSQUERYI:
 2321:       case OP_NOTQUERYI:
 2322:       case OP_NOTMINQUERYI:
 2323:       case OP_NOTPOSQUERYI:
 2324:       caseless = TRUE;
 2325:       codevalue -= OP_STARI - OP_STAR;
 2326:       /* Fall through */
 2327:       case OP_QUERY:
 2328:       case OP_MINQUERY:
 2329:       case OP_POSQUERY:
 2330:       case OP_NOTQUERY:
 2331:       case OP_NOTMINQUERY:
 2332:       case OP_NOTPOSQUERY:
 2333:       ADD_ACTIVE(state_offset + dlen + 1, 0);
 2334:       if (clen > 0)
 2335:         {
 2336:         pcre_uint32 otherd = NOTACHAR;
 2337:         if (caseless)
 2338:           {
 2339: #ifdef SUPPORT_UTF
 2340:           if (utf && d >= 128)
 2341:             {
 2342: #ifdef SUPPORT_UCP
 2343:             otherd = UCD_OTHERCASE(d);
 2344: #endif  /* SUPPORT_UCP */
 2345:             }
 2346:           else
 2347: #endif  /* SUPPORT_UTF */
 2348:           otherd = TABLE_GET(d, fcc, d);
 2349:           }
 2350:         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
 2351:           {
 2352:           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
 2353:             {
 2354:             active_count--;            /* Remove non-match possibility */
 2355:             next_active_state--;
 2356:             }
 2357:           ADD_NEW(state_offset + dlen + 1, 0);
 2358:           }
 2359:         }
 2360:       break;
 2361: 
 2362:       /*-----------------------------------------------------------------*/
 2363:       case OP_STARI:
 2364:       case OP_MINSTARI:
 2365:       case OP_POSSTARI:
 2366:       case OP_NOTSTARI:
 2367:       case OP_NOTMINSTARI:
 2368:       case OP_NOTPOSSTARI:
 2369:       caseless = TRUE;
 2370:       codevalue -= OP_STARI - OP_STAR;
 2371:       /* Fall through */
 2372:       case OP_STAR:
 2373:       case OP_MINSTAR:
 2374:       case OP_POSSTAR:
 2375:       case OP_NOTSTAR:
 2376:       case OP_NOTMINSTAR:
 2377:       case OP_NOTPOSSTAR:
 2378:       ADD_ACTIVE(state_offset + dlen + 1, 0);
 2379:       if (clen > 0)
 2380:         {
 2381:         pcre_uint32 otherd = NOTACHAR;
 2382:         if (caseless)
 2383:           {
 2384: #ifdef SUPPORT_UTF
 2385:           if (utf && d >= 128)
 2386:             {
 2387: #ifdef SUPPORT_UCP
 2388:             otherd = UCD_OTHERCASE(d);
 2389: #endif  /* SUPPORT_UCP */
 2390:             }
 2391:           else
 2392: #endif  /* SUPPORT_UTF */
 2393:           otherd = TABLE_GET(d, fcc, d);
 2394:           }
 2395:         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
 2396:           {
 2397:           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
 2398:             {
 2399:             active_count--;            /* Remove non-match possibility */
 2400:             next_active_state--;
 2401:             }
 2402:           ADD_NEW(state_offset, 0);
 2403:           }
 2404:         }
 2405:       break;
 2406: 
 2407:       /*-----------------------------------------------------------------*/
 2408:       case OP_EXACTI:
 2409:       case OP_NOTEXACTI:
 2410:       caseless = TRUE;
 2411:       codevalue -= OP_STARI - OP_STAR;
 2412:       /* Fall through */
 2413:       case OP_EXACT:
 2414:       case OP_NOTEXACT:
 2415:       count = current_state->count;  /* Number already matched */
 2416:       if (clen > 0)
 2417:         {
 2418:         pcre_uint32 otherd = NOTACHAR;
 2419:         if (caseless)
 2420:           {
 2421: #ifdef SUPPORT_UTF
 2422:           if (utf && d >= 128)
 2423:             {
 2424: #ifdef SUPPORT_UCP
 2425:             otherd = UCD_OTHERCASE(d);
 2426: #endif  /* SUPPORT_UCP */
 2427:             }
 2428:           else
 2429: #endif  /* SUPPORT_UTF */
 2430:           otherd = TABLE_GET(d, fcc, d);
 2431:           }
 2432:         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
 2433:           {
 2434:           if (++count >= (int)GET2(code, 1))
 2435:             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
 2436:           else
 2437:             { ADD_NEW(state_offset, count); }
 2438:           }
 2439:         }
 2440:       break;
 2441: 
 2442:       /*-----------------------------------------------------------------*/
 2443:       case OP_UPTOI:
 2444:       case OP_MINUPTOI:
 2445:       case OP_POSUPTOI:
 2446:       case OP_NOTUPTOI:
 2447:       case OP_NOTMINUPTOI:
 2448:       case OP_NOTPOSUPTOI:
 2449:       caseless = TRUE;
 2450:       codevalue -= OP_STARI - OP_STAR;
 2451:       /* Fall through */
 2452:       case OP_UPTO:
 2453:       case OP_MINUPTO:
 2454:       case OP_POSUPTO:
 2455:       case OP_NOTUPTO:
 2456:       case OP_NOTMINUPTO:
 2457:       case OP_NOTPOSUPTO:
 2458:       ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
 2459:       count = current_state->count;  /* Number already matched */
 2460:       if (clen > 0)
 2461:         {
 2462:         pcre_uint32 otherd = NOTACHAR;
 2463:         if (caseless)
 2464:           {
 2465: #ifdef SUPPORT_UTF
 2466:           if (utf && d >= 128)
 2467:             {
 2468: #ifdef SUPPORT_UCP
 2469:             otherd = UCD_OTHERCASE(d);
 2470: #endif  /* SUPPORT_UCP */
 2471:             }
 2472:           else
 2473: #endif  /* SUPPORT_UTF */
 2474:           otherd = TABLE_GET(d, fcc, d);
 2475:           }
 2476:         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
 2477:           {
 2478:           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
 2479:             {
 2480:             active_count--;             /* Remove non-match possibility */
 2481:             next_active_state--;
 2482:             }
 2483:           if (++count >= (int)GET2(code, 1))
 2484:             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
 2485:           else
 2486:             { ADD_NEW(state_offset, count); }
 2487:           }
 2488:         }
 2489:       break;
 2490: 
 2491: 
 2492: /* ========================================================================== */
 2493:       /* These are the class-handling opcodes */
 2494: 
 2495:       case OP_CLASS:
 2496:       case OP_NCLASS:
 2497:       case OP_XCLASS:
 2498:         {
 2499:         BOOL isinclass = FALSE;
 2500:         int next_state_offset;
 2501:         const pcre_uchar *ecode;
 2502: 
 2503:         /* For a simple class, there is always just a 32-byte table, and we
 2504:         can set isinclass from it. */
 2505: 
 2506:         if (codevalue != OP_XCLASS)
 2507:           {
 2508:           ecode = code + 1 + (32 / sizeof(pcre_uchar));
 2509:           if (clen > 0)
 2510:             {
 2511:             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
 2512:               ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
 2513:             }
 2514:           }
 2515: 
 2516:         /* An extended class may have a table or a list of single characters,
 2517:         ranges, or both, and it may be positive or negative. There's a
 2518:         function that sorts all this out. */
 2519: 
 2520:         else
 2521:          {
 2522:          ecode = code + GET(code, 1);
 2523:          if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
 2524:          }
 2525: 
 2526:         /* At this point, isinclass is set for all kinds of class, and ecode
 2527:         points to the byte after the end of the class. If there is a
 2528:         quantifier, this is where it will be. */
 2529: 
 2530:         next_state_offset = (int)(ecode - start_code);
 2531: 
 2532:         switch (*ecode)
 2533:           {
 2534:           case OP_CRSTAR:
 2535:           case OP_CRMINSTAR:
 2536:           ADD_ACTIVE(next_state_offset + 1, 0);
 2537:           if (isinclass) { ADD_NEW(state_offset, 0); }
 2538:           break;
 2539: 
 2540:           case OP_CRPLUS:
 2541:           case OP_CRMINPLUS:
 2542:           count = current_state->count;  /* Already matched */
 2543:           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
 2544:           if (isinclass) { count++; ADD_NEW(state_offset, count); }
 2545:           break;
 2546: 
 2547:           case OP_CRQUERY:
 2548:           case OP_CRMINQUERY:
 2549:           ADD_ACTIVE(next_state_offset + 1, 0);
 2550:           if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
 2551:           break;
 2552: 
 2553:           case OP_CRRANGE:
 2554:           case OP_CRMINRANGE:
 2555:           count = current_state->count;  /* Already matched */
 2556:           if (count >= (int)GET2(ecode, 1))
 2557:             { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
 2558:           if (isinclass)
 2559:             {
 2560:             int max = (int)GET2(ecode, 1 + IMM2_SIZE);
 2561:             if (++count >= max && max != 0)   /* Max 0 => no limit */
 2562:               { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
 2563:             else
 2564:               { ADD_NEW(state_offset, count); }
 2565:             }
 2566:           break;
 2567: 
 2568:           default:
 2569:           if (isinclass) { ADD_NEW(next_state_offset, 0); }
 2570:           break;
 2571:           }
 2572:         }
 2573:       break;
 2574: 
 2575: /* ========================================================================== */
 2576:       /* These are the opcodes for fancy brackets of various kinds. We have
 2577:       to use recursion in order to handle them. The "always failing" assertion
 2578:       (?!) is optimised to OP_FAIL when compiling, so we have to support that,
 2579:       though the other "backtracking verbs" are not supported. */
 2580: 
 2581:       case OP_FAIL:
 2582:       forced_fail++;    /* Count FAILs for multiple states */
 2583:       break;
 2584: 
 2585:       case OP_ASSERT:
 2586:       case OP_ASSERT_NOT:
 2587:       case OP_ASSERTBACK:
 2588:       case OP_ASSERTBACK_NOT:
 2589:         {
 2590:         int rc;
 2591:         int local_offsets[2];
 2592:         int local_workspace[1000];
 2593:         const pcre_uchar *endasscode = code + GET(code, 1);
 2594: 
 2595:         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
 2596: 
 2597:         rc = internal_dfa_exec(
 2598:           md,                                   /* static match data */
 2599:           code,                                 /* this subexpression's code */
 2600:           ptr,                                  /* where we currently are */
 2601:           (int)(ptr - start_subject),           /* start offset */
 2602:           local_offsets,                        /* offset vector */
 2603:           sizeof(local_offsets)/sizeof(int),    /* size of same */
 2604:           local_workspace,                      /* workspace vector */
 2605:           sizeof(local_workspace)/sizeof(int),  /* size of same */
 2606:           rlevel);                              /* function recursion level */
 2607: 
 2608:         if (rc == PCRE_ERROR_DFA_UITEM) return rc;
 2609:         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
 2610:             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
 2611:         }
 2612:       break;
 2613: 
 2614:       /*-----------------------------------------------------------------*/
 2615:       case OP_COND:
 2616:       case OP_SCOND:
 2617:         {
 2618:         int local_offsets[1000];
 2619:         int local_workspace[1000];
 2620:         int codelink = GET(code, 1);
 2621:         int condcode;
 2622: 
 2623:         /* Because of the way auto-callout works during compile, a callout item
 2624:         is inserted between OP_COND and an assertion condition. This does not
 2625:         happen for the other conditions. */
 2626: 
 2627:         if (code[LINK_SIZE+1] == OP_CALLOUT)
 2628:           {
 2629:           rrc = 0;
 2630:           if (PUBL(callout) != NULL)
 2631:             {
 2632:             PUBL(callout_block) cb;
 2633:             cb.version          = 1;   /* Version 1 of the callout block */
 2634:             cb.callout_number   = code[LINK_SIZE+2];
 2635:             cb.offset_vector    = offsets;
 2636: #if defined COMPILE_PCRE8
 2637:             cb.subject          = (PCRE_SPTR)start_subject;
 2638: #elif defined COMPILE_PCRE16
 2639:             cb.subject          = (PCRE_SPTR16)start_subject;
 2640: #elif defined COMPILE_PCRE32
 2641:             cb.subject          = (PCRE_SPTR32)start_subject;
 2642: #endif
 2643:             cb.subject_length   = (int)(end_subject - start_subject);
 2644:             cb.start_match      = (int)(current_subject - start_subject);
 2645:             cb.current_position = (int)(ptr - start_subject);
 2646:             cb.pattern_position = GET(code, LINK_SIZE + 3);
 2647:             cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
 2648:             cb.capture_top      = 1;
 2649:             cb.capture_last     = -1;
 2650:             cb.callout_data     = md->callout_data;
 2651:             cb.mark             = NULL;   /* No (*MARK) support */
 2652:             if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
 2653:             }
 2654:           if (rrc > 0) break;                      /* Fail this thread */
 2655:           code += PRIV(OP_lengths)[OP_CALLOUT];    /* Skip callout data */
 2656:           }
 2657: 
 2658:         condcode = code[LINK_SIZE+1];
 2659: 
 2660:         /* Back reference conditions are not supported */
 2661: 
 2662:         if (condcode == OP_CREF || condcode == OP_NCREF)
 2663:           return PCRE_ERROR_DFA_UCOND;
 2664: 
 2665:         /* The DEFINE condition is always false */
 2666: 
 2667:         if (condcode == OP_DEF)
 2668:           { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
 2669: 
 2670:         /* The only supported version of OP_RREF is for the value RREF_ANY,
 2671:         which means "test if in any recursion". We can't test for specifically
 2672:         recursed groups. */
 2673: 
 2674:         else if (condcode == OP_RREF || condcode == OP_NRREF)
 2675:           {
 2676:           int value = GET2(code, LINK_SIZE + 2);
 2677:           if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
 2678:           if (md->recursive != NULL)
 2679:             { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
 2680:           else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
 2681:           }
 2682: 
 2683:         /* Otherwise, the condition is an assertion */
 2684: 
 2685:         else
 2686:           {
 2687:           int rc;
 2688:           const pcre_uchar *asscode = code + LINK_SIZE + 1;
 2689:           const pcre_uchar *endasscode = asscode + GET(asscode, 1);
 2690: 
 2691:           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
 2692: 
 2693:           rc = internal_dfa_exec(
 2694:             md,                                   /* fixed match data */
 2695:             asscode,                              /* this subexpression's code */
 2696:             ptr,                                  /* where we currently are */
 2697:             (int)(ptr - start_subject),           /* start offset */
 2698:             local_offsets,                        /* offset vector */
 2699:             sizeof(local_offsets)/sizeof(int),    /* size of same */
 2700:             local_workspace,                      /* workspace vector */
 2701:             sizeof(local_workspace)/sizeof(int),  /* size of same */
 2702:             rlevel);                              /* function recursion level */
 2703: 
 2704:           if (rc == PCRE_ERROR_DFA_UITEM) return rc;
 2705:           if ((rc >= 0) ==
 2706:                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
 2707:             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
 2708:           else
 2709:             { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
 2710:           }
 2711:         }
 2712:       break;
 2713: 
 2714:       /*-----------------------------------------------------------------*/
 2715:       case OP_RECURSE:
 2716:         {
 2717:         dfa_recursion_info *ri;
 2718:         int local_offsets[1000];
 2719:         int local_workspace[1000];
 2720:         const pcre_uchar *callpat = start_code + GET(code, 1);
 2721:         int recno = (callpat == md->start_code)? 0 :
 2722:           GET2(callpat, 1 + LINK_SIZE);
 2723:         int rc;
 2724: 
 2725:         DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
 2726: 
 2727:         /* Check for repeating a recursion without advancing the subject
 2728:         pointer. This should catch convoluted mutual recursions. (Some simple
 2729:         cases are caught at compile time.) */
 2730: 
 2731:         for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
 2732:           if (recno == ri->group_num && ptr == ri->subject_position)
 2733:             return PCRE_ERROR_RECURSELOOP;
 2734: 
 2735:         /* Remember this recursion and where we started it so as to
 2736:         catch infinite loops. */
 2737: 
 2738:         new_recursive.group_num = recno;
 2739:         new_recursive.subject_position = ptr;
 2740:         new_recursive.prevrec = md->recursive;
 2741:         md->recursive = &new_recursive;
 2742: 
 2743:         rc = internal_dfa_exec(
 2744:           md,                                   /* fixed match data */
 2745:           callpat,                              /* this subexpression's code */
 2746:           ptr,                                  /* where we currently are */
 2747:           (int)(ptr - start_subject),           /* start offset */
 2748:           local_offsets,                        /* offset vector */
 2749:           sizeof(local_offsets)/sizeof(int),    /* size of same */
 2750:           local_workspace,                      /* workspace vector */
 2751:           sizeof(local_workspace)/sizeof(int),  /* size of same */
 2752:           rlevel);                              /* function recursion level */
 2753: 
 2754:         md->recursive = new_recursive.prevrec;  /* Done this recursion */
 2755: 
 2756:         DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
 2757:           rc));
 2758: 
 2759:         /* Ran out of internal offsets */
 2760: 
 2761:         if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
 2762: 
 2763:         /* For each successful matched substring, set up the next state with a
 2764:         count of characters to skip before trying it. Note that the count is in
 2765:         characters, not bytes. */
 2766: 
 2767:         if (rc > 0)
 2768:           {
 2769:           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
 2770:             {
 2771:             int charcount = local_offsets[rc+1] - local_offsets[rc];
 2772: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
 2773:             if (utf)
 2774:               {
 2775:               const pcre_uchar *p = start_subject + local_offsets[rc];
 2776:               const pcre_uchar *pp = start_subject + local_offsets[rc+1];
 2777:               while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
 2778:               }
 2779: #endif
 2780:             if (charcount > 0)
 2781:               {
 2782:               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
 2783:               }
 2784:             else
 2785:               {
 2786:               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
 2787:               }
 2788:             }
 2789:           }
 2790:         else if (rc != PCRE_ERROR_NOMATCH) return rc;
 2791:         }
 2792:       break;
 2793: 
 2794:       /*-----------------------------------------------------------------*/
 2795:       case OP_BRAPOS:
 2796:       case OP_SBRAPOS:
 2797:       case OP_CBRAPOS:
 2798:       case OP_SCBRAPOS:
 2799:       case OP_BRAPOSZERO:
 2800:         {
 2801:         int charcount, matched_count;
 2802:         const pcre_uchar *local_ptr = ptr;
 2803:         BOOL allow_zero;
 2804: 
 2805:         if (codevalue == OP_BRAPOSZERO)
 2806:           {
 2807:           allow_zero = TRUE;
 2808:           codevalue = *(++code);  /* Codevalue will be one of above BRAs */
 2809:           }
 2810:         else allow_zero = FALSE;
 2811: 
 2812:         /* Loop to match the subpattern as many times as possible as if it were
 2813:         a complete pattern. */
 2814: 
 2815:         for (matched_count = 0;; matched_count++)
 2816:           {
 2817:           int local_offsets[2];
 2818:           int local_workspace[1000];
 2819: 
 2820:           int rc = internal_dfa_exec(
 2821:             md,                                   /* fixed match data */
 2822:             code,                                 /* this subexpression's code */
 2823:             local_ptr,                            /* where we currently are */
 2824:             (int)(ptr - start_subject),           /* start offset */
 2825:             local_offsets,                        /* offset vector */
 2826:             sizeof(local_offsets)/sizeof(int),    /* size of same */
 2827:             local_workspace,                      /* workspace vector */
 2828:             sizeof(local_workspace)/sizeof(int),  /* size of same */
 2829:             rlevel);                              /* function recursion level */
 2830: 
 2831:           /* Failed to match */
 2832: 
 2833:           if (rc < 0)
 2834:             {
 2835:             if (rc != PCRE_ERROR_NOMATCH) return rc;
 2836:             break;
 2837:             }
 2838: 
 2839:           /* Matched: break the loop if zero characters matched. */
 2840: 
 2841:           charcount = local_offsets[1] - local_offsets[0];
 2842:           if (charcount == 0) break;
 2843:           local_ptr += charcount;    /* Advance temporary position ptr */
 2844:           }
 2845: 
 2846:         /* At this point we have matched the subpattern matched_count
 2847:         times, and local_ptr is pointing to the character after the end of the
 2848:         last match. */
 2849: 
 2850:         if (matched_count > 0 || allow_zero)
 2851:           {
 2852:           const pcre_uchar *end_subpattern = code;
 2853:           int next_state_offset;
 2854: 
 2855:           do { end_subpattern += GET(end_subpattern, 1); }
 2856:             while (*end_subpattern == OP_ALT);
 2857:           next_state_offset =
 2858:             (int)(end_subpattern - start_code + LINK_SIZE + 1);
 2859: 
 2860:           /* Optimization: if there are no more active states, and there
 2861:           are no new states yet set up, then skip over the subject string
 2862:           right here, to save looping. Otherwise, set up the new state to swing
 2863:           into action when the end of the matched substring is reached. */
 2864: 
 2865:           if (i + 1 >= active_count && new_count == 0)
 2866:             {
 2867:             ptr = local_ptr;
 2868:             clen = 0;
 2869:             ADD_NEW(next_state_offset, 0);
 2870:             }
 2871:           else
 2872:             {
 2873:             const pcre_uchar *p = ptr;
 2874:             const pcre_uchar *pp = local_ptr;
 2875:             charcount = (int)(pp - p);
 2876: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
 2877:             if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
 2878: #endif
 2879:             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
 2880:             }
 2881:           }
 2882:         }
 2883:       break;
 2884: 
 2885:       /*-----------------------------------------------------------------*/
 2886:       case OP_ONCE:
 2887:       case OP_ONCE_NC:
 2888:         {
 2889:         int local_offsets[2];
 2890:         int local_workspace[1000];
 2891: 
 2892:         int rc = internal_dfa_exec(
 2893:           md,                                   /* fixed match data */
 2894:           code,                                 /* this subexpression's code */
 2895:           ptr,                                  /* where we currently are */
 2896:           (int)(ptr - start_subject),           /* start offset */
 2897:           local_offsets,                        /* offset vector */
 2898:           sizeof(local_offsets)/sizeof(int),    /* size of same */
 2899:           local_workspace,                      /* workspace vector */
 2900:           sizeof(local_workspace)/sizeof(int),  /* size of same */
 2901:           rlevel);                              /* function recursion level */
 2902: 
 2903:         if (rc >= 0)
 2904:           {
 2905:           const pcre_uchar *end_subpattern = code;
 2906:           int charcount = local_offsets[1] - local_offsets[0];
 2907:           int next_state_offset, repeat_state_offset;
 2908: 
 2909:           do { end_subpattern += GET(end_subpattern, 1); }
 2910:             while (*end_subpattern == OP_ALT);
 2911:           next_state_offset =
 2912:             (int)(end_subpattern - start_code + LINK_SIZE + 1);
 2913: 
 2914:           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
 2915:           arrange for the repeat state also to be added to the relevant list.
 2916:           Calculate the offset, or set -1 for no repeat. */
 2917: 
 2918:           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
 2919:                                  *end_subpattern == OP_KETRMIN)?
 2920:             (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
 2921: 
 2922:           /* If we have matched an empty string, add the next state at the
 2923:           current character pointer. This is important so that the duplicate
 2924:           checking kicks in, which is what breaks infinite loops that match an
 2925:           empty string. */
 2926: 
 2927:           if (charcount == 0)
 2928:             {
 2929:             ADD_ACTIVE(next_state_offset, 0);
 2930:             }
 2931: 
 2932:           /* Optimization: if there are no more active states, and there
 2933:           are no new states yet set up, then skip over the subject string
 2934:           right here, to save looping. Otherwise, set up the new state to swing
 2935:           into action when the end of the matched substring is reached. */
 2936: 
 2937:           else if (i + 1 >= active_count && new_count == 0)
 2938:             {
 2939:             ptr += charcount;
 2940:             clen = 0;
 2941:             ADD_NEW(next_state_offset, 0);
 2942: 
 2943:             /* If we are adding a repeat state at the new character position,
 2944:             we must fudge things so that it is the only current state.
 2945:             Otherwise, it might be a duplicate of one we processed before, and
 2946:             that would cause it to be skipped. */
 2947: 
 2948:             if (repeat_state_offset >= 0)
 2949:               {
 2950:               next_active_state = active_states;
 2951:               active_count = 0;
 2952:               i = -1;
 2953:               ADD_ACTIVE(repeat_state_offset, 0);
 2954:               }
 2955:             }
 2956:           else
 2957:             {
 2958: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
 2959:             if (utf)
 2960:               {
 2961:               const pcre_uchar *p = start_subject + local_offsets[0];
 2962:               const pcre_uchar *pp = start_subject + local_offsets[1];
 2963:               while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
 2964:               }
 2965: #endif
 2966:             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
 2967:             if (repeat_state_offset >= 0)
 2968:               { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
 2969:             }
 2970:           }
 2971:         else if (rc != PCRE_ERROR_NOMATCH) return rc;
 2972:         }
 2973:       break;
 2974: 
 2975: 
 2976: /* ========================================================================== */
 2977:       /* Handle callouts */
 2978: 
 2979:       case OP_CALLOUT:
 2980:       rrc = 0;
 2981:       if (PUBL(callout) != NULL)
 2982:         {
 2983:         PUBL(callout_block) cb;
 2984:         cb.version          = 1;   /* Version 1 of the callout block */
 2985:         cb.callout_number   = code[1];
 2986:         cb.offset_vector    = offsets;
 2987: #if defined COMPILE_PCRE8
 2988:         cb.subject          = (PCRE_SPTR)start_subject;
 2989: #elif defined COMPILE_PCRE16
 2990:         cb.subject          = (PCRE_SPTR16)start_subject;
 2991: #elif defined COMPILE_PCRE32
 2992:         cb.subject          = (PCRE_SPTR32)start_subject;
 2993: #endif
 2994:         cb.subject_length   = (int)(end_subject - start_subject);
 2995:         cb.start_match      = (int)(current_subject - start_subject);
 2996:         cb.current_position = (int)(ptr - start_subject);
 2997:         cb.pattern_position = GET(code, 2);
 2998:         cb.next_item_length = GET(code, 2 + LINK_SIZE);
 2999:         cb.capture_top      = 1;
 3000:         cb.capture_last     = -1;
 3001:         cb.callout_data     = md->callout_data;
 3002:         cb.mark             = NULL;   /* No (*MARK) support */
 3003:         if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
 3004:         }
 3005:       if (rrc == 0)
 3006:         { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
 3007:       break;
 3008: 
 3009: 
 3010: /* ========================================================================== */
 3011:       default:        /* Unsupported opcode */
 3012:       return PCRE_ERROR_DFA_UITEM;
 3013:       }
 3014: 
 3015:     NEXT_ACTIVE_STATE: continue;
 3016: 
 3017:     }      /* End of loop scanning active states */
 3018: 
 3019:   /* We have finished the processing at the current subject character. If no
 3020:   new states have been set for the next character, we have found all the
 3021:   matches that we are going to find. If we are at the top level and partial
 3022:   matching has been requested, check for appropriate conditions.
 3023: 
 3024:   The "forced_fail" variable counts the number of (*F) encountered for the
 3025:   character. If it is equal to the original active_count (saved in
 3026:   workspace[1]) it means that (*F) was found on every active state. In this
 3027:   case we don't want to give a partial match.
 3028: 
 3029:   The "could_continue" variable is true if a state could have continued but
 3030:   for the fact that the end of the subject was reached. */
 3031: 
 3032:   if (new_count <= 0)
 3033:     {
 3034:     if (rlevel == 1 &&                               /* Top level, and */
 3035:         could_continue &&                            /* Some could go on, and */
 3036:         forced_fail != workspace[1] &&               /* Not all forced fail & */
 3037:         (                                            /* either... */
 3038:         (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
 3039:         ||                                           /* or... */
 3040:         ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
 3041:          match_count < 0)                            /* no matches */
 3042:         ) &&                                         /* And... */
 3043:         (
 3044:         partial_newline ||                           /* Either partial NL */
 3045:           (                                          /* or ... */
 3046:           ptr >= end_subject &&                /* End of subject and */
 3047:           ptr > md->start_used_ptr)            /* Inspected non-empty string */
 3048:           )
 3049:         )
 3050:       match_count = PCRE_ERROR_PARTIAL;
 3051:     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
 3052:       "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
 3053:       rlevel*2-2, SP));
 3054:     break;        /* In effect, "return", but see the comment below */
 3055:     }
 3056: 
 3057:   /* One or more states are active for the next character. */
 3058: 
 3059:   ptr += clen;    /* Advance to next subject character */
 3060:   }               /* Loop to move along the subject string */
 3061: 
 3062: /* Control gets here from "break" a few lines above. We do it this way because
 3063: if we use "return" above, we have compiler trouble. Some compilers warn if
 3064: there's nothing here because they think the function doesn't return a value. On
 3065: the other hand, if we put a dummy statement here, some more clever compilers
 3066: complain that it can't be reached. Sigh. */
 3067: 
 3068: return match_count;
 3069: }
 3070: 
 3071: 
 3072: 
 3073: 
 3074: /*************************************************
 3075: *    Execute a Regular Expression - DFA engine   *
 3076: *************************************************/
 3077: 
 3078: /* This external function applies a compiled re to a subject string using a DFA
 3079: engine. This function calls the internal function multiple times if the pattern
 3080: is not anchored.
 3081: 
 3082: Arguments:
 3083:   argument_re     points to the compiled expression
 3084:   extra_data      points to extra data or is NULL
 3085:   subject         points to the subject string
 3086:   length          length of subject string (may contain binary zeros)
 3087:   start_offset    where to start in the subject string
 3088:   options         option bits
 3089:   offsets         vector of match offsets
 3090:   offsetcount     size of same
 3091:   workspace       workspace vector
 3092:   wscount         size of same
 3093: 
 3094: Returns:          > 0 => number of match offset pairs placed in offsets
 3095:                   = 0 => offsets overflowed; longest matches are present
 3096:                    -1 => failed to match
 3097:                  < -1 => some kind of unexpected problem

       Negative values that this function itself returns (in addition to whatever
       internal_dfa_exec passes back) are PCRE_ERROR_BADOPTION, PCRE_ERROR_NULL,
       PCRE_ERROR_BADCOUNT, PCRE_ERROR_DFA_WSSIZE, PCRE_ERROR_BADLENGTH,
       PCRE_ERROR_BADOFFSET, PCRE_ERROR_BADMAGIC, PCRE_ERROR_BADENDIANNESS,
       PCRE_ERROR_BADMODE, PCRE_ERROR_DFA_BADRESTART, PCRE_ERROR_DFA_UMLIMIT,
       PCRE_ERROR_BADNEWLINE, the BADUTF/SHORTUTF codes of the library in use,
       PCRE_ERROR_BADUTF8_OFFSET and PCRE_ERROR_NOMATCH.

       When the result is PCRE_ERROR_PARTIAL and offsetcount is at least 2,
       offsets[0] is set to the offset of md->start_used_ptr and offsets[1] to the
       offset of the end of the subject; if offsetcount is greater than 2,
       offsets[2] is set to the offset at which the final match attempt started.
 3098: */
 3099: 
 3100: #if defined COMPILE_PCRE8
 3101: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
 3102: pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
 3103:   const char *subject, int length, int start_offset, int options, int *offsets,
 3104:   int offsetcount, int *workspace, int wscount)
 3105: #elif defined COMPILE_PCRE16
 3106: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
 3107: pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
 3108:   PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
 3109:   int offsetcount, int *workspace, int wscount)
 3110: #elif defined COMPILE_PCRE32
 3111: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
 3112: pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
 3113:   PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
 3114:   int offsetcount, int *workspace, int wscount)
 3115: #endif
 3116: {
 3117: REAL_PCRE *re = (REAL_PCRE *)argument_re;
 3118: dfa_match_data match_block;
 3119: dfa_match_data *md = &match_block;
 3120: BOOL utf, anchored, startline, firstline;
 3121: const pcre_uchar *current_subject, *end_subject;
 3122: const pcre_study_data *study = NULL;
 3123: 
 3124: const pcre_uchar *req_char_ptr;
 3125: const pcre_uint8 *start_bits = NULL;
 3126: BOOL has_first_char = FALSE;
 3127: BOOL has_req_char = FALSE;
 3128: pcre_uchar first_char = 0;
 3129: pcre_uchar first_char2 = 0;
 3130: pcre_uchar req_char = 0;
 3131: pcre_uchar req_char2 = 0;
 3132: int newline;
 3133: 
 3134: /* Plausibility checks */
 3135: 
 3136: if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
 3137: if (re == NULL || subject == NULL || workspace == NULL ||
 3138:    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
 3139: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
 3140: if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
 3141: if (length < 0) return PCRE_ERROR_BADLENGTH;
 3142: if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
 3143: 
 3144: /* Check that the first field in the block is the magic number. If it is not,
 3145: return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
 3146: REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
 3147: means that the pattern is likely compiled with different endianness. */
 3148: 
 3149: if (re->magic_number != MAGIC_NUMBER)
 3150:   return re->magic_number == REVERSED_MAGIC_NUMBER?
 3151:     PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
 3152: if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
 3153: 
 3154: /* If restarting after a partial match, do some sanity checks on the contents
 3155: of the workspace: no bit other than bit 0 may be set in workspace[0], and
       workspace[1] must be at least 1 and no greater than
       (wscount - 2)/INTS_PER_STATEBLOCK. Anything else gives
       PCRE_ERROR_DFA_BADRESTART. */
 3156: 
 3157: if ((options & PCRE_DFA_RESTART) != 0)
 3158:   {
 3159:   if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
 3160:     workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
 3161:       return PCRE_ERROR_DFA_BADRESTART;
 3162:   }
 3163: 
 3164: /* Set up study, callout, and table data. A match limit or a recursion limit
       in the extra data is rejected with PCRE_ERROR_DFA_UMLIMIT. Tables supplied
       in the extra data override those recorded in the compiled pattern. */
 3165: 
 3166: md->tables = re->tables;
 3167: md->callout_data = NULL;
 3168: 
 3169: if (extra_data != NULL)
 3170:   {
 3171:   unsigned int flags = extra_data->flags;
 3172:   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
 3173:     study = (const pcre_study_data *)extra_data->study_data;
 3174:   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
 3175:   if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
 3176:     return PCRE_ERROR_DFA_UMLIMIT;
 3177:   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
 3178:     md->callout_data = extra_data->callout_data;
 3179:   if ((flags & PCRE_EXTRA_TABLES) != 0)
 3180:     md->tables = extra_data->tables;
 3181:   }
 3182: 
 3183: /* Set some local values. req_char_ptr starts one unit before the first
       candidate start position so that the "p > req_char_ptr" test in the
       required-character optimization below succeeds the first time round. */
 3184: 
 3185: current_subject = (const pcre_uchar *)subject + start_offset;
 3186: end_subject = (const pcre_uchar *)subject + length;
 3187: req_char_ptr = current_subject - 1;
 3188: 
 3189: #ifdef SUPPORT_UTF
 3190: /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
 3191: utf = (re->options & PCRE_UTF8) != 0;
 3192: #else
 3193: utf = FALSE;
 3194: #endif
 3195: 
       /* The match is anchored if PCRE_ANCHORED is set at match time or at compile
       time. A restart (PCRE_DFA_RESTART) is also treated as anchored. */
 3196: anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
 3197:   (re->options & PCRE_ANCHORED) != 0;
 3198: 
 3199: /* The remaining fixed data for passing around. start_code is the address
       reached by adding name_table_offset and name_count entries of
       name_entry_size to the start of the compiled block. */
 3200: 
 3201: md->start_code = (const pcre_uchar *)argument_re +
 3202:     re->name_table_offset + re->name_count * re->name_entry_size;
 3203: md->start_subject = (const pcre_uchar *)subject;
 3204: md->end_subject = end_subject;
 3205: md->start_offset = start_offset;
 3206: md->moptions = options;
 3207: md->poptions = re->options;
 3208: 
 3209: /* If the BSR option is not set at match time, copy what was set
 3210: at compile time. If nothing was set at compile time either, PCRE_BSR_ANYCRLF
       is used when the library was built with BSR_ANYCRLF defined. */
 3211: 
 3212: if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
 3213:   {
 3214:   if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
 3215:     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
 3216: #ifdef BSR_ANYCRLF
 3217:   else md->moptions |= PCRE_BSR_ANYCRLF;
 3218: #endif
 3219:   }
 3220: 
 3221: /* Handle different types of newline. The three bits give eight cases. If
 3222: nothing is set at run time, whatever was used at compile time applies. In the
       local "newline" value a two-character newline is held as
       (first << 8) | second, and -1 and -2 stand for ANY and ANYCRLF. Any other
       bit combination gives PCRE_ERROR_BADNEWLINE. */
 3223: 
 3224: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
 3225:          PCRE_NEWLINE_BITS)
 3226:   {
 3227:   case 0: newline = NEWLINE; break;   /* Compile-time default */
 3228:   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
 3229:   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
 3230:   case PCRE_NEWLINE_CR+
 3231:        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
 3232:   case PCRE_NEWLINE_ANY: newline = -1; break;
 3233:   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
 3234:   default: return PCRE_ERROR_BADNEWLINE;
 3235:   }
 3236: 
       /* Translate the numeric value into md->nltype. md->nllen and md->nl[] are
       filled in here only for a fixed newline. */
 3237: if (newline == -2)
 3238:   {
 3239:   md->nltype = NLTYPE_ANYCRLF;
 3240:   }
 3241: else if (newline < 0)
 3242:   {
 3243:   md->nltype = NLTYPE_ANY;
 3244:   }
 3245: else
 3246:   {
 3247:   md->nltype = NLTYPE_FIXED;
 3248:   if (newline > 255)
 3249:     {
 3250:     md->nllen = 2;
 3251:     md->nl[0] = (newline >> 8) & 255;
 3252:     md->nl[1] = newline & 255;
 3253:     }
 3254:   else
 3255:     {
 3256:     md->nllen = 1;
 3257:     md->nl[0] = newline;
 3258:     }
 3259:   }
 3260: 
 3261: /* Check a UTF string if required. There is no separate channel for passing
 3262: back the error details, so when offsetcount is at least 2 the error offset and
       the error code are stored in offsets[0] and offsets[1]. In the 8-bit and
       16-bit libraries, an error code no greater than PCRE_UTF8_ERR5 (or
       PCRE_UTF16_ERR1) together with PCRE_PARTIAL_HARD gives the SHORTUTF error
       instead of BADUTF. Also in those libraries, a start_offset strictly inside
       the subject for which NOT_FIRSTCHAR() is true gives
       PCRE_ERROR_BADUTF8_OFFSET. */
 3263: 
 3264: #ifdef SUPPORT_UTF
 3265: if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
 3266:   {
 3267:   int erroroffset;
 3268:   int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
 3269:   if (errorcode != 0)
 3270:     {
 3271:     if (offsetcount >= 2)
 3272:       {
 3273:       offsets[0] = erroroffset;
 3274:       offsets[1] = errorcode;
 3275:       }
 3276: #if defined COMPILE_PCRE8
 3277:     return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ?
 3278:       PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
 3279: #elif defined COMPILE_PCRE16
 3280:     return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ?
 3281:       PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
 3282: #elif defined COMPILE_PCRE32
 3283:     return PCRE_ERROR_BADUTF32;
 3284: #endif
 3285:     }
 3286: #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
 3287:   if (start_offset > 0 && start_offset < length &&
 3288:         NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
 3289:     return PCRE_ERROR_BADUTF8_OFFSET;
 3290: #endif
 3291:   }
 3292: #endif
 3293: 
 3294: /* If the exec call supplied NULL for tables, use the inbuilt ones. This
 3295: is a feature that makes it possible to save compiled regex and re-use them
 3296: in other programs later. */
 3297: 
 3298: if (md->tables == NULL) md->tables = PRIV(default_tables);
 3299: 
 3300: /* The "must be at the start of a line" flags are used in a loop when finding
 3301: where to start. */
 3302: 
 3303: startline = (re->flags & PCRE_STARTLINE) != 0;
 3304: firstline = (re->options & PCRE_FIRSTLINE) != 0;
 3305: 
 3306: /* Set up the first character to match, if available. The first_byte value is
 3307: never set for an anchored regular expression, but the anchoring may be forced
 3308: at run time, so we have to test for anchoring. The first char may be unset for
 3309: an unanchored pattern, of course. If there's no first char and the pattern was
 3310: studied, there may be a bitmap of possible first characters. first_char2 is
       the other-case form of first_char when PCRE_FCH_CASELESS is set and is equal
       to first_char otherwise, so "first_char != first_char2" later selects the
       two-way comparison. */
 3311: 
 3312: if (!anchored)
 3313:   {
 3314:   if ((re->flags & PCRE_FIRSTSET) != 0)
 3315:     {
 3316:     has_first_char = TRUE;
 3317:     first_char = first_char2 = (pcre_uchar)(re->first_char);
 3318:     if ((re->flags & PCRE_FCH_CASELESS) != 0)
 3319:       {
 3320:       first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
 3321: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
 3322:       if (utf && first_char > 127)
 3323:         first_char2 = UCD_OTHERCASE(first_char);
 3324: #endif
 3325:       }
 3326:     }
 3327:   else
 3328:     {
 3329:     if (!startline && study != NULL &&
 3330:          (study->flags & PCRE_STUDY_MAPPED) != 0)
 3331:       start_bits = study->start_bits;
 3332:     }
 3333:   }
 3334: 
 3335: /* For anchored or unanchored matches, there may be a "last known required
 3336: character" set. req_char2 is set up in the same way as first_char2 above,
       under the control of PCRE_RCH_CASELESS. */
 3337: 
 3338: if ((re->flags & PCRE_REQCHSET) != 0)
 3339:   {
 3340:   has_req_char = TRUE;
 3341:   req_char = req_char2 = (pcre_uchar)(re->req_char);
 3342:   if ((re->flags & PCRE_RCH_CASELESS) != 0)
 3343:     {
 3344:     req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
 3345: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
 3346:     if (utf && req_char > 127)
 3347:       req_char2 = UCD_OTHERCASE(req_char);
 3348: #endif
 3349:     }
 3350:   }
 3351: 
 3352: /* Call the main matching function, looping for a non-anchored regex after a
 3353: failed match. If not restarting, perform certain optimizations at the start of
 3354: a match. */
 3355: 
 3356: for (;;)
 3357:   {
 3358:   int rc;
 3359: 
 3360:   if ((options & PCRE_DFA_RESTART) == 0)
 3361:     {
 3362:     const pcre_uchar *save_end_subject = end_subject;
 3363: 
 3364:     /* If firstline is TRUE, the start of the match is constrained to the first
 3365:     line of a multiline string. Implement this by temporarily adjusting
 3366:     end_subject so that we stop scanning at a newline. If the match fails at
 3367:     the newline, later code breaks this loop. */
 3368: 
 3369:     if (firstline)
 3370:       {
 3371:       PCRE_PUCHAR t = current_subject;
 3372: #ifdef SUPPORT_UTF
 3373:       if (utf)
 3374:         {
 3375:         while (t < md->end_subject && !IS_NEWLINE(t))
 3376:           {
 3377:           t++;
 3378:           ACROSSCHAR(t < end_subject, *t, t++);
 3379:           }
 3380:         }
 3381:       else
 3382: #endif
 3383:       while (t < md->end_subject && !IS_NEWLINE(t)) t++;
 3384:       end_subject = t;
 3385:       }
 3386: 
 3387:     /* There are some optimizations that avoid running the match if a known
 3388:     starting point is not found. However, there is an option that disables
 3389:     these, for testing and for ensuring that all callouts do actually occur.
 3390:     The option can be set in the regex by (*NO_START_OPT) or passed in
 3391:     match-time options. */
 3392: 
 3393:     if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
 3394:       {
 3395:       /* Advance to a known first char. */
 3396: 
 3397:       if (has_first_char)
 3398:         {
 3399:         if (first_char != first_char2)
 3400:           {
 3401:           pcre_uchar csc;
 3402:           while (current_subject < end_subject &&
 3403:                  (csc = RAWUCHARTEST(current_subject)) != first_char && csc != first_char2)
 3404:             current_subject++;
 3405:           }
 3406:         else
 3407:           while (current_subject < end_subject &&
 3408:                  RAWUCHARTEST(current_subject) != first_char)
 3409:             current_subject++;
 3410:         }
 3411: 
 3412:       /* Or to just after a linebreak for a multiline match if possible. The
             skip is applied only once current_subject has moved beyond the
             initial start position, so the first attempt is always made at
             start_offset itself. */
 3413: 
 3414:       else if (startline)
 3415:         {
 3416:         if (current_subject > md->start_subject + start_offset)
 3417:           {
 3418: #ifdef SUPPORT_UTF
 3419:           if (utf)
 3420:             {
 3421:             while (current_subject < end_subject &&
 3422:                    !WAS_NEWLINE(current_subject))
 3423:               {
 3424:               current_subject++;
 3425:               ACROSSCHAR(current_subject < end_subject, *current_subject,
 3426:                 current_subject++);
 3427:               }
 3428:             }
 3429:           else
 3430: #endif
 3431:           while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
 3432:             current_subject++;
 3433: 
 3434:           /* If we have just passed a CR and the newline option is ANY or
 3435:           ANYCRLF, and we are now at a LF, advance the match position by one
 3436:           more character. */
 3437: 
 3438:           if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
 3439:                (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
 3440:                current_subject < end_subject &&
 3441:                RAWUCHARTEST(current_subject) == CHAR_NL)
 3442:             current_subject++;
 3443:           }
 3444:         }
 3445: 
 3446:       /* Or to a non-unique first char after study */
 3447: 
 3448:       else if (start_bits != NULL)
 3449:         {
 3450:         while (current_subject < end_subject)
 3451:           {
 3452:           register pcre_uint32 c = RAWUCHARTEST(current_subject);
 3453: #ifndef COMPILE_PCRE8
                 /* Values above 255 are tested using the bit for 255.
                 NOTE(review): presumably start_bits covers only 0..255 - confirm
                 against the study data definition. */
 3454:           if (c > 255) c = 255;
 3455: #endif
 3456:           if ((start_bits[c/8] & (1 << (c&7))) == 0)
 3457:             {
 3458:             current_subject++;
 3459: #if defined SUPPORT_UTF && defined COMPILE_PCRE8
 3460:             /* In non 8-bit mode, the iteration will stop for
 3461:             characters > 255 at the beginning or not stop at all. */
 3462:             if (utf)
 3463:               ACROSSCHAR(current_subject < end_subject, *current_subject,
 3464:                 current_subject++);
 3465: #endif
 3466:             }
 3467:           else break;
 3468:           }
 3469:         }
 3470:       }
 3471: 
 3472:     /* Restore fudged end_subject */
 3473: 
 3474:     end_subject = save_end_subject;
 3475: 
 3476:     /* The following two optimizations are disabled for partial matching or if
 3477:     disabling is explicitly requested (and of course, by the test above, this
 3478:     code is not obeyed when restarting after a partial match). */
 3479: 
 3480:     if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
 3481:         (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
 3482:       {
 3483:       /* If the pattern was studied, a minimum subject length may be set. This
 3484:       is a lower bound; no actual string of that length may actually match the
 3485:       pattern. Although the value is, strictly, in characters, we treat it as
 3486:       bytes to avoid spending too much time in this optimization. */
 3487: 
 3488:       if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
 3489:           (pcre_uint32)(end_subject - current_subject) < study->minlength)
 3490:         return PCRE_ERROR_NOMATCH;
 3491: 
 3492:       /* If req_char is set, we know that that character must appear in the
 3493:       subject for the match to succeed. If the first character is set, req_char
 3494:       must be later in the subject; otherwise the test starts at the match
 3495:       point. This optimization can save a huge amount of work in patterns with
 3496:       nested unlimited repeats that aren't going to match. Writing separate
 3497:       code for cased/caseless versions makes it go faster, as does using an
 3498:       autoincrement and backing off on a match.
 3499: 
 3500:       HOWEVER: when the subject string is very, very long, searching to its end
 3501:       can take a long time, and give bad performance on quite ordinary
 3502:       patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
 3503:       string... so we don't do this when the string is sufficiently long. */
 3504: 
 3505:       if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
 3506:         {
 3507:         register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
 3508: 
 3509:         /* We don't need to repeat the search if we haven't yet reached the
 3510:         place we found it at last time. */
 3511: 
 3512:         if (p > req_char_ptr)
 3513:           {
 3514:           if (req_char != req_char2)
 3515:             {
 3516:             while (p < end_subject)
 3517:               {
 3518:               register pcre_uint32 pp = RAWUCHARINCTEST(p);
 3519:               if (pp == req_char || pp == req_char2) { p--; break; }
 3520:               }
 3521:             }
 3522:           else
 3523:             {
 3524:             while (p < end_subject)
 3525:               {
 3526:               if (RAWUCHARINCTEST(p) == req_char) { p--; break; }
 3527:               }
 3528:             }
 3529: 
 3530:           /* If we can't find the required character, break the matching loop,
 3531:           which will cause a return of PCRE_ERROR_NOMATCH. */
 3532: 
 3533:           if (p >= end_subject) break;
 3534: 
 3535:           /* If we have found the required character, save the point where we
 3536:           found it, so that we don't search again next time round the loop if
 3537:           the start hasn't passed this character yet. */
 3538: 
 3539:           req_char_ptr = p;
 3540:           }
 3541:         }
 3542:       }
 3543:     }   /* End of optimizations that are done when not restarting */
 3544: 
 3545:   /* OK, now we can do the business */
 3546: 
 3547:   md->start_used_ptr = current_subject;
 3548:   md->recursive = NULL;
 3549: 
 3550:   rc = internal_dfa_exec(
 3551:     md,                                /* fixed match data */
 3552:     md->start_code,                    /* this subexpression's code */
 3553:     current_subject,                   /* where we currently are */
 3554:     start_offset,                      /* start offset in subject */
 3555:     offsets,                           /* offset vector */
 3556:     offsetcount,                       /* size of same */
 3557:     workspace,                         /* workspace vector */
 3558:     wscount,                           /* size of same */
 3559:     0);                                /* function recurse level */
 3560: 
 3561:   /* Anything other than "no match" means we are done, always; otherwise, carry
 3562:   on only if not anchored. For a partial match, the offsets of
         md->start_used_ptr, of the end of the subject and (if there is room for a
         third value) of the current start position are passed back in the offsets
         vector. */
 3563: 
 3564:   if (rc != PCRE_ERROR_NOMATCH || anchored)
 3565:     {
 3566:     if (rc == PCRE_ERROR_PARTIAL && offsetcount >= 2)
 3567:       {
 3568:       offsets[0] = (int)(md->start_used_ptr - (PCRE_PUCHAR)subject);
 3569:       offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
 3570:       if (offsetcount > 2)
 3571:         offsets[2] = (int)(current_subject - (PCRE_PUCHAR)subject);
 3572:       }
 3573:     return rc;
 3574:     }
 3575: 
 3576:   /* Advance to the next subject character unless we are at the end of a line
 3577:   and firstline is set. */
 3578: 
 3579:   if (firstline && IS_NEWLINE(current_subject)) break;
 3580:   current_subject++;
 3581: #ifdef SUPPORT_UTF
 3582:   if (utf)
 3583:     {
 3584:     ACROSSCHAR(current_subject < end_subject, *current_subject,
 3585:       current_subject++);
 3586:     }
 3587: #endif
         /* The test is ">" rather than ">=", so one more match attempt is made with
         current_subject equal to end_subject before giving up. */
 3588:   if (current_subject > end_subject) break;
 3589: 
 3590:   /* If we have just passed a CR and we are now at a LF, and the pattern does
 3591:   not contain any explicit matches for \r or \n, and the newline option is CRLF
 3592:   or ANY or ANYCRLF, advance the match position by one more character. */
 3593: 
 3594:   if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
 3595:       current_subject < end_subject &&
 3596:       RAWUCHARTEST(current_subject) == CHAR_NL &&
 3597:       (re->flags & PCRE_HASCRORLF) == 0 &&
 3598:         (md->nltype == NLTYPE_ANY ||
 3599:          md->nltype == NLTYPE_ANYCRLF ||
 3600:          md->nllen == 2))
 3601:     current_subject++;
 3602: 
 3603:   }   /* "Bumpalong" loop */
 3604: 
 3605: return PCRE_ERROR_NOMATCH;
 3606: }
 3607: 
 3608: /* End of pcre_dfa_exec.c */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>