Annotation of embedaddon/pcre/pcre_dfa_exec.c, revision 1.1.1.2

1.1       misho       1: /*************************************************
                      2: *      Perl-Compatible Regular Expressions       *
                      3: *************************************************/
                      4: 
                      5: /* PCRE is a library of functions to support regular expressions whose syntax
                      6: and semantics are as close as possible to those of the Perl 5 language (but see
                      7: below for why this module is different).
                      8: 
                      9:                        Written by Philip Hazel
1.1.1.2 ! misho      10:            Copyright (c) 1997-2012 University of Cambridge
1.1       misho      11: 
                     12: -----------------------------------------------------------------------------
                     13: Redistribution and use in source and binary forms, with or without
                     14: modification, are permitted provided that the following conditions are met:
                     15: 
                     16:     * Redistributions of source code must retain the above copyright notice,
                     17:       this list of conditions and the following disclaimer.
                     18: 
                     19:     * Redistributions in binary form must reproduce the above copyright
                     20:       notice, this list of conditions and the following disclaimer in the
                     21:       documentation and/or other materials provided with the distribution.
                     22: 
                     23:     * Neither the name of the University of Cambridge nor the names of its
                     24:       contributors may be used to endorse or promote products derived from
                     25:       this software without specific prior written permission.
                     26: 
                     27: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
                     28: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
                     29: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
                     30: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
                     31: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
                     32: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
                     33: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
                     34: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
                     35: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
                     36: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
                     37: POSSIBILITY OF SUCH DAMAGE.
                     38: -----------------------------------------------------------------------------
                     39: */
                     40: 
                     41: 
                     42: /* This module contains the external function pcre_dfa_exec(), which is an
                     43: alternative matching function that uses a sort of DFA algorithm (not a true
                     44: FSM). This is NOT Perl-compatible, but it has advantages in certain
                     45: applications. */
                     46: 
                     47: 
                     48: /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
                     49: the performance of his patterns greatly. I could not use it as it stood, as it
                     50: was not thread safe, and made assumptions about pattern sizes. Also, it caused
                     51: test 7 to loop, and test 9 to crash with a segfault.
                     52: 
                     53: The issue is the check for duplicate states, which is done by a simple linear
                     54: search up the state list. (Grep for "duplicate" below to find the code.) For
                     55: many patterns, there will never be many states active at one time, so a simple
                     56: linear search is fine. In patterns that have many active states, it might be a
                     57: bottleneck. The suggested code used an indexing scheme to remember which states
                     58: had previously been used for each character, and avoided the linear search when
                     59: it knew there was no chance of a duplicate. This was implemented when adding
                     60: states to the state lists.
                     61: 
                     62: I wrote some thread-safe, not-limited code to try something similar at the time
                     63: of checking for duplicates (instead of when adding states), using index vectors
                     64: on the stack. It did give a 13% improvement with one specially constructed
                     65: pattern for certain subject strings, but on other strings and on many of the
                     66: simpler patterns in the test suite it did worse. The major problem, I think,
                     67: was the extra time to initialize the index. This had to be done for each call
                     68: of internal_dfa_exec(). (The supplied patch used a static vector, initialized
                     69: only once - I suspect this was the cause of the problems with the tests.)
                     70: 
                     71: Overall, I concluded that the gains in some cases did not outweigh the losses
                     72: in others, so I abandoned this code. */
                     73: 
                     74: 
                     75: 
                     76: #ifdef HAVE_CONFIG_H
                     77: #include "config.h"
                     78: #endif
                     79: 
                     80: #define NLBLOCK md             /* Block containing newline information */
                     81: #define PSSTART start_subject  /* Field containing processed string start */
                     82: #define PSEND   end_subject    /* Field containing processed string end */
                     83: 
                     84: #include "pcre_internal.h"
                     85: 
                     86: 
                     87: /* For use to indent debugging output */
                     88: 
                     89: #define SP "                   "
                     90: 
                     91: 
                     92: /*************************************************
                     93: *      Code parameters and static tables         *
                     94: *************************************************/
                     95: 
                     96: /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
                     97: into others, under special conditions. A gap of 20 between the blocks should be
                     98: enough. The resulting opcodes don't have to be less than 256 because they are
                     99: never stored, so we push them well clear of the normal opcodes. */
                    100: 
                    101: #define OP_PROP_EXTRA       300   /* first base; presumably for \P, \p - confirm at use site */
                    102: #define OP_EXTUNI_EXTRA     320   /* previous base + 20; presumably for \X - confirm at use site */
                    103: #define OP_ANYNL_EXTRA      340   /* previous base + 20; presumably for \R - confirm at use site */
                    104: #define OP_HSPACE_EXTRA     360   /* previous base + 20; presumably for \H, \h - confirm at use site */
                    105: #define OP_VSPACE_EXTRA     380   /* previous base + 20; presumably for \V, \v - confirm at use site */
                    106: 
                    107: 
                    108: /* This table identifies those opcodes that are followed immediately by a
                    109: character that is to be tested in some way. This makes it possible to
                    110: centralize the loading of these characters. In the case of Type * etc, the
                    111: "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
                    112: small value. Non-zero values in the table are the offsets from the opcode where
                    113: the character is to be found. ***NOTE*** If the start of this table is
                    114: modified, the three tables that follow must also be modified. */
                    115: 
1.1.1.2 ! misho     116: static const pcre_uint8 coptable[] = {   /* one entry per opcode; 0 = opcode not followed by a character to test */
1.1       misho     117:   0,                             /* End                                    */
                    118:   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
                    119:   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
                    120:   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
                    121:   0, 0,                          /* \P, \p                                 */
                    122:   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
                    123:   0,                             /* \X                                     */
                    124:   0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
                    125:   1,                             /* Char                                   */
                    126:   1,                             /* Chari                                  */
                    127:   1,                             /* not                                    */
                    128:   1,                             /* noti                                   */
                    129:   /* Positive single-char repeats                                          */
                    130:   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
1.1.1.2 ! misho     131:   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto (char IMM2_SIZE further on) */
        !           132:   1+IMM2_SIZE,                   /* exact                                  */
        !           133:   1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
1.1       misho     134:   1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
1.1.1.2 ! misho     135:   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
        !           136:   1+IMM2_SIZE,                   /* exact I                                */
        !           137:   1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
1.1       misho     138:   /* Negative single-char repeats - only for chars < 256                   */
                    139:   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
1.1.1.2 ! misho     140:   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
        !           141:   1+IMM2_SIZE,                   /* NOT exact                              */
        !           142:   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
1.1       misho     143:   1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
1.1.1.2 ! misho     144:   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
        !           145:   1+IMM2_SIZE,                   /* NOT exact I                            */
        !           146:   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
1.1       misho     147:   /* Positive type repeats                                                 */
                    148:   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
1.1.1.2 ! misho     149:   1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
        !           150:   1+IMM2_SIZE,                   /* Type exact                             */
        !           151:   1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
1.1       misho     152:   /* Character class & ref repeats                                         */
                    153:   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
                    154:   0, 0,                          /* CRRANGE, CRMINRANGE                    */
                    155:   0,                             /* CLASS                                  */
                    156:   0,                             /* NCLASS                                 */
                    157:   0,                             /* XCLASS - variable length               */
                    158:   0,                             /* REF                                    */
                    159:   0,                             /* REFI                                   */
                    160:   0,                             /* RECURSE                                */
                    161:   0,                             /* CALLOUT                                */
                    162:   0,                             /* Alt                                    */
                    163:   0,                             /* Ket                                    */
                    164:   0,                             /* KetRmax                                */
                    165:   0,                             /* KetRmin                                */
                    166:   0,                             /* KetRpos                                */
                    167:   0,                             /* Reverse                                */
                    168:   0,                             /* Assert                                 */
                    169:   0,                             /* Assert not                             */
                    170:   0,                             /* Assert behind                          */
                    171:   0,                             /* Assert behind not                      */
                    172:   0, 0,                          /* ONCE, ONCE_NC                          */
                    173:   0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
                    174:   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
                    175:   0, 0,                          /* CREF, NCREF                            */
                    176:   0, 0,                          /* RREF, NRREF                            */
                    177:   0,                             /* DEF                                    */
                    178:   0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
                    179:   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
                    180:   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
                    181:   0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
                    182:   0, 0                           /* CLOSE, SKIPZERO                        */
                    183: };
                    184: 
                    185: /* This table identifies those opcodes that inspect a character. It is used to
                    186: remember the fact that a character could have been inspected when the end of
                    187: the subject is reached. ***NOTE*** If the start of this table is modified, the
                    188: two tables that follow must also be modified. */
                    189: 
1.1.1.2 ! misho     190: static const pcre_uint8 poptable[] = {   /* one entry per opcode; 1 = opcode inspects a character */
1.1       misho     191:   0,                             /* End                                    */
                    192:   0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b  (only \B, \b set)  */
                    193:   1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
                    194:   1, 1, 1,                       /* Any, AllAny, Anybyte                   */
                    195:   1, 1,                          /* \P, \p                                 */
                    196:   1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
                    197:   1,                             /* \X                                     */
                    198:   0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
                    199:   1,                             /* Char                                   */
                    200:   1,                             /* Chari                                  */
                    201:   1,                             /* not                                    */
                    202:   1,                             /* noti                                   */
                    203:   /* Positive single-char repeats                                          */
                    204:   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
                    205:   1, 1, 1,                       /* upto, minupto, exact                   */
                    206:   1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
                    207:   1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
                    208:   1, 1, 1,                       /* upto I, minupto I, exact I             */
                    209:   1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
                    210:   /* Negative single-char repeats - only for chars < 256                   */
                    211:   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
                    212:   1, 1, 1,                       /* NOT upto, minupto, exact               */
                    213:   1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
                    214:   1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
                    215:   1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
                    216:   1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
                    217:   /* Positive type repeats                                                 */
                    218:   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
                    219:   1, 1, 1,                       /* Type upto, minupto, exact              */
                    220:   1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
                    221:   /* Character class & ref repeats                                         */
                    222:   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
                    223:   1, 1,                          /* CRRANGE, CRMINRANGE                    */
                    224:   1,                             /* CLASS                                  */
                    225:   1,                             /* NCLASS                                 */
                    226:   1,                             /* XCLASS - variable length               */
                    227:   0,                             /* REF                                    */
                    228:   0,                             /* REFI                                   */
                    229:   0,                             /* RECURSE                                */
                    230:   0,                             /* CALLOUT                                */
                    231:   0,                             /* Alt                                    */
                    232:   0,                             /* Ket                                    */
                    233:   0,                             /* KetRmax                                */
                    234:   0,                             /* KetRmin                                */
                    235:   0,                             /* KetRpos                                */
                    236:   0,                             /* Reverse                                */
                    237:   0,                             /* Assert                                 */
                    238:   0,                             /* Assert not                             */
                    239:   0,                             /* Assert behind                          */
                    240:   0,                             /* Assert behind not                      */
                    241:   0, 0,                          /* ONCE, ONCE_NC                          */
                    242:   0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
                    243:   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
                    244:   0, 0,                          /* CREF, NCREF                            */
                    245:   0, 0,                          /* RREF, NRREF                            */
                    246:   0,                             /* DEF                                    */
                    247:   0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
                    248:   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
                    249:   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
                    250:   0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
                    251:   0, 0                           /* CLOSE, SKIPZERO                        */
                    252: };
                    253: 
                    254: /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
                    255: and \w */
                    256: 
1.1.1.2 ! misho     257: static const pcre_uint8 toptable1[] = {   /* ctype_* bit per opcode; same leading opcode order as coptable */
1.1       misho     258:   0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b */
                    259:   ctype_digit, ctype_digit,       /* \D, \d */
                    260:   ctype_space, ctype_space,       /* \S, \s */
                    261:   ctype_word,  ctype_word,        /* \W, \w */
                    262:   0, 0                            /* OP_ANY, OP_ALLANY */
                    263: };
                    264: 
1.1.1.2 ! misho     265: static const pcre_uint8 toptable2[] = {   /* NOTE(review): how this is combined with toptable1 is not visible here - confirm at use site */
1.1       misho     266:   0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b */
                    267:   ctype_digit, 0,                 /* \D, \d : only the negated class carries the bit */
                    268:   ctype_space, 0,                 /* \S, \s */
                    269:   ctype_word,  0,                 /* \W, \w */
                    270:   1, 1                            /* OP_ANY, OP_ALLANY */
                    271: };
                    272: 
                    273: 
                    274: /* Structure for holding data about a particular state, which is in effect the
                    275: current data for an active path through the match tree. It must consist
                    276: entirely of ints because the working vector we are passed, and which we put
                    277: these structures in, is a vector of ints. */
                    278: 
                    279: typedef struct stateblock {
                    280:   int offset;                     /* Offset to opcode; first argument of the ADD_* macros */
                    281:   int count;                      /* Count for repeats; second argument of the ADD_* macros */
                    282:   int data;                       /* Extra value; set by ADD_ACTIVE_DATA/ADD_NEW_DATA, untouched by ADD_ACTIVE/ADD_NEW */
                    283: } stateblock;
                    284: 
                    285: #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))   /* struct size in ints; the value has type size_t */
                    286: 
                    287: 
                    288: #ifdef PCRE_DEBUG
                    289: /*************************************************
                    290: *             Print character string             *
                    291: *************************************************/
                    292: 
                    293: /* Debug-only: print a string, showing non-printable units as \x hex escapes.
                    294: 
                    295: Arguments:
                    296:   p            points to string
                    297:   length       number of code units (pcre_uchar elements) to print
                    298:   f            where to print
                    299: 
                    300: Returns:       nothing
                    301: */
                    302: 
                    303: static void
1.1.1.2 ! misho     304: pchars(const pcre_uchar *p, int length, FILE *f)
1.1       misho     305: {
                    306: int c;   /* current code unit; may exceed 255 if pcre_uchar is wider than 8 bits - TODO confirm */
                    307: while (length-- > 0)
                    308:   {   /* isprint() is only defined for unsigned char values (and EOF), so range-check first */
                    309:   if ((c = *(p++)) >= 0 && c <= 255 && isprint(c))
                    310:     fprintf(f, "%c", c);
                    311:   else
                    312:     fprintf(f, "\\x%02x", c);
                    313:   }
                    314: }
                    315: #endif
                    316: 
                    317: 
                    318: 
                    319: /*************************************************
                    320: *    Execute a Regular Expression - DFA engine   *
                    321: *************************************************/
                    322: 
                    323: /* This internal function applies a compiled pattern to a subject string,
                    324: starting at a given point, using a DFA engine. This function is called from the
                    325: external one, possibly multiple times if the pattern is not anchored. The
                    326: function calls itself recursively for some kinds of subpattern.
                    327: 
                    328: Arguments:
                    329:   md                the match_data block with fixed information
                    330:   this_start_code   the opening bracket of this subexpression's code
                    331:   current_subject   where we currently are in the subject string
                    332:   start_offset      start offset in the subject string
                    333:   offsets           vector to contain the matching string offsets
                    334:   offsetcount       size of same
                    335:   workspace         vector of workspace
                    336:   wscount           size of same
                    337:   rlevel            function call recursion level
                    338: 
                    339: Returns:            > 0 => number of match offset pairs placed in offsets
                    340:                     = 0 => offsets overflowed; longest matches are present
                    341:                      -1 => failed to match
                    342:                    < -1 => some kind of unexpected problem
                    343: 
                    344: The following macros are used for adding states to the two state vectors (one
                    345: for the current character, one for the following character). */
                    346: 
                    347: #define ADD_ACTIVE(x,y) \
                    348:   if (active_count++ < wscount) \
                    349:     { \
                    350:     next_active_state->offset = (x); \
                    351:     next_active_state->count  = (y); \
                    352:     next_active_state++; \
                    353:     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
                    354:     } \
                    355:   else return PCRE_ERROR_DFA_WSSIZE
                    356: 
                    357: #define ADD_ACTIVE_DATA(x,y,z) \
                    358:   if (active_count++ < wscount) \
                    359:     { \
                    360:     next_active_state->offset = (x); \
                    361:     next_active_state->count  = (y); \
                    362:     next_active_state->data   = (z); \
                    363:     next_active_state++; \
                    364:     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
                    365:     } \
                    366:   else return PCRE_ERROR_DFA_WSSIZE
                    367: 
                    368: #define ADD_NEW(x,y) \
                    369:   if (new_count++ < wscount) \
                    370:     { \
                    371:     next_new_state->offset = (x); \
                    372:     next_new_state->count  = (y); \
                    373:     next_new_state++; \
                    374:     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
                    375:     } \
                    376:   else return PCRE_ERROR_DFA_WSSIZE
                    377: 
                    378: #define ADD_NEW_DATA(x,y,z) \
                    379:   if (new_count++ < wscount) \
                    380:     { \
                    381:     next_new_state->offset = (x); \
                    382:     next_new_state->count  = (y); \
                    383:     next_new_state->data   = (z); \
                    384:     next_new_state++; \
                    385:     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
                    386:     } \
                    387:   else return PCRE_ERROR_DFA_WSSIZE
                    388: 
                    389: /* And now, here is the code */
                    390: 
                    391: static int
                    392: internal_dfa_exec(
                    393:   dfa_match_data *md,
1.1.1.2 ! misho     394:   const pcre_uchar *this_start_code,
        !           395:   const pcre_uchar *current_subject,
1.1       misho     396:   int start_offset,
                    397:   int *offsets,
                    398:   int offsetcount,
                    399:   int *workspace,
                    400:   int wscount,
                    401:   int  rlevel)
                    402: {
                    403: stateblock *active_states, *new_states, *temp_states;
                    404: stateblock *next_active_state, *next_new_state;
                    405: 
1.1.1.2 ! misho     406: const pcre_uint8 *ctypes, *lcc, *fcc;
        !           407: const pcre_uchar *ptr;
        !           408: const pcre_uchar *end_code, *first_op;
1.1       misho     409: 
                    410: dfa_recursion_info new_recursive;
                    411: 
                    412: int active_count, new_count, match_count;
                    413: 
                    414: /* Some fields in the md block are frequently referenced, so we load them into
                    415: independent variables in the hope that this will perform better. */
                    416: 
1.1.1.2 ! misho     417: const pcre_uchar *start_subject = md->start_subject;
        !           418: const pcre_uchar *end_subject = md->end_subject;
        !           419: const pcre_uchar *start_code = md->start_code;
1.1       misho     420: 
1.1.1.2 ! misho     421: #ifdef SUPPORT_UTF
        !           422: BOOL utf = (md->poptions & PCRE_UTF8) != 0;
1.1       misho     423: #else
1.1.1.2 ! misho     424: BOOL utf = FALSE;
1.1       misho     425: #endif
                    426: 
                    427: rlevel++;
                    428: offsetcount &= (-2);
                    429: 
                    430: wscount -= 2;
                    431: wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
                    432:           (2 * INTS_PER_STATEBLOCK);
                    433: 
                    434: DPRINTF(("\n%.*s---------------------\n"
                    435:   "%.*sCall to internal_dfa_exec f=%d\n",
                    436:   rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
                    437: 
                    438: ctypes = md->tables + ctypes_offset;
                    439: lcc = md->tables + lcc_offset;
                    440: fcc = md->tables + fcc_offset;
                    441: 
                    442: match_count = PCRE_ERROR_NOMATCH;   /* A negative number */
                    443: 
                    444: active_states = (stateblock *)(workspace + 2);
                    445: next_new_state = new_states = active_states + wscount;
                    446: new_count = 0;
                    447: 
                    448: first_op = this_start_code + 1 + LINK_SIZE +
                    449:   ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
1.1.1.2 ! misho     450:     *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
        !           451:     ? IMM2_SIZE:0);
1.1       misho     452: 
                    453: /* The first thing in any (sub) pattern is a bracket of some sort. Push all
                    454: the alternative states onto the list, and find out where the end is. This
                    455: makes it possible to use this function recursively, when we want to stop at a
                    456: matching internal ket rather than at the end.
                    457: 
                    458: If the first opcode in the first alternative is OP_REVERSE, we are dealing with
                    459: a backward assertion. In that case, we have to find out the maximum amount to
                    460: move back, and set up each alternative appropriately. */
                    461: 
                    462: if (*first_op == OP_REVERSE)
                    463:   {
                    464:   int max_back = 0;
                    465:   int gone_back;
                    466: 
                    467:   end_code = this_start_code;
                    468:   do
                    469:     {
                    470:     int back = GET(end_code, 2+LINK_SIZE);
                    471:     if (back > max_back) max_back = back;
                    472:     end_code += GET(end_code, 1);
                    473:     }
                    474:   while (*end_code == OP_ALT);
                    475: 
                    476:   /* If we can't go back the amount required for the longest lookbehind
                    477:   pattern, go back as far as we can; some alternatives may still be viable. */
                    478: 
1.1.1.2 ! misho     479: #ifdef SUPPORT_UTF
1.1       misho     480:   /* In character mode we have to step back character by character */
                    481: 
1.1.1.2 ! misho     482:   if (utf)
1.1       misho     483:     {
                    484:     for (gone_back = 0; gone_back < max_back; gone_back++)
                    485:       {
                    486:       if (current_subject <= start_subject) break;
                    487:       current_subject--;
1.1.1.2 ! misho     488:       ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
1.1       misho     489:       }
                    490:     }
                    491:   else
                    492: #endif
                    493: 
                    494:   /* In byte-mode we can do this quickly. */
                    495: 
                    496:     {
                    497:     gone_back = (current_subject - max_back < start_subject)?
                    498:       (int)(current_subject - start_subject) : max_back;
                    499:     current_subject -= gone_back;
                    500:     }
                    501: 
                    502:   /* Save the earliest consulted character */
                    503: 
                    504:   if (current_subject < md->start_used_ptr)
                    505:     md->start_used_ptr = current_subject;
                    506: 
                    507:   /* Now we can process the individual branches. */
                    508: 
                    509:   end_code = this_start_code;
                    510:   do
                    511:     {
                    512:     int back = GET(end_code, 2+LINK_SIZE);
                    513:     if (back <= gone_back)
                    514:       {
                    515:       int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
                    516:       ADD_NEW_DATA(-bstate, 0, gone_back - back);
                    517:       }
                    518:     end_code += GET(end_code, 1);
                    519:     }
                    520:   while (*end_code == OP_ALT);
                    521:  }
                    522: 
                    523: /* This is the code for a "normal" subpattern (not a backward assertion). The
                    524: start of a whole pattern is always one of these. If we are at the top level,
                    525: we may be asked to restart matching from the same point that we reached for a
                    526: previous partial match. We still have to scan through the top-level branches to
                    527: find the end state. */
                    528: 
                    529: else
                    530:   {
                    531:   end_code = this_start_code;
                    532: 
                    533:   /* Restarting */
                    534: 
                    535:   if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
                    536:     {
                    537:     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
                    538:     new_count = workspace[1];
                    539:     if (!workspace[0])
                    540:       memcpy(new_states, active_states, new_count * sizeof(stateblock));
                    541:     }
                    542: 
                    543:   /* Not restarting */
                    544: 
                    545:   else
                    546:     {
                    547:     int length = 1 + LINK_SIZE +
                    548:       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
1.1.1.2 ! misho     549:         *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
        !           550:         ? IMM2_SIZE:0);
1.1       misho     551:     do
                    552:       {
                    553:       ADD_NEW((int)(end_code - start_code + length), 0);
                    554:       end_code += GET(end_code, 1);
                    555:       length = 1 + LINK_SIZE;
                    556:       }
                    557:     while (*end_code == OP_ALT);
                    558:     }
                    559:   }
                    560: 
                    561: workspace[0] = 0;    /* Bit indicating which vector is current */
                    562: 
1.1.1.2 ! misho     563: DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
1.1       misho     564: 
                    565: /* Loop for scanning the subject */
                    566: 
                    567: ptr = current_subject;
                    568: for (;;)
                    569:   {
                    570:   int i, j;
                    571:   int clen, dlen;
                    572:   unsigned int c, d;
                    573:   int forced_fail = 0;
                    574:   BOOL could_continue = FALSE;
                    575: 
                    576:   /* Make the new state list into the active state list and empty the
                    577:   new state list. */
                    578: 
                    579:   temp_states = active_states;
                    580:   active_states = new_states;
                    581:   new_states = temp_states;
                    582:   active_count = new_count;
                    583:   new_count = 0;
                    584: 
                    585:   workspace[0] ^= 1;              /* Remember for the restarting feature */
                    586:   workspace[1] = active_count;
                    587: 
                    588: #ifdef PCRE_DEBUG
                    589:   printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
1.1.1.2 ! misho     590:   pchars(ptr, STRLEN_UC(ptr), stdout);
1.1       misho     591:   printf("\"\n");
                    592: 
                    593:   printf("%.*sActive states: ", rlevel*2-2, SP);
                    594:   for (i = 0; i < active_count; i++)
                    595:     printf("%d/%d ", active_states[i].offset, active_states[i].count);
                    596:   printf("\n");
                    597: #endif
                    598: 
                    599:   /* Set the pointers for adding new states */
                    600: 
                    601:   next_active_state = active_states + active_count;
                    602:   next_new_state = new_states;
                    603: 
                    604:   /* Load the current character from the subject outside the loop, as many
                    605:   different states may want to look at it, and we assume that at least one
                    606:   will. */
                    607: 
                    608:   if (ptr < end_subject)
                    609:     {
                    610:     clen = 1;        /* Number of bytes in the character */
1.1.1.2 ! misho     611: #ifdef SUPPORT_UTF
        !           612:     if (utf) { GETCHARLEN(c, ptr, clen); } else
        !           613: #endif  /* SUPPORT_UTF */
1.1       misho     614:     c = *ptr;
                    615:     }
                    616:   else
                    617:     {
                    618:     clen = 0;        /* This indicates the end of the subject */
                    619:     c = NOTACHAR;    /* This value should never actually be used */
                    620:     }
                    621: 
                    622:   /* Scan up the active states and act on each one. The result of an action
                    623:   may be to add more states to the currently active list (e.g. on hitting a
                    624:   parenthesis) or it may be to put states on the new list, for considering
                    625:   when we move the character pointer on. */
                    626: 
                    627:   for (i = 0; i < active_count; i++)
                    628:     {
                    629:     stateblock *current_state = active_states + i;
                    630:     BOOL caseless = FALSE;
1.1.1.2 ! misho     631:     const pcre_uchar *code;
1.1       misho     632:     int state_offset = current_state->offset;
                    633:     int count, codevalue, rrc;
                    634: 
                    635: #ifdef PCRE_DEBUG
                    636:     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
                    637:     if (clen == 0) printf("EOL\n");
                    638:       else if (c > 32 && c < 127) printf("'%c'\n", c);
                    639:         else printf("0x%02x\n", c);
                    640: #endif
                    641: 
                    642:     /* A negative offset is a special case meaning "hold off going to this
                    643:     (negated) state until the number of characters in the data field have
                    644:     been skipped". */
                    645: 
                    646:     if (state_offset < 0)
                    647:       {
                    648:       if (current_state->data > 0)
                    649:         {
                    650:         DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
                    651:         ADD_NEW_DATA(state_offset, current_state->count,
                    652:           current_state->data - 1);
                    653:         continue;
                    654:         }
                    655:       else
                    656:         {
                    657:         current_state->offset = state_offset = -state_offset;
                    658:         }
                    659:       }
                    660: 
                    661:     /* Check for a duplicate state with the same count, and skip if found.
                    662:     See the note at the head of this module about the possibility of improving
                    663:     performance here. */
                    664: 
                    665:     for (j = 0; j < i; j++)
                    666:       {
                    667:       if (active_states[j].offset == state_offset &&
                    668:           active_states[j].count == current_state->count)
                    669:         {
                    670:         DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
                    671:         goto NEXT_ACTIVE_STATE;
                    672:         }
                    673:       }
                    674: 
                    675:     /* The state offset is the offset to the opcode */
                    676: 
                    677:     code = start_code + state_offset;
                    678:     codevalue = *code;
                    679: 
                    680:     /* If this opcode inspects a character, but we are at the end of the
                    681:     subject, remember the fact for use when testing for a partial match. */
                    682: 
                    683:     if (clen == 0 && poptable[codevalue] != 0)
                    684:       could_continue = TRUE;
                    685: 
                    686:     /* If this opcode is followed by an inline character, load it. It is
                    687:     tempting to test for the presence of a subject character here, but that
                    688:     is wrong, because sometimes zero repetitions of the subject are
                    689:     permitted.
                    690: 
                    691:     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
                    692:     argument that is not a data character - but is always one byte long. We
                    693:     have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in
                    694:     this case. To keep the other cases fast, convert these ones to new opcodes.
                    695:     */
                    696: 
                    697:     if (coptable[codevalue] > 0)
                    698:       {
                    699:       dlen = 1;
1.1.1.2 ! misho     700: #ifdef SUPPORT_UTF
        !           701:       if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
        !           702: #endif  /* SUPPORT_UTF */
1.1       misho     703:       d = code[coptable[codevalue]];
                    704:       if (codevalue >= OP_TYPESTAR)
                    705:         {
                    706:         switch(d)
                    707:           {
                    708:           case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
                    709:           case OP_NOTPROP:
                    710:           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
                    711:           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
                    712:           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
                    713:           case OP_NOT_HSPACE:
                    714:           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
                    715:           case OP_NOT_VSPACE:
                    716:           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
                    717:           default: break;
                    718:           }
                    719:         }
                    720:       }
                    721:     else
                    722:       {
                    723:       dlen = 0;         /* Not strictly necessary, but compilers moan */
                    724:       d = NOTACHAR;     /* if these variables are not set. */
                    725:       }
                    726: 
                    727: 
                    728:     /* Now process the individual opcodes */
                    729: 
                    730:     switch (codevalue)
                    731:       {
                    732: /* ========================================================================== */
                    733:       /* These cases are never obeyed. This is a fudge that causes a compile-
                    734:       time error if the vectors coptable or poptable, which are indexed by
                    735:       opcode, are not the correct length. It seems to be the only way to do
                    736:       such a check at compile time, as the sizeof() operator does not work
                    737:       in the C preprocessor. */
                    738: 
                    739:       case OP_TABLE_LENGTH:
                    740:       case OP_TABLE_LENGTH +
                    741:         ((sizeof(coptable) == OP_TABLE_LENGTH) &&
                    742:          (sizeof(poptable) == OP_TABLE_LENGTH)):
                    743:       break;
                    744: 
                    745: /* ========================================================================== */
                    746:       /* Reached a closing bracket. If not at the end of the pattern, carry
                    747:       on with the next opcode. For repeating opcodes, also add the repeat
                    748:       state. Note that KETRPOS will always be encountered at the end of the
                    749:       subpattern, because the possessive subpattern repeats are always handled
                    750:       using recursive calls. Thus, it never adds any new states.
                    751: 
                    752:       At the end of the (sub)pattern, unless we have an empty string and
                    753:       PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
                    754:       start of the subject, save the match data, shifting up all previous
                    755:       matches so we always have the longest first. */
                    756: 
                    757:       case OP_KET:
                    758:       case OP_KETRMIN:
                    759:       case OP_KETRMAX:
                    760:       case OP_KETRPOS:
                    761:       if (code != end_code)
                    762:         {
                    763:         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
                    764:         if (codevalue != OP_KET)
                    765:           {
                    766:           ADD_ACTIVE(state_offset - GET(code, 1), 0);
                    767:           }
                    768:         }
                    769:       else
                    770:         {
                    771:         if (ptr > current_subject ||
                    772:             ((md->moptions & PCRE_NOTEMPTY) == 0 &&
                    773:               ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
                    774:                 current_subject > start_subject + md->start_offset)))
                    775:           {
                    776:           if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
                    777:             else if (match_count > 0 && ++match_count * 2 > offsetcount)
                    778:               match_count = 0;
                    779:           count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
                    780:           if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
                    781:           if (offsetcount >= 2)
                    782:             {
                    783:             offsets[0] = (int)(current_subject - start_subject);
                    784:             offsets[1] = (int)(ptr - start_subject);
                    785:             DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
                    786:               offsets[1] - offsets[0], current_subject));
                    787:             }
                    788:           if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
                    789:             {
                    790:             DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
                    791:               "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
                    792:               match_count, rlevel*2-2, SP));
                    793:             return match_count;
                    794:             }
                    795:           }
                    796:         }
                    797:       break;
                    798: 
                    799: /* ========================================================================== */
                    800:       /* These opcodes add to the current list of states without looking
                    801:       at the current character. */
                    802: 
                    803:       /*-----------------------------------------------------------------*/
                    804:       case OP_ALT:
                    805:       do { code += GET(code, 1); } while (*code == OP_ALT);
                    806:       ADD_ACTIVE((int)(code - start_code), 0);
                    807:       break;
                    808: 
                    809:       /*-----------------------------------------------------------------*/
                    810:       case OP_BRA:
                    811:       case OP_SBRA:
                    812:       do
                    813:         {
                    814:         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
                    815:         code += GET(code, 1);
                    816:         }
                    817:       while (*code == OP_ALT);
                    818:       break;
                    819: 
                    820:       /*-----------------------------------------------------------------*/
                    821:       case OP_CBRA:
                    822:       case OP_SCBRA:
1.1.1.2 ! misho     823:       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
1.1       misho     824:       code += GET(code, 1);
                    825:       while (*code == OP_ALT)
                    826:         {
                    827:         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
                    828:         code += GET(code, 1);
                    829:         }
                    830:       break;
                    831: 
                    832:       /*-----------------------------------------------------------------*/
                    833:       case OP_BRAZERO:
                    834:       case OP_BRAMINZERO:
                    835:       ADD_ACTIVE(state_offset + 1, 0);
                    836:       code += 1 + GET(code, 2);
                    837:       while (*code == OP_ALT) code += GET(code, 1);
                    838:       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
                    839:       break;
                    840: 
                    841:       /*-----------------------------------------------------------------*/
                    842:       case OP_SKIPZERO:
                    843:       code += 1 + GET(code, 2);
                    844:       while (*code == OP_ALT) code += GET(code, 1);
                    845:       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
                    846:       break;
                    847: 
                    848:       /*-----------------------------------------------------------------*/
                    849:       case OP_CIRC:
                    850:       if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
                    851:         { ADD_ACTIVE(state_offset + 1, 0); }
                    852:       break;
                    853: 
                    854:       /*-----------------------------------------------------------------*/
                    855:       case OP_CIRCM:
                    856:       if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
                    857:           (ptr != end_subject && WAS_NEWLINE(ptr)))
                    858:         { ADD_ACTIVE(state_offset + 1, 0); }
                    859:       break;
                    860: 
                    861:       /*-----------------------------------------------------------------*/
                    862:       case OP_EOD:
                    863:       if (ptr >= end_subject)
                    864:         {
                    865:         if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
                    866:           could_continue = TRUE;
                    867:         else { ADD_ACTIVE(state_offset + 1, 0); }
                    868:         }
                    869:       break;
                    870: 
                    871:       /*-----------------------------------------------------------------*/
                    872:       case OP_SOD:
                    873:       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
                    874:       break;
                    875: 
                    876:       /*-----------------------------------------------------------------*/
                    877:       case OP_SOM:
                    878:       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
                    879:       break;
                    880: 
                    881: 
                    882: /* ========================================================================== */
                    883:       /* These opcodes inspect the next subject character, and sometimes
                    884:       the previous one as well, but do not have an argument. The variable
                    885:       clen contains the length of the current character and is zero if we are
                    886:       at the end of the subject. */
                    887: 
                    888:       /*-----------------------------------------------------------------*/
                    889:       case OP_ANY:
                    890:       if (clen > 0 && !IS_NEWLINE(ptr))
                    891:         { ADD_NEW(state_offset + 1, 0); }
                    892:       break;
                    893: 
                    894:       /*-----------------------------------------------------------------*/
                    895:       case OP_ALLANY:
                    896:       if (clen > 0)
                    897:         { ADD_NEW(state_offset + 1, 0); }
                    898:       break;
                    899: 
                    900:       /*-----------------------------------------------------------------*/
                    901:       case OP_EODN:
                    902:       if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
                    903:         could_continue = TRUE;
                    904:       else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
                    905:         { ADD_ACTIVE(state_offset + 1, 0); }
                    906:       break;
                    907: 
                    908:       /*-----------------------------------------------------------------*/
                    909:       case OP_DOLL:
                    910:       if ((md->moptions & PCRE_NOTEOL) == 0)
                    911:         {
                    912:         if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
                    913:           could_continue = TRUE;
                    914:         else if (clen == 0 ||
                    915:             ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
                    916:                (ptr == end_subject - md->nllen)
                    917:             ))
                    918:           { ADD_ACTIVE(state_offset + 1, 0); }
                    919:         }
                    920:       break;
                    921: 
                    922:       /*-----------------------------------------------------------------*/
                    923:       case OP_DOLLM:
                    924:       if ((md->moptions & PCRE_NOTEOL) == 0)
                    925:         {
                    926:         if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
                    927:           could_continue = TRUE;
                    928:         else if (clen == 0 ||
                    929:             ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
                    930:           { ADD_ACTIVE(state_offset + 1, 0); }
                    931:         }
                    932:       else if (IS_NEWLINE(ptr))
                    933:         { ADD_ACTIVE(state_offset + 1, 0); }
                    934:       break;
                    935: 
                    936:       /*-----------------------------------------------------------------*/
                    937: 
                    938:       case OP_DIGIT:
                    939:       case OP_WHITESPACE:
                    940:       case OP_WORDCHAR:
                    941:       if (clen > 0 && c < 256 &&
                    942:             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
                    943:         { ADD_NEW(state_offset + 1, 0); }
                    944:       break;
                    945: 
                    946:       /*-----------------------------------------------------------------*/
                    947:       case OP_NOT_DIGIT:
                    948:       case OP_NOT_WHITESPACE:
                    949:       case OP_NOT_WORDCHAR:
                    950:       if (clen > 0 && (c >= 256 ||
                    951:             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
                    952:         { ADD_NEW(state_offset + 1, 0); }
                    953:       break;
                    954: 
                    955:       /*-----------------------------------------------------------------*/
                    956:       case OP_WORD_BOUNDARY:
                    957:       case OP_NOT_WORD_BOUNDARY:
                    958:         {
                    959:         int left_word, right_word;
                    960: 
                    961:         if (ptr > start_subject)
                    962:           {
1.1.1.2 ! misho     963:           const pcre_uchar *temp = ptr - 1;
1.1       misho     964:           if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1.1.1.2 ! misho     965: #ifdef SUPPORT_UTF
        !           966:           if (utf) { BACKCHAR(temp); }
1.1       misho     967: #endif
                    968:           GETCHARTEST(d, temp);
                    969: #ifdef SUPPORT_UCP
                    970:           if ((md->poptions & PCRE_UCP) != 0)
                    971:             {
                    972:             if (d == '_') left_word = TRUE; else
                    973:               {
                    974:               int cat = UCD_CATEGORY(d);
                    975:               left_word = (cat == ucp_L || cat == ucp_N);
                    976:               }
                    977:             }
                    978:           else
                    979: #endif
                    980:           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
                    981:           }
                    982:         else left_word = FALSE;
                    983: 
                    984:         if (clen > 0)
                    985:           {
                    986: #ifdef SUPPORT_UCP
                    987:           if ((md->poptions & PCRE_UCP) != 0)
                    988:             {
                    989:             if (c == '_') right_word = TRUE; else
                    990:               {
                    991:               int cat = UCD_CATEGORY(c);
                    992:               right_word = (cat == ucp_L || cat == ucp_N);
                    993:               }
                    994:             }
                    995:           else
                    996: #endif
                    997:           right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
                    998:           }
                    999:         else right_word = FALSE;
                   1000: 
                   1001:         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
                   1002:           { ADD_ACTIVE(state_offset + 1, 0); }
                   1003:         }
                   1004:       break;
                   1005: 
                   1006: 
                   1007:       /*-----------------------------------------------------------------*/
                   1008:       /* Check the next character by Unicode property. We will get here only
                   1009:       if the support is in the binary; otherwise a compile-time error occurs.
                   1010:       */
                   1011: 
                   1012: #ifdef SUPPORT_UCP
                   1013:       case OP_PROP:
                   1014:       case OP_NOTPROP:
                   1015:       if (clen > 0)
                   1016:         {
                   1017:         BOOL OK;
                   1018:         const ucd_record * prop = GET_UCD(c);
                   1019:         switch(code[1])
                   1020:           {
                   1021:           case PT_ANY:
                   1022:           OK = TRUE;
                   1023:           break;
                   1024: 
                   1025:           case PT_LAMP:
                   1026:           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
                   1027:                prop->chartype == ucp_Lt;
                   1028:           break;
                   1029: 
                   1030:           case PT_GC:
1.1.1.2 ! misho    1031:           OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1.1       misho    1032:           break;
                   1033: 
                   1034:           case PT_PC:
                   1035:           OK = prop->chartype == code[2];
                   1036:           break;
                   1037: 
                   1038:           case PT_SC:
                   1039:           OK = prop->script == code[2];
                   1040:           break;
                   1041: 
                   1042:           /* These are specials for combination cases. */
                   1043: 
                   1044:           case PT_ALNUM:
1.1.1.2 ! misho    1045:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
        !          1046:                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1.1       misho    1047:           break;
                   1048: 
                   1049:           case PT_SPACE:    /* Perl space */
1.1.1.2 ! misho    1050:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1       misho    1051:                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
                   1052:           break;
                   1053: 
                   1054:           case PT_PXSPACE:  /* POSIX space */
1.1.1.2 ! misho    1055:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1       misho    1056:                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
                   1057:                c == CHAR_FF || c == CHAR_CR;
                   1058:           break;
                   1059: 
                   1060:           case PT_WORD:
1.1.1.2 ! misho    1061:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
        !          1062:                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1.1       misho    1063:                c == CHAR_UNDERSCORE;
                   1064:           break;
                   1065: 
                   1066:           /* Should never occur, but keep compilers from grumbling. */
                   1067: 
                   1068:           default:
                   1069:           OK = codevalue != OP_PROP;
                   1070:           break;
                   1071:           }
                   1072: 
                   1073:         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
                   1074:         }
                   1075:       break;
                   1076: #endif
                   1077: 
                   1078: 
                   1079: 
                   1080: /* ========================================================================== */
                   1081:       /* These opcodes likewise inspect the subject character, but have an
                   1082:       argument that is not a data character. It is one of these opcodes:
                   1083:       OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
                   1084:       OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
                   1085: 
                   1086:       case OP_TYPEPLUS:
                   1087:       case OP_TYPEMINPLUS:
                   1088:       case OP_TYPEPOSPLUS:
                   1089:       count = current_state->count;  /* Already matched */
                   1090:       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
                   1091:       if (clen > 0)
                   1092:         {
                   1093:         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
                   1094:             (c < 256 &&
                   1095:               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                   1096:               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
                   1097:           {
                   1098:           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
                   1099:             {
                   1100:             active_count--;            /* Remove non-match possibility */
                   1101:             next_active_state--;
                   1102:             }
                   1103:           count++;
                   1104:           ADD_NEW(state_offset, count);
                   1105:           }
                   1106:         }
                   1107:       break;
                   1108: 
                   1109:       /*-----------------------------------------------------------------*/
                   1110:       case OP_TYPEQUERY:
                   1111:       case OP_TYPEMINQUERY:
                   1112:       case OP_TYPEPOSQUERY:
                   1113:       ADD_ACTIVE(state_offset + 2, 0);
                   1114:       if (clen > 0)
                   1115:         {
                   1116:         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
                   1117:             (c < 256 &&
                   1118:               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                   1119:               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
                   1120:           {
                   1121:           if (codevalue == OP_TYPEPOSQUERY)
                   1122:             {
                   1123:             active_count--;            /* Remove non-match possibility */
                   1124:             next_active_state--;
                   1125:             }
                   1126:           ADD_NEW(state_offset + 2, 0);
                   1127:           }
                   1128:         }
                   1129:       break;
                   1130: 
                   1131:       /*-----------------------------------------------------------------*/
                   1132:       case OP_TYPESTAR:
                   1133:       case OP_TYPEMINSTAR:
                   1134:       case OP_TYPEPOSSTAR:
                   1135:       ADD_ACTIVE(state_offset + 2, 0);
                   1136:       if (clen > 0)
                   1137:         {
                   1138:         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
                   1139:             (c < 256 &&
                   1140:               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                   1141:               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
                   1142:           {
                   1143:           if (codevalue == OP_TYPEPOSSTAR)
                   1144:             {
                   1145:             active_count--;            /* Remove non-match possibility */
                   1146:             next_active_state--;
                   1147:             }
                   1148:           ADD_NEW(state_offset, 0);
                   1149:           }
                   1150:         }
                   1151:       break;
                   1152: 
                   1153:       /*-----------------------------------------------------------------*/
                   1154:       case OP_TYPEEXACT:
                   1155:       count = current_state->count;  /* Number already matched */
                   1156:       if (clen > 0)
                   1157:         {
                   1158:         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
                   1159:             (c < 256 &&
                   1160:               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                   1161:               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
                   1162:           {
                   1163:           if (++count >= GET2(code, 1))
1.1.1.2 ! misho    1164:             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1.1       misho    1165:           else
                   1166:             { ADD_NEW(state_offset, count); }
                   1167:           }
                   1168:         }
                   1169:       break;
                   1170: 
                   1171:       /*-----------------------------------------------------------------*/
                   1172:       case OP_TYPEUPTO:
                   1173:       case OP_TYPEMINUPTO:
                   1174:       case OP_TYPEPOSUPTO:
1.1.1.2 ! misho    1175:       ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1.1       misho    1176:       count = current_state->count;  /* Number already matched */
                   1177:       if (clen > 0)
                   1178:         {
                   1179:         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
                   1180:             (c < 256 &&
                   1181:               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                   1182:               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
                   1183:           {
                   1184:           if (codevalue == OP_TYPEPOSUPTO)
                   1185:             {
                   1186:             active_count--;           /* Remove non-match possibility */
                   1187:             next_active_state--;
                   1188:             }
                   1189:           if (++count >= GET2(code, 1))
1.1.1.2 ! misho    1190:             { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1.1       misho    1191:           else
                   1192:             { ADD_NEW(state_offset, count); }
                   1193:           }
                   1194:         }
                   1195:       break;
                   1196: 
                   1197: /* ========================================================================== */
                   1198:       /* These are virtual opcodes that are used when something like
                   1199:       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_EXTUNI, OP_ANYNL, OP_VSPACE, or
                   1200:       OP_HSPACE as its argument. This keeps the code above fast for the other
                   1201:       cases. The argument is in the d variable. */
                   1202: 
                   1203: #ifdef SUPPORT_UCP
                   1204:       case OP_PROP_EXTRA + OP_TYPEPLUS:
                   1205:       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
                   1206:       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
                   1207:       count = current_state->count;           /* Already matched */
                   1208:       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
                   1209:       if (clen > 0)
                   1210:         {
                   1211:         BOOL OK;
                   1212:         const ucd_record * prop = GET_UCD(c);
                   1213:         switch(code[2])
                   1214:           {
                   1215:           case PT_ANY:
                   1216:           OK = TRUE;
                   1217:           break;
                   1218: 
                   1219:           case PT_LAMP:
                   1220:           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
                   1221:             prop->chartype == ucp_Lt;
                   1222:           break;
                   1223: 
                   1224:           case PT_GC:
1.1.1.2 ! misho    1225:           OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1.1       misho    1226:           break;
                   1227: 
                   1228:           case PT_PC:
                   1229:           OK = prop->chartype == code[3];
                   1230:           break;
                   1231: 
                   1232:           case PT_SC:
                   1233:           OK = prop->script == code[3];
                   1234:           break;
                   1235: 
                   1236:           /* These are specials for combination cases. */
                   1237: 
                   1238:           case PT_ALNUM:
1.1.1.2 ! misho    1239:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
        !          1240:                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1.1       misho    1241:           break;
                   1242: 
                   1243:           case PT_SPACE:    /* Perl space */
1.1.1.2 ! misho    1244:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1       misho    1245:                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
                   1246:           break;
                   1247: 
                   1248:           case PT_PXSPACE:  /* POSIX space */
1.1.1.2 ! misho    1249:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1       misho    1250:                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
                   1251:                c == CHAR_FF || c == CHAR_CR;
                   1252:           break;
                   1253: 
                   1254:           case PT_WORD:
1.1.1.2 ! misho    1255:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
        !          1256:                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1.1       misho    1257:                c == CHAR_UNDERSCORE;
                   1258:           break;
                   1259: 
                   1260:           /* Should never occur, but keep compilers from grumbling. */
                   1261: 
                   1262:           default:
                   1263:           OK = codevalue != OP_PROP;
                   1264:           break;
                   1265:           }
                   1266: 
                   1267:         if (OK == (d == OP_PROP))
                   1268:           {
                   1269:           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
                   1270:             {
                   1271:             active_count--;           /* Remove non-match possibility */
                   1272:             next_active_state--;
                   1273:             }
                   1274:           count++;
                   1275:           ADD_NEW(state_offset, count);
                   1276:           }
                   1277:         }
                   1278:       break;
                   1279: 
                   1280:       /*-----------------------------------------------------------------*/
                   1281:       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
                   1282:       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
                   1283:       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
                   1284:       count = current_state->count;  /* Already matched */
                   1285:       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
                   1286:       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
                   1287:         {
1.1.1.2 ! misho    1288:         const pcre_uchar *nptr = ptr + clen;
1.1       misho    1289:         int ncount = 0;
                   1290:         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
                   1291:           {
                   1292:           active_count--;           /* Remove non-match possibility */
                   1293:           next_active_state--;
                   1294:           }
                   1295:         while (nptr < end_subject)
                   1296:           {
                   1297:           int nd;
                   1298:           int ndlen = 1;
                   1299:           GETCHARLEN(nd, nptr, ndlen);
                   1300:           if (UCD_CATEGORY(nd) != ucp_M) break;
                   1301:           ncount++;
                   1302:           nptr += ndlen;
                   1303:           }
                   1304:         count++;
                   1305:         ADD_NEW_DATA(-state_offset, count, ncount);
                   1306:         }
                   1307:       break;
                   1308: #endif
                   1309: 
                   1310:       /*-----------------------------------------------------------------*/
                   1311:       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
                   1312:       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
                   1313:       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
                   1314:       count = current_state->count;  /* Already matched */
                   1315:       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
                   1316:       if (clen > 0)
                   1317:         {
                   1318:         int ncount = 0;
                   1319:         switch (c)
                   1320:           {
                   1321:           case 0x000b:
                   1322:           case 0x000c:
                   1323:           case 0x0085:
                   1324:           case 0x2028:
                   1325:           case 0x2029:
                   1326:           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
                   1327:           goto ANYNL01;
                   1328: 
                   1329:           case 0x000d:
                   1330:           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
                   1331:           /* Fall through */
                   1332: 
                   1333:           ANYNL01:
                   1334:           case 0x000a:
                   1335:           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
                   1336:             {
                   1337:             active_count--;           /* Remove non-match possibility */
                   1338:             next_active_state--;
                   1339:             }
                   1340:           count++;
                   1341:           ADD_NEW_DATA(-state_offset, count, ncount);
                   1342:           break;
                   1343: 
                   1344:           default:
                   1345:           break;
                   1346:           }
                   1347:         }
                   1348:       break;
                   1349: 
                   1350:       /*-----------------------------------------------------------------*/
                   1351:       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
                   1352:       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
                   1353:       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
                   1354:       count = current_state->count;  /* Already matched */
                   1355:       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
                   1356:       if (clen > 0)
                   1357:         {
                   1358:         BOOL OK;
                   1359:         switch (c)
                   1360:           {
                   1361:           case 0x000a:
                   1362:           case 0x000b:
                   1363:           case 0x000c:
                   1364:           case 0x000d:
                   1365:           case 0x0085:
                   1366:           case 0x2028:
                   1367:           case 0x2029:
                   1368:           OK = TRUE;
                   1369:           break;
                   1370: 
                   1371:           default:
                   1372:           OK = FALSE;
                   1373:           break;
                   1374:           }
                   1375: 
                   1376:         if (OK == (d == OP_VSPACE))
                   1377:           {
                   1378:           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
                   1379:             {
                   1380:             active_count--;           /* Remove non-match possibility */
                   1381:             next_active_state--;
                   1382:             }
                   1383:           count++;
                   1384:           ADD_NEW_DATA(-state_offset, count, 0);
                   1385:           }
                   1386:         }
                   1387:       break;
                   1388: 
                   1389:       /*-----------------------------------------------------------------*/
                   1390:       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
                   1391:       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
                   1392:       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
                   1393:       count = current_state->count;  /* Already matched */
                   1394:       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
                   1395:       if (clen > 0)
                   1396:         {
                   1397:         BOOL OK;
                   1398:         switch (c)
                   1399:           {
                   1400:           case 0x09:      /* HT */
                   1401:           case 0x20:      /* SPACE */
                   1402:           case 0xa0:      /* NBSP */
                   1403:           case 0x1680:    /* OGHAM SPACE MARK */
                   1404:           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
                   1405:           case 0x2000:    /* EN QUAD */
                   1406:           case 0x2001:    /* EM QUAD */
                   1407:           case 0x2002:    /* EN SPACE */
                   1408:           case 0x2003:    /* EM SPACE */
                   1409:           case 0x2004:    /* THREE-PER-EM SPACE */
                   1410:           case 0x2005:    /* FOUR-PER-EM SPACE */
                   1411:           case 0x2006:    /* SIX-PER-EM SPACE */
                   1412:           case 0x2007:    /* FIGURE SPACE */
                   1413:           case 0x2008:    /* PUNCTUATION SPACE */
                   1414:           case 0x2009:    /* THIN SPACE */
                   1415:           case 0x200A:    /* HAIR SPACE */
                   1416:           case 0x202f:    /* NARROW NO-BREAK SPACE */
                   1417:           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
                   1418:           case 0x3000:    /* IDEOGRAPHIC SPACE */
                   1419:           OK = TRUE;
                   1420:           break;
                   1421: 
                   1422:           default:
                   1423:           OK = FALSE;
                   1424:           break;
                   1425:           }
                   1426: 
                   1427:         if (OK == (d == OP_HSPACE))
                   1428:           {
                   1429:           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
                   1430:             {
                   1431:             active_count--;           /* Remove non-match possibility */
                   1432:             next_active_state--;
                   1433:             }
                   1434:           count++;
                   1435:           ADD_NEW_DATA(-state_offset, count, 0);
                   1436:           }
                   1437:         }
                   1438:       break;
                   1439: 
                   1440:       /*-----------------------------------------------------------------*/
                   1441: #ifdef SUPPORT_UCP
                   1442:       case OP_PROP_EXTRA + OP_TYPEQUERY:
                   1443:       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
                   1444:       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
                   1445:       count = 4;
                   1446:       goto QS1;
                   1447: 
                   1448:       case OP_PROP_EXTRA + OP_TYPESTAR:
                   1449:       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
                   1450:       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
                   1451:       count = 0;
                   1452: 
                   1453:       QS1:
                   1454: 
                   1455:       ADD_ACTIVE(state_offset + 4, 0);
                   1456:       if (clen > 0)
                   1457:         {
                   1458:         BOOL OK;
                   1459:         const ucd_record * prop = GET_UCD(c);
                   1460:         switch(code[2])
                   1461:           {
                   1462:           case PT_ANY:
                   1463:           OK = TRUE;
                   1464:           break;
                   1465: 
                   1466:           case PT_LAMP:
                   1467:           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
                   1468:             prop->chartype == ucp_Lt;
                   1469:           break;
                   1470: 
                   1471:           case PT_GC:
1.1.1.2 ! misho    1472:           OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1.1       misho    1473:           break;
                   1474: 
                   1475:           case PT_PC:
                   1476:           OK = prop->chartype == code[3];
                   1477:           break;
                   1478: 
                   1479:           case PT_SC:
                   1480:           OK = prop->script == code[3];
                   1481:           break;
                   1482: 
                   1483:           /* These are specials for combination cases. */
                   1484: 
                   1485:           case PT_ALNUM:
1.1.1.2 ! misho    1486:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
        !          1487:                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1.1       misho    1488:           break;
                   1489: 
                   1490:           case PT_SPACE:    /* Perl space */
1.1.1.2 ! misho    1491:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1       misho    1492:                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
                   1493:           break;
                   1494: 
                   1495:           case PT_PXSPACE:  /* POSIX space */
1.1.1.2 ! misho    1496:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1       misho    1497:                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
                   1498:                c == CHAR_FF || c == CHAR_CR;
                   1499:           break;
                   1500: 
                   1501:           case PT_WORD:
1.1.1.2 ! misho    1502:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
        !          1503:                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1.1       misho    1504:                c == CHAR_UNDERSCORE;
                   1505:           break;
                   1506: 
                   1507:           /* Should never occur, but keep compilers from grumbling. */
                   1508: 
                   1509:           default:
                   1510:           OK = codevalue != OP_PROP;
                   1511:           break;
                   1512:           }
                   1513: 
                   1514:         if (OK == (d == OP_PROP))
                   1515:           {
                   1516:           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
                   1517:               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
                   1518:             {
                   1519:             active_count--;           /* Remove non-match possibility */
                   1520:             next_active_state--;
                   1521:             }
                   1522:           ADD_NEW(state_offset + count, 0);
                   1523:           }
                   1524:         }
                   1525:       break;
                   1526: 
                   1527:       /*-----------------------------------------------------------------*/
                   1528:       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
                   1529:       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
                   1530:       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
                   1531:       count = 2;
                   1532:       goto QS2;
                   1533: 
                   1534:       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
                   1535:       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
                   1536:       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
                   1537:       count = 0;
                   1538: 
                   1539:       QS2:
                   1540: 
                   1541:       ADD_ACTIVE(state_offset + 2, 0);
                   1542:       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
                   1543:         {
1.1.1.2 ! misho    1544:         const pcre_uchar *nptr = ptr + clen;
1.1       misho    1545:         int ncount = 0;
                   1546:         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
                   1547:             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
                   1548:           {
                   1549:           active_count--;           /* Remove non-match possibility */
                   1550:           next_active_state--;
                   1551:           }
                   1552:         while (nptr < end_subject)
                   1553:           {
                   1554:           int nd;
                   1555:           int ndlen = 1;
                   1556:           GETCHARLEN(nd, nptr, ndlen);
                   1557:           if (UCD_CATEGORY(nd) != ucp_M) break;
                   1558:           ncount++;
                   1559:           nptr += ndlen;
                   1560:           }
                   1561:         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
                   1562:         }
                   1563:       break;
                   1564: #endif
                   1565: 
                   1566:       /*-----------------------------------------------------------------*/
                   1567:       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
                   1568:       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
                   1569:       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
                   1570:       count = 2;
                   1571:       goto QS3;
                   1572: 
                   1573:       case OP_ANYNL_EXTRA + OP_TYPESTAR:
                   1574:       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
                   1575:       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
                   1576:       count = 0;
                   1577: 
                   1578:       QS3:
                   1579:       ADD_ACTIVE(state_offset + 2, 0);
                   1580:       if (clen > 0)
                   1581:         {
                   1582:         int ncount = 0;
                   1583:         switch (c)
                   1584:           {
                   1585:           case 0x000b:
                   1586:           case 0x000c:
                   1587:           case 0x0085:
                   1588:           case 0x2028:
                   1589:           case 0x2029:
                   1590:           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
                   1591:           goto ANYNL02;
                   1592: 
                   1593:           case 0x000d:
                   1594:           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
                   1595:           /* Fall through */
                   1596: 
                   1597:           ANYNL02:
                   1598:           case 0x000a:
                   1599:           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
                   1600:               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
                   1601:             {
                   1602:             active_count--;           /* Remove non-match possibility */
                   1603:             next_active_state--;
                   1604:             }
                   1605:           ADD_NEW_DATA(-(state_offset + count), 0, ncount);
                   1606:           break;
                   1607: 
                   1608:           default:
                   1609:           break;
                   1610:           }
                   1611:         }
                   1612:       break;
                   1613: 
                   1614:       /*-----------------------------------------------------------------*/
                   1615:       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
                   1616:       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
                   1617:       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
                   1618:       count = 2;
                   1619:       goto QS4;
                   1620: 
                   1621:       case OP_VSPACE_EXTRA + OP_TYPESTAR:
                   1622:       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
                   1623:       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
                   1624:       count = 0;
                   1625: 
                   1626:       QS4:
                   1627:       ADD_ACTIVE(state_offset + 2, 0);
                   1628:       if (clen > 0)
                   1629:         {
                   1630:         BOOL OK;
                   1631:         switch (c)
                   1632:           {
                   1633:           case 0x000a:
                   1634:           case 0x000b:
                   1635:           case 0x000c:
                   1636:           case 0x000d:
                   1637:           case 0x0085:
                   1638:           case 0x2028:
                   1639:           case 0x2029:
                   1640:           OK = TRUE;
                   1641:           break;
                   1642: 
                   1643:           default:
                   1644:           OK = FALSE;
                   1645:           break;
                   1646:           }
                   1647:         if (OK == (d == OP_VSPACE))
                   1648:           {
                   1649:           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
                   1650:               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
                   1651:             {
                   1652:             active_count--;           /* Remove non-match possibility */
                   1653:             next_active_state--;
                   1654:             }
                   1655:           ADD_NEW_DATA(-(state_offset + count), 0, 0);
                   1656:           }
                   1657:         }
                   1658:       break;
                   1659: 
                   1660:       /*-----------------------------------------------------------------*/
                   1661:       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
                   1662:       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
                   1663:       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
                   1664:       count = 2;
                   1665:       goto QS5;
                   1666: 
                   1667:       case OP_HSPACE_EXTRA + OP_TYPESTAR:
                   1668:       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
                   1669:       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
                   1670:       count = 0;
                   1671: 
                   1672:       QS5:
                   1673:       ADD_ACTIVE(state_offset + 2, 0);
                   1674:       if (clen > 0)
                   1675:         {
                   1676:         BOOL OK;
                   1677:         switch (c)
                   1678:           {
                   1679:           case 0x09:      /* HT */
                   1680:           case 0x20:      /* SPACE */
                   1681:           case 0xa0:      /* NBSP */
                   1682:           case 0x1680:    /* OGHAM SPACE MARK */
                   1683:           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
                   1684:           case 0x2000:    /* EN QUAD */
                   1685:           case 0x2001:    /* EM QUAD */
                   1686:           case 0x2002:    /* EN SPACE */
                   1687:           case 0x2003:    /* EM SPACE */
                   1688:           case 0x2004:    /* THREE-PER-EM SPACE */
                   1689:           case 0x2005:    /* FOUR-PER-EM SPACE */
                   1690:           case 0x2006:    /* SIX-PER-EM SPACE */
                   1691:           case 0x2007:    /* FIGURE SPACE */
                   1692:           case 0x2008:    /* PUNCTUATION SPACE */
                   1693:           case 0x2009:    /* THIN SPACE */
                   1694:           case 0x200A:    /* HAIR SPACE */
                   1695:           case 0x202f:    /* NARROW NO-BREAK SPACE */
                   1696:           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
                   1697:           case 0x3000:    /* IDEOGRAPHIC SPACE */
                   1698:           OK = TRUE;
                   1699:           break;
                   1700: 
                   1701:           default:
                   1702:           OK = FALSE;
                   1703:           break;
                   1704:           }
                   1705: 
                   1706:         if (OK == (d == OP_HSPACE))
                   1707:           {
                   1708:           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
                   1709:               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
                   1710:             {
                   1711:             active_count--;           /* Remove non-match possibility */
                   1712:             next_active_state--;
                   1713:             }
                   1714:           ADD_NEW_DATA(-(state_offset + count), 0, 0);
                   1715:           }
                   1716:         }
                   1717:       break;
                   1718: 
                   1719:       /*-----------------------------------------------------------------*/
                   1720: #ifdef SUPPORT_UCP
                   1721:       case OP_PROP_EXTRA + OP_TYPEEXACT:
                   1722:       case OP_PROP_EXTRA + OP_TYPEUPTO:
                   1723:       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
                   1724:       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
                   1725:       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1.1.1.2 ! misho    1726:         { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1.1       misho    1727:       count = current_state->count;  /* Number already matched */
                   1728:       if (clen > 0)
                   1729:         {
                   1730:         BOOL OK;
                   1731:         const ucd_record * prop = GET_UCD(c);
1.1.1.2 ! misho    1732:         switch(code[1 + IMM2_SIZE + 1])
1.1       misho    1733:           {
                   1734:           case PT_ANY:
                   1735:           OK = TRUE;
                   1736:           break;
                   1737: 
                   1738:           case PT_LAMP:
                   1739:           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
                   1740:             prop->chartype == ucp_Lt;
                   1741:           break;
                   1742: 
                   1743:           case PT_GC:
1.1.1.2 ! misho    1744:           OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1.1       misho    1745:           break;
                   1746: 
                   1747:           case PT_PC:
1.1.1.2 ! misho    1748:           OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1.1       misho    1749:           break;
                   1750: 
                   1751:           case PT_SC:
1.1.1.2 ! misho    1752:           OK = prop->script == code[1 + IMM2_SIZE + 2];
1.1       misho    1753:           break;
                   1754: 
                   1755:           /* These are specials for combination cases. */
                   1756: 
                   1757:           case PT_ALNUM:
1.1.1.2 ! misho    1758:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
        !          1759:                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1.1       misho    1760:           break;
                   1761: 
                   1762:           case PT_SPACE:    /* Perl space */
1.1.1.2 ! misho    1763:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1       misho    1764:                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
                   1765:           break;
                   1766: 
                   1767:           case PT_PXSPACE:  /* POSIX space */
1.1.1.2 ! misho    1768:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.1       misho    1769:                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
                   1770:                c == CHAR_FF || c == CHAR_CR;
                   1771:           break;
                   1772: 
                   1773:           case PT_WORD:
1.1.1.2 ! misho    1774:           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
        !          1775:                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1.1       misho    1776:                c == CHAR_UNDERSCORE;
                   1777:           break;
                   1778: 
                   1779:           /* Should never occur, but keep compilers from grumbling. */
                   1780: 
                   1781:           default:
                   1782:           OK = codevalue != OP_PROP;
                   1783:           break;
                   1784:           }
                   1785: 
                   1786:         if (OK == (d == OP_PROP))
                   1787:           {
                   1788:           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
                   1789:             {
                   1790:             active_count--;           /* Remove non-match possibility */
                   1791:             next_active_state--;
                   1792:             }
                   1793:           if (++count >= GET2(code, 1))
1.1.1.2 ! misho    1794:             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1.1       misho    1795:           else
                   1796:             { ADD_NEW(state_offset, count); }
                   1797:           }
                   1798:         }
                   1799:       break;
                   1800: 
                   1801:       /*-----------------------------------------------------------------*/
                   1802:       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
                   1803:       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
                   1804:       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
                   1805:       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
                   1806:       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1.1.1.2 ! misho    1807:         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1.1       misho    1808:       count = current_state->count;  /* Number already matched */
                   1809:       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
                   1810:         {
1.1.1.2 ! misho    1811:         const pcre_uchar *nptr = ptr + clen;
1.1       misho    1812:         int ncount = 0;
                   1813:         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
                   1814:           {
                   1815:           active_count--;           /* Remove non-match possibility */
                   1816:           next_active_state--;
                   1817:           }
                   1818:         while (nptr < end_subject)
                   1819:           {
                   1820:           int nd;
                   1821:           int ndlen = 1;
                   1822:           GETCHARLEN(nd, nptr, ndlen);
                   1823:           if (UCD_CATEGORY(nd) != ucp_M) break;
                   1824:           ncount++;
                   1825:           nptr += ndlen;
                   1826:           }
                   1827:         if (++count >= GET2(code, 1))
1.1.1.2 ! misho    1828:           { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1.1       misho    1829:         else
                   1830:           { ADD_NEW_DATA(-state_offset, count, ncount); }
                   1831:         }
                   1832:       break;
                   1833: #endif
                   1834: 
                   1835:       /*-----------------------------------------------------------------*/
                   1836:       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
                   1837:       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
                   1838:       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
                   1839:       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
                   1840:       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1.1.1.2 ! misho    1841:         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1.1       misho    1842:       count = current_state->count;  /* Number already matched */
                   1843:       if (clen > 0)
                   1844:         {
                   1845:         int ncount = 0;
                   1846:         switch (c)
                   1847:           {
                   1848:           case 0x000b:
                   1849:           case 0x000c:
                   1850:           case 0x0085:
                   1851:           case 0x2028:
                   1852:           case 0x2029:
                   1853:           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
                   1854:           goto ANYNL03;
                   1855: 
                   1856:           case 0x000d:
                   1857:           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
                   1858:           /* Fall through */
                   1859: 
                   1860:           ANYNL03:
                   1861:           case 0x000a:
                   1862:           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
                   1863:             {
                   1864:             active_count--;           /* Remove non-match possibility */
                   1865:             next_active_state--;
                   1866:             }
                   1867:           if (++count >= GET2(code, 1))
1.1.1.2 ! misho    1868:             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1.1       misho    1869:           else
                   1870:             { ADD_NEW_DATA(-state_offset, count, ncount); }
                   1871:           break;
                   1872: 
                   1873:           default:
                   1874:           break;
                   1875:           }
                   1876:         }
                   1877:       break;
                   1878: 
                   1879:       /*-----------------------------------------------------------------*/
                   1880:       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
                   1881:       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
                   1882:       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
                   1883:       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
                   1884:       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1.1.1.2 ! misho    1885:         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1.1       misho    1886:       count = current_state->count;  /* Number already matched */
                   1887:       if (clen > 0)
                   1888:         {
                   1889:         BOOL OK;
                   1890:         switch (c)
                   1891:           {
                   1892:           case 0x000a:
                   1893:           case 0x000b:
                   1894:           case 0x000c:
                   1895:           case 0x000d:
                   1896:           case 0x0085:
                   1897:           case 0x2028:
                   1898:           case 0x2029:
                   1899:           OK = TRUE;
                   1900:           break;
                   1901: 
                   1902:           default:
                   1903:           OK = FALSE;
                   1904:           }
                   1905: 
                   1906:         if (OK == (d == OP_VSPACE))
                   1907:           {
                   1908:           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
                   1909:             {
                   1910:             active_count--;           /* Remove non-match possibility */
                   1911:             next_active_state--;
                   1912:             }
                   1913:           if (++count >= GET2(code, 1))
1.1.1.2 ! misho    1914:             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1.1       misho    1915:           else
                   1916:             { ADD_NEW_DATA(-state_offset, count, 0); }
                   1917:           }
                   1918:         }
                   1919:       break;
                   1920: 
                   1921:       /*-----------------------------------------------------------------*/
                   1922:       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
                   1923:       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
                   1924:       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
                   1925:       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
                   1926:       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1.1.1.2 ! misho    1927:         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1.1       misho    1928:       count = current_state->count;  /* Number already matched */
                   1929:       if (clen > 0)
                   1930:         {
                   1931:         BOOL OK;
                   1932:         switch (c)
                   1933:           {
                   1934:           case 0x09:      /* HT */
                   1935:           case 0x20:      /* SPACE */
                   1936:           case 0xa0:      /* NBSP */
                   1937:           case 0x1680:    /* OGHAM SPACE MARK */
                   1938:           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
                   1939:           case 0x2000:    /* EN QUAD */
                   1940:           case 0x2001:    /* EM QUAD */
                   1941:           case 0x2002:    /* EN SPACE */
                   1942:           case 0x2003:    /* EM SPACE */
                   1943:           case 0x2004:    /* THREE-PER-EM SPACE */
                   1944:           case 0x2005:    /* FOUR-PER-EM SPACE */
                   1945:           case 0x2006:    /* SIX-PER-EM SPACE */
                   1946:           case 0x2007:    /* FIGURE SPACE */
                   1947:           case 0x2008:    /* PUNCTUATION SPACE */
                   1948:           case 0x2009:    /* THIN SPACE */
                   1949:           case 0x200A:    /* HAIR SPACE */
                   1950:           case 0x202f:    /* NARROW NO-BREAK SPACE */
                   1951:           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
                   1952:           case 0x3000:    /* IDEOGRAPHIC SPACE */
                   1953:           OK = TRUE;
                   1954:           break;
                   1955: 
                   1956:           default:
                   1957:           OK = FALSE;
                   1958:           break;
                   1959:           }
                   1960: 
                   1961:         if (OK == (d == OP_HSPACE))
                   1962:           {
                   1963:           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
                   1964:             {
                   1965:             active_count--;           /* Remove non-match possibility */
                   1966:             next_active_state--;
                   1967:             }
                   1968:           if (++count >= GET2(code, 1))
1.1.1.2 ! misho    1969:             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1.1       misho    1970:           else
                   1971:             { ADD_NEW_DATA(-state_offset, count, 0); }
                   1972:           }
                   1973:         }
                   1974:       break;
                   1975: 
                   1976: /* ========================================================================== */
                   1977:       /* These opcodes are followed by a character that is usually compared
                   1978:       to the current subject character; it is loaded into d. We still get
                   1979:       here even if there is no subject character, because in some cases zero
                   1980:       repetitions are permitted. */
                   1981: 
                   1982:       /*-----------------------------------------------------------------*/
                   1983:       case OP_CHAR:
                   1984:       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
                   1985:       break;
                   1986: 
                   1987:       /*-----------------------------------------------------------------*/
                   1988:       case OP_CHARI:
                   1989:       if (clen == 0) break;
                   1990: 
1.1.1.2 ! misho    1991: #ifdef SUPPORT_UTF
        !          1992:       if (utf)
1.1       misho    1993:         {
                   1994:         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
                   1995:           {
                   1996:           unsigned int othercase;
1.1.1.2 ! misho    1997:           if (c < 128)
        !          1998:             othercase = fcc[c];
        !          1999:           else
        !          2000:             /* If we have Unicode property support, we can use it to test the
        !          2001:             other case of the character. */
1.1       misho    2002: #ifdef SUPPORT_UCP
1.1.1.2 ! misho    2003:             othercase = UCD_OTHERCASE(c);
1.1       misho    2004: #else
1.1.1.2 ! misho    2005:             othercase = NOTACHAR;
1.1       misho    2006: #endif
                   2007: 
                   2008:           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
                   2009:           }
                   2010:         }
                   2011:       else
1.1.1.2 ! misho    2012: #endif  /* SUPPORT_UTF */
        !          2013:       /* Not UTF mode */
1.1       misho    2014:         {
1.1.1.2 ! misho    2015:         if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
        !          2016:           { ADD_NEW(state_offset + 2, 0); }
1.1       misho    2017:         }
                   2018:       break;
                   2019: 
                   2020: 
                   2021: #ifdef SUPPORT_UCP
                   2022:       /*-----------------------------------------------------------------*/
                   2023:       /* This is a tricky one because it can match more than one character.
                   2024:       Find out how many characters to skip, and then set up a negative state
                   2025:       to wait for them to pass before continuing. */
                   2026: 
                   2027:       case OP_EXTUNI:
                   2028:       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
                   2029:         {
1.1.1.2 ! misho    2030:         const pcre_uchar *nptr = ptr + clen;
1.1       misho    2031:         int ncount = 0;
                   2032:         while (nptr < end_subject)
                   2033:           {
                   2034:           int nclen = 1;
                   2035:           GETCHARLEN(c, nptr, nclen);
                   2036:           if (UCD_CATEGORY(c) != ucp_M) break;
                   2037:           ncount++;
                   2038:           nptr += nclen;
                   2039:           }
                   2040:         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
                   2041:         }
                   2042:       break;
                   2043: #endif
                   2044: 
                   2045:       /*-----------------------------------------------------------------*/
                   2046:       /* This is tricky, like EXTUNI, because it too can match more than one
                   2047:       character (when CR is followed by LF). In this case, set up a negative
                   2048:       state to wait for one character to pass before continuing. */
                   2049: 
                   2050:       case OP_ANYNL:
                   2051:       if (clen > 0) switch(c)
                   2052:         {
                   2053:         case 0x000b:
                   2054:         case 0x000c:
                   2055:         case 0x0085:
                   2056:         case 0x2028:
                   2057:         case 0x2029:
                   2058:         if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
                   2059: 
                   2060:         case 0x000a:
                   2061:         ADD_NEW(state_offset + 1, 0);
                   2062:         break;
                   2063: 
                   2064:         case 0x000d:
                   2065:         if (ptr + 1 < end_subject && ptr[1] == 0x0a)
                   2066:           {
                   2067:           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
                   2068:           }
                   2069:         else
                   2070:           {
                   2071:           ADD_NEW(state_offset + 1, 0);
                   2072:           }
                   2073:         break;
                   2074:         }
                   2075:       break;
                   2076: 
                   2077:       /*-----------------------------------------------------------------*/
                   2078:       case OP_NOT_VSPACE:
                   2079:       if (clen > 0) switch(c)
                   2080:         {
                   2081:         case 0x000a:
                   2082:         case 0x000b:
                   2083:         case 0x000c:
                   2084:         case 0x000d:
                   2085:         case 0x0085:
                   2086:         case 0x2028:
                   2087:         case 0x2029:
                   2088:         break;
                   2089: 
                   2090:         default:
                   2091:         ADD_NEW(state_offset + 1, 0);
                   2092:         break;
                   2093:         }
                   2094:       break;
                   2095: 
                   2096:       /*-----------------------------------------------------------------*/
                   2097:       case OP_VSPACE:
                   2098:       if (clen > 0) switch(c)
                   2099:         {
                   2100:         case 0x000a:
                   2101:         case 0x000b:
                   2102:         case 0x000c:
                   2103:         case 0x000d:
                   2104:         case 0x0085:
                   2105:         case 0x2028:
                   2106:         case 0x2029:
                   2107:         ADD_NEW(state_offset + 1, 0);
                   2108:         break;
                   2109: 
                   2110:         default: break;
                   2111:         }
                   2112:       break;
                   2113: 
                   2114:       /*-----------------------------------------------------------------*/
                   2115:       case OP_NOT_HSPACE:
                   2116:       if (clen > 0) switch(c)
                   2117:         {
                   2118:         case 0x09:      /* HT */
                   2119:         case 0x20:      /* SPACE */
                   2120:         case 0xa0:      /* NBSP */
                   2121:         case 0x1680:    /* OGHAM SPACE MARK */
                   2122:         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
                   2123:         case 0x2000:    /* EN QUAD */
                   2124:         case 0x2001:    /* EM QUAD */
                   2125:         case 0x2002:    /* EN SPACE */
                   2126:         case 0x2003:    /* EM SPACE */
                   2127:         case 0x2004:    /* THREE-PER-EM SPACE */
                   2128:         case 0x2005:    /* FOUR-PER-EM SPACE */
                   2129:         case 0x2006:    /* SIX-PER-EM SPACE */
                   2130:         case 0x2007:    /* FIGURE SPACE */
                   2131:         case 0x2008:    /* PUNCTUATION SPACE */
                   2132:         case 0x2009:    /* THIN SPACE */
                   2133:         case 0x200A:    /* HAIR SPACE */
                   2134:         case 0x202f:    /* NARROW NO-BREAK SPACE */
                   2135:         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
                   2136:         case 0x3000:    /* IDEOGRAPHIC SPACE */
                   2137:         break;
                   2138: 
                   2139:         default:
                   2140:         ADD_NEW(state_offset + 1, 0);
                   2141:         break;
                   2142:         }
                   2143:       break;
                   2144: 
                   2145:       /*-----------------------------------------------------------------*/
                   2146:       case OP_HSPACE:
                   2147:       if (clen > 0) switch(c)
                   2148:         {
                   2149:         case 0x09:      /* HT */
                   2150:         case 0x20:      /* SPACE */
                   2151:         case 0xa0:      /* NBSP */
                   2152:         case 0x1680:    /* OGHAM SPACE MARK */
                   2153:         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
                   2154:         case 0x2000:    /* EN QUAD */
                   2155:         case 0x2001:    /* EM QUAD */
                   2156:         case 0x2002:    /* EN SPACE */
                   2157:         case 0x2003:    /* EM SPACE */
                   2158:         case 0x2004:    /* THREE-PER-EM SPACE */
                   2159:         case 0x2005:    /* FOUR-PER-EM SPACE */
                   2160:         case 0x2006:    /* SIX-PER-EM SPACE */
                   2161:         case 0x2007:    /* FIGURE SPACE */
                   2162:         case 0x2008:    /* PUNCTUATION SPACE */
                   2163:         case 0x2009:    /* THIN SPACE */
                   2164:         case 0x200A:    /* HAIR SPACE */
                   2165:         case 0x202f:    /* NARROW NO-BREAK SPACE */
                   2166:         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
                   2167:         case 0x3000:    /* IDEOGRAPHIC SPACE */
                   2168:         ADD_NEW(state_offset + 1, 0);
                   2169:         break;
                   2170:         }
                   2171:       break;
                   2172: 
                   2173:       /*-----------------------------------------------------------------*/
                   2174:       /* Match a negated single character casefully. This is only used for
                   2175:       one-byte characters, that is, we know that d < 256. The character we are
                   2176:       checking (c) can be multibyte. */
                   2177: 
                   2178:       case OP_NOT:
                   2179:       if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
                   2180:       break;
                   2181: 
                   2182:       /*-----------------------------------------------------------------*/
                   2183:       /* Match a negated single character caselessly. This is only used for
                   2184:       one-byte characters, that is, we know that d < 256. The character we are
                   2185:       checking (c) can be multibyte. */
                   2186: 
                   2187:       case OP_NOTI:
                   2188:       if (clen > 0 && c != d && c != fcc[d])
                   2189:         { ADD_NEW(state_offset + dlen + 1, 0); }
                   2190:       break;
                   2191: 
                   2192:       /*-----------------------------------------------------------------*/
                   2193:       case OP_PLUSI:
                   2194:       case OP_MINPLUSI:
                   2195:       case OP_POSPLUSI:
                   2196:       case OP_NOTPLUSI:
                   2197:       case OP_NOTMINPLUSI:
                   2198:       case OP_NOTPOSPLUSI:
                   2199:       caseless = TRUE;
                   2200:       codevalue -= OP_STARI - OP_STAR;
                   2201: 
                   2202:       /* Fall through */
                   2203:       case OP_PLUS:
                   2204:       case OP_MINPLUS:
                   2205:       case OP_POSPLUS:
                   2206:       case OP_NOTPLUS:
                   2207:       case OP_NOTMINPLUS:
                   2208:       case OP_NOTPOSPLUS:
                   2209:       count = current_state->count;  /* Already matched */
                   2210:       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
                   2211:       if (clen > 0)
                   2212:         {
                   2213:         unsigned int otherd = NOTACHAR;
                   2214:         if (caseless)
                   2215:           {
1.1.1.2 ! misho    2216: #ifdef SUPPORT_UTF
        !          2217:           if (utf && d >= 128)
1.1       misho    2218:             {
                   2219: #ifdef SUPPORT_UCP
                   2220:             otherd = UCD_OTHERCASE(d);
                   2221: #endif  /* SUPPORT_UCP */
                   2222:             }
                   2223:           else
1.1.1.2 ! misho    2224: #endif  /* SUPPORT_UTF */
        !          2225:           otherd = TABLE_GET(d, fcc, d);
1.1       misho    2226:           }
                   2227:         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
                   2228:           {
                   2229:           if (count > 0 &&
                   2230:               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
                   2231:             {
                   2232:             active_count--;             /* Remove non-match possibility */
                   2233:             next_active_state--;
                   2234:             }
                   2235:           count++;
                   2236:           ADD_NEW(state_offset, count);
                   2237:           }
                   2238:         }
                   2239:       break;
                   2240: 
                   2241:       /*-----------------------------------------------------------------*/
                   2242:       case OP_QUERYI:
                   2243:       case OP_MINQUERYI:
                   2244:       case OP_POSQUERYI:
                   2245:       case OP_NOTQUERYI:
                   2246:       case OP_NOTMINQUERYI:
                   2247:       case OP_NOTPOSQUERYI:
                   2248:       caseless = TRUE;
                   2249:       codevalue -= OP_STARI - OP_STAR;
                   2250:       /* Fall through */
                   2251:       case OP_QUERY:
                   2252:       case OP_MINQUERY:
                   2253:       case OP_POSQUERY:
                   2254:       case OP_NOTQUERY:
                   2255:       case OP_NOTMINQUERY:
                   2256:       case OP_NOTPOSQUERY:
                   2257:       ADD_ACTIVE(state_offset + dlen + 1, 0);
                   2258:       if (clen > 0)
                   2259:         {
                   2260:         unsigned int otherd = NOTACHAR;
                   2261:         if (caseless)
                   2262:           {
1.1.1.2 ! misho    2263: #ifdef SUPPORT_UTF
        !          2264:           if (utf && d >= 128)
1.1       misho    2265:             {
                   2266: #ifdef SUPPORT_UCP
                   2267:             otherd = UCD_OTHERCASE(d);
                   2268: #endif  /* SUPPORT_UCP */
                   2269:             }
                   2270:           else
1.1.1.2 ! misho    2271: #endif  /* SUPPORT_UTF */
        !          2272:           otherd = TABLE_GET(d, fcc, d);
1.1       misho    2273:           }
                   2274:         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
                   2275:           {
                   2276:           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
                   2277:             {
                   2278:             active_count--;            /* Remove non-match possibility */
                   2279:             next_active_state--;
                   2280:             }
                   2281:           ADD_NEW(state_offset + dlen + 1, 0);
                   2282:           }
                   2283:         }
                   2284:       break;
                   2285: 
                   2286:       /*-----------------------------------------------------------------*/
                   2287:       case OP_STARI:
                   2288:       case OP_MINSTARI:
                   2289:       case OP_POSSTARI:
                   2290:       case OP_NOTSTARI:
                   2291:       case OP_NOTMINSTARI:
                   2292:       case OP_NOTPOSSTARI:
                   2293:       caseless = TRUE;
                   2294:       codevalue -= OP_STARI - OP_STAR;
                   2295:       /* Fall through */
                   2296:       case OP_STAR:
                   2297:       case OP_MINSTAR:
                   2298:       case OP_POSSTAR:
                   2299:       case OP_NOTSTAR:
                   2300:       case OP_NOTMINSTAR:
                   2301:       case OP_NOTPOSSTAR:
                   2302:       ADD_ACTIVE(state_offset + dlen + 1, 0);
                   2303:       if (clen > 0)
                   2304:         {
                   2305:         unsigned int otherd = NOTACHAR;
                   2306:         if (caseless)
                   2307:           {
1.1.1.2 ! misho    2308: #ifdef SUPPORT_UTF
        !          2309:           if (utf && d >= 128)
1.1       misho    2310:             {
                   2311: #ifdef SUPPORT_UCP
                   2312:             otherd = UCD_OTHERCASE(d);
                   2313: #endif  /* SUPPORT_UCP */
                   2314:             }
                   2315:           else
1.1.1.2 ! misho    2316: #endif  /* SUPPORT_UTF */
        !          2317:           otherd = TABLE_GET(d, fcc, d);
1.1       misho    2318:           }
                   2319:         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
                   2320:           {
                   2321:           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
                   2322:             {
                   2323:             active_count--;            /* Remove non-match possibility */
                   2324:             next_active_state--;
                   2325:             }
                   2326:           ADD_NEW(state_offset, 0);
                   2327:           }
                   2328:         }
                   2329:       break;
                   2330: 
                   2331:       /*-----------------------------------------------------------------*/
                   2332:       case OP_EXACTI:
                   2333:       case OP_NOTEXACTI:
                   2334:       caseless = TRUE;
                   2335:       codevalue -= OP_STARI - OP_STAR;
                   2336:       /* Fall through */
                   2337:       case OP_EXACT:
                   2338:       case OP_NOTEXACT:
                   2339:       count = current_state->count;  /* Number already matched */
                   2340:       if (clen > 0)
                   2341:         {
                   2342:         unsigned int otherd = NOTACHAR;
                   2343:         if (caseless)
                   2344:           {
1.1.1.2 ! misho    2345: #ifdef SUPPORT_UTF
        !          2346:           if (utf && d >= 128)
1.1       misho    2347:             {
                   2348: #ifdef SUPPORT_UCP
                   2349:             otherd = UCD_OTHERCASE(d);
                   2350: #endif  /* SUPPORT_UCP */
                   2351:             }
                   2352:           else
1.1.1.2 ! misho    2353: #endif  /* SUPPORT_UTF */
        !          2354:           otherd = TABLE_GET(d, fcc, d);
1.1       misho    2355:           }
                   2356:         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
                   2357:           {
                   2358:           if (++count >= GET2(code, 1))
1.1.1.2 ! misho    2359:             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
1.1       misho    2360:           else
                   2361:             { ADD_NEW(state_offset, count); }
                   2362:           }
                   2363:         }
                   2364:       break;
                   2365: 
                   2366:       /*-----------------------------------------------------------------*/
                   2367:       case OP_UPTOI:
                   2368:       case OP_MINUPTOI:
                   2369:       case OP_POSUPTOI:
                   2370:       case OP_NOTUPTOI:
                   2371:       case OP_NOTMINUPTOI:
                   2372:       case OP_NOTPOSUPTOI:
                   2373:       caseless = TRUE;
                   2374:       codevalue -= OP_STARI - OP_STAR;
                   2375:       /* Fall through */
                   2376:       case OP_UPTO:
                   2377:       case OP_MINUPTO:
                   2378:       case OP_POSUPTO:
                   2379:       case OP_NOTUPTO:
                   2380:       case OP_NOTMINUPTO:
                   2381:       case OP_NOTPOSUPTO:
1.1.1.2 ! misho    2382:       ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
1.1       misho    2383:       count = current_state->count;  /* Number already matched */
                   2384:       if (clen > 0)
                   2385:         {
                   2386:         unsigned int otherd = NOTACHAR;
                   2387:         if (caseless)
                   2388:           {
1.1.1.2 ! misho    2389: #ifdef SUPPORT_UTF
        !          2390:           if (utf && d >= 128)
1.1       misho    2391:             {
                   2392: #ifdef SUPPORT_UCP
                   2393:             otherd = UCD_OTHERCASE(d);
                   2394: #endif  /* SUPPORT_UCP */
                   2395:             }
                   2396:           else
1.1.1.2 ! misho    2397: #endif  /* SUPPORT_UTF */
        !          2398:           otherd = TABLE_GET(d, fcc, d);
1.1       misho    2399:           }
                   2400:         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
                   2401:           {
                   2402:           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
                   2403:             {
                   2404:             active_count--;             /* Remove non-match possibility */
                   2405:             next_active_state--;
                   2406:             }
                   2407:           if (++count >= GET2(code, 1))
1.1.1.2 ! misho    2408:             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
1.1       misho    2409:           else
                   2410:             { ADD_NEW(state_offset, count); }
                   2411:           }
                   2412:         }
                   2413:       break;
                   2414: 
                   2415: 
                   2416: /* ========================================================================== */
                   2417:       /* These are the class-handling opcodes */
                   2418: 
                   2419:       case OP_CLASS:
                   2420:       case OP_NCLASS:
                   2421:       case OP_XCLASS:
                   2422:         {
                   2423:         BOOL isinclass = FALSE;
                   2424:         int next_state_offset;
1.1.1.2 ! misho    2425:         const pcre_uchar *ecode;
1.1       misho    2426: 
                   2427:         /* For a simple class, there is always just a 32-byte table, and we
                   2428:         can set isinclass from it. */
                   2429: 
                   2430:         if (codevalue != OP_XCLASS)
                   2431:           {
1.1.1.2 ! misho    2432:           ecode = code + 1 + (32 / sizeof(pcre_uchar));
1.1       misho    2433:           if (clen > 0)
                   2434:             {
                   2435:             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
1.1.1.2 ! misho    2436:               ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
1.1       misho    2437:             }
                   2438:           }
                   2439: 
                   2440:         /* An extended class may have a table or a list of single characters,
                   2441:         ranges, or both, and it may be positive or negative. There's a
                   2442:         function that sorts all this out. */
                   2443: 
                   2444:         else
                   2445:          {
                   2446:          ecode = code + GET(code, 1);
1.1.1.2 ! misho    2447:          if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
1.1       misho    2448:          }
                   2449: 
                   2450:         /* At this point, isinclass is set for all kinds of class, and ecode
                   2451:         points to the byte after the end of the class. If there is a
                   2452:         quantifier, this is where it will be. */
                   2453: 
                   2454:         next_state_offset = (int)(ecode - start_code);
                   2455: 
                   2456:         switch (*ecode)
                   2457:           {
                   2458:           case OP_CRSTAR:
                   2459:           case OP_CRMINSTAR:
                   2460:           ADD_ACTIVE(next_state_offset + 1, 0);
                   2461:           if (isinclass) { ADD_NEW(state_offset, 0); }
                   2462:           break;
                   2463: 
                   2464:           case OP_CRPLUS:
                   2465:           case OP_CRMINPLUS:
                   2466:           count = current_state->count;  /* Already matched */
                   2467:           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
                   2468:           if (isinclass) { count++; ADD_NEW(state_offset, count); }
                   2469:           break;
                   2470: 
                   2471:           case OP_CRQUERY:
                   2472:           case OP_CRMINQUERY:
                   2473:           ADD_ACTIVE(next_state_offset + 1, 0);
                   2474:           if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
                   2475:           break;
                   2476: 
                   2477:           case OP_CRRANGE:
                   2478:           case OP_CRMINRANGE:
                   2479:           count = current_state->count;  /* Already matched */
                   2480:           if (count >= GET2(ecode, 1))
1.1.1.2 ! misho    2481:             { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
1.1       misho    2482:           if (isinclass)
                   2483:             {
1.1.1.2 ! misho    2484:             int max = GET2(ecode, 1 + IMM2_SIZE);
1.1       misho    2485:             if (++count >= max && max != 0)   /* Max 0 => no limit */
1.1.1.2 ! misho    2486:               { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
1.1       misho    2487:             else
                   2488:               { ADD_NEW(state_offset, count); }
                   2489:             }
                   2490:           break;
                   2491: 
                   2492:           default:
                   2493:           if (isinclass) { ADD_NEW(next_state_offset, 0); }
                   2494:           break;
                   2495:           }
                   2496:         }
                   2497:       break;
                   2498: 
                   2499: /* ========================================================================== */
                   2500:       /* These are the opcodes for fancy brackets of various kinds. We have
                   2501:       to use recursion in order to handle them. The "always failing" assertion
                   2502:       (?!) is optimised to OP_FAIL when compiling, so we have to support that,
                   2503:       though the other "backtracking verbs" are not supported. */
                   2504: 
                   2505:       case OP_FAIL:
                   2506:       forced_fail++;    /* Count FAILs for multiple states */
                   2507:       break;
                   2508: 
                   2509:       case OP_ASSERT:
                   2510:       case OP_ASSERT_NOT:
                   2511:       case OP_ASSERTBACK:
                   2512:       case OP_ASSERTBACK_NOT:
                   2513:         {
                   2514:         int rc;
                   2515:         int local_offsets[2];
                   2516:         int local_workspace[1000];
1.1.1.2 ! misho    2517:         const pcre_uchar *endasscode = code + GET(code, 1);
1.1       misho    2518: 
                   2519:         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
                   2520: 
                   2521:         rc = internal_dfa_exec(
                   2522:           md,                                   /* static match data */
                   2523:           code,                                 /* this subexpression's code */
                   2524:           ptr,                                  /* where we currently are */
                   2525:           (int)(ptr - start_subject),           /* start offset */
                   2526:           local_offsets,                        /* offset vector */
                   2527:           sizeof(local_offsets)/sizeof(int),    /* size of same */
                   2528:           local_workspace,                      /* workspace vector */
                   2529:           sizeof(local_workspace)/sizeof(int),  /* size of same */
                   2530:           rlevel);                              /* function recursion level */
                   2531: 
                   2532:         if (rc == PCRE_ERROR_DFA_UITEM) return rc;
                   2533:         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
                   2534:             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
                   2535:         }
                   2536:       break;
                   2537: 
                   2538:       /*-----------------------------------------------------------------*/
                   2539:       case OP_COND:
                   2540:       case OP_SCOND:
                   2541:         {
                   2542:         int local_offsets[1000];
                   2543:         int local_workspace[1000];
                   2544:         int codelink = GET(code, 1);
                   2545:         int condcode;
                   2546: 
                   2547:         /* Because of the way auto-callout works during compile, a callout item
                   2548:         is inserted between OP_COND and an assertion condition. This does not
                   2549:         happen for the other conditions. */
                   2550: 
                   2551:         if (code[LINK_SIZE+1] == OP_CALLOUT)
                   2552:           {
                   2553:           rrc = 0;
1.1.1.2 ! misho    2554:           if (PUBL(callout) != NULL)
1.1       misho    2555:             {
1.1.1.2 ! misho    2556:             PUBL(callout_block) cb;
1.1       misho    2557:             cb.version          = 1;   /* Version 1 of the callout block */
                   2558:             cb.callout_number   = code[LINK_SIZE+2];
                   2559:             cb.offset_vector    = offsets;
1.1.1.2 ! misho    2560: #ifdef COMPILE_PCRE8
1.1       misho    2561:             cb.subject          = (PCRE_SPTR)start_subject;
1.1.1.2 ! misho    2562: #else
        !          2563:             cb.subject          = (PCRE_SPTR16)start_subject;
        !          2564: #endif
1.1       misho    2565:             cb.subject_length   = (int)(end_subject - start_subject);
                   2566:             cb.start_match      = (int)(current_subject - start_subject);
                   2567:             cb.current_position = (int)(ptr - start_subject);
                   2568:             cb.pattern_position = GET(code, LINK_SIZE + 3);
                   2569:             cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
                   2570:             cb.capture_top      = 1;
                   2571:             cb.capture_last     = -1;
                   2572:             cb.callout_data     = md->callout_data;
                   2573:             cb.mark             = NULL;   /* No (*MARK) support */
1.1.1.2 ! misho    2574:             if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
1.1       misho    2575:             }
                   2576:           if (rrc > 0) break;                      /* Fail this thread */
1.1.1.2 ! misho    2577:           code += PRIV(OP_lengths)[OP_CALLOUT];    /* Skip callout data */
1.1       misho    2578:           }
                   2579: 
                   2580:         condcode = code[LINK_SIZE+1];
                   2581: 
                   2582:         /* Back reference conditions are not supported */
                   2583: 
                   2584:         if (condcode == OP_CREF || condcode == OP_NCREF)
                   2585:           return PCRE_ERROR_DFA_UCOND;
                   2586: 
                   2587:         /* The DEFINE condition is always false */
                   2588: 
                   2589:         if (condcode == OP_DEF)
                   2590:           { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
                   2591: 
                   2592:         /* The only supported version of OP_RREF is for the value RREF_ANY,
                   2593:         which means "test if in any recursion". We can't test for specifically
                   2594:         recursed groups. */
                   2595: 
                   2596:         else if (condcode == OP_RREF || condcode == OP_NRREF)
                   2597:           {
1.1.1.2 ! misho    2598:           int value = GET2(code, LINK_SIZE + 2);
1.1       misho    2599:           if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
                   2600:           if (md->recursive != NULL)
1.1.1.2 ! misho    2601:             { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
1.1       misho    2602:           else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
                   2603:           }
                   2604: 
                   2605:         /* Otherwise, the condition is an assertion */
                   2606: 
                   2607:         else
                   2608:           {
                   2609:           int rc;
1.1.1.2 ! misho    2610:           const pcre_uchar *asscode = code + LINK_SIZE + 1;
        !          2611:           const pcre_uchar *endasscode = asscode + GET(asscode, 1);
1.1       misho    2612: 
                   2613:           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
                   2614: 
                   2615:           rc = internal_dfa_exec(
                   2616:             md,                                   /* fixed match data */
                   2617:             asscode,                              /* this subexpression's code */
                   2618:             ptr,                                  /* where we currently are */
                   2619:             (int)(ptr - start_subject),           /* start offset */
                   2620:             local_offsets,                        /* offset vector */
                   2621:             sizeof(local_offsets)/sizeof(int),    /* size of same */
                   2622:             local_workspace,                      /* workspace vector */
                   2623:             sizeof(local_workspace)/sizeof(int),  /* size of same */
                   2624:             rlevel);                              /* function recursion level */
                   2625: 
                   2626:           if (rc == PCRE_ERROR_DFA_UITEM) return rc;
                   2627:           if ((rc >= 0) ==
                   2628:                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
                   2629:             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
                   2630:           else
                   2631:             { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
                   2632:           }
                   2633:         }
                   2634:       break;
                   2635: 
                   2636:       /*-----------------------------------------------------------------*/
                   2637:       case OP_RECURSE:
                   2638:         {
                   2639:         dfa_recursion_info *ri;
                   2640:         int local_offsets[1000];
                   2641:         int local_workspace[1000];
1.1.1.2 ! misho    2642:         const pcre_uchar *callpat = start_code + GET(code, 1);
1.1       misho    2643:         int recno = (callpat == md->start_code)? 0 :
                   2644:           GET2(callpat, 1 + LINK_SIZE);
                   2645:         int rc;
                   2646: 
                   2647:         DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
                   2648: 
                   2649:         /* Check for repeating a recursion without advancing the subject
                   2650:         pointer. This should catch convoluted mutual recursions. (Some simple
                   2651:         cases are caught at compile time.) */
                   2652: 
                   2653:         for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
                   2654:           if (recno == ri->group_num && ptr == ri->subject_position)
                   2655:             return PCRE_ERROR_RECURSELOOP;
                   2656: 
                   2657:         /* Remember this recursion and where we started it so as to
                   2658:         catch infinite loops. */
                   2659: 
                   2660:         new_recursive.group_num = recno;
                   2661:         new_recursive.subject_position = ptr;
                   2662:         new_recursive.prevrec = md->recursive;
                   2663:         md->recursive = &new_recursive;
                   2664: 
                   2665:         rc = internal_dfa_exec(
                   2666:           md,                                   /* fixed match data */
                   2667:           callpat,                              /* this subexpression's code */
                   2668:           ptr,                                  /* where we currently are */
                   2669:           (int)(ptr - start_subject),           /* start offset */
                   2670:           local_offsets,                        /* offset vector */
                   2671:           sizeof(local_offsets)/sizeof(int),    /* size of same */
                   2672:           local_workspace,                      /* workspace vector */
                   2673:           sizeof(local_workspace)/sizeof(int),  /* size of same */
                   2674:           rlevel);                              /* function recursion level */
                   2675: 
                   2676:         md->recursive = new_recursive.prevrec;  /* Done this recursion */
                   2677: 
                   2678:         DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
                   2679:           rc));
                   2680: 
                   2681:         /* Ran out of internal offsets */
                   2682: 
                   2683:         if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
                   2684: 
                   2685:         /* For each successful matched substring, set up the next state with a
                   2686:         count of characters to skip before trying it. Note that the count is in
                   2687:         characters, not bytes. */
                   2688: 
                   2689:         if (rc > 0)
                   2690:           {
                   2691:           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
                   2692:             {
                   2693:             int charcount = local_offsets[rc+1] - local_offsets[rc];
1.1.1.2 ! misho    2694: #ifdef SUPPORT_UTF
        !          2695:             const pcre_uchar *p = start_subject + local_offsets[rc];
        !          2696:             const pcre_uchar *pp = start_subject + local_offsets[rc+1];
        !          2697:             while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
        !          2698: #endif
1.1       misho    2699:             if (charcount > 0)
                   2700:               {
                   2701:               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
                   2702:               }
                   2703:             else
                   2704:               {
                   2705:               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
                   2706:               }
                   2707:             }
                   2708:           }
                   2709:         else if (rc != PCRE_ERROR_NOMATCH) return rc;
                   2710:         }
                   2711:       break;
                   2712: 
                   2713:       /*-----------------------------------------------------------------*/
                   2714:       case OP_BRAPOS:
                   2715:       case OP_SBRAPOS:
                   2716:       case OP_CBRAPOS:
                   2717:       case OP_SCBRAPOS:
                   2718:       case OP_BRAPOSZERO:
                   2719:         {
                   2720:         int charcount, matched_count;
1.1.1.2 ! misho    2721:         const pcre_uchar *local_ptr = ptr;
1.1       misho    2722:         BOOL allow_zero;
                   2723: 
                   2724:         if (codevalue == OP_BRAPOSZERO)
                   2725:           {
                   2726:           allow_zero = TRUE;
                   2727:           codevalue = *(++code);  /* Codevalue will be one of above BRAs */
                   2728:           }
                   2729:         else allow_zero = FALSE;
                   2730: 
                   2731:         /* Loop to match the subpattern as many times as possible as if it were
                   2732:         a complete pattern. */
                   2733: 
                   2734:         for (matched_count = 0;; matched_count++)
                   2735:           {
                   2736:           int local_offsets[2];
                   2737:           int local_workspace[1000];
                   2738: 
                   2739:           int rc = internal_dfa_exec(
                   2740:             md,                                   /* fixed match data */
                   2741:             code,                                 /* this subexpression's code */
                   2742:             local_ptr,                            /* where we currently are */
                   2743:             (int)(ptr - start_subject),           /* start offset */
                   2744:             local_offsets,                        /* offset vector */
                   2745:             sizeof(local_offsets)/sizeof(int),    /* size of same */
                   2746:             local_workspace,                      /* workspace vector */
                   2747:             sizeof(local_workspace)/sizeof(int),  /* size of same */
                   2748:             rlevel);                              /* function recursion level */
                   2749: 
                   2750:           /* Failed to match */
                   2751: 
                   2752:           if (rc < 0)
                   2753:             {
                   2754:             if (rc != PCRE_ERROR_NOMATCH) return rc;
                   2755:             break;
                   2756:             }
                   2757: 
                   2758:           /* Matched: break the loop if zero characters matched. */
                   2759: 
                   2760:           charcount = local_offsets[1] - local_offsets[0];
                   2761:           if (charcount == 0) break;
                   2762:           local_ptr += charcount;    /* Advance temporary position ptr */
                   2763:           }
                   2764: 
                   2765:         /* At this point we have matched the subpattern matched_count
                   2766:         times, and local_ptr is pointing to the character after the end of the
                   2767:         last match. */
                   2768: 
                   2769:         if (matched_count > 0 || allow_zero)
                   2770:           {
1.1.1.2 ! misho    2771:           const pcre_uchar *end_subpattern = code;
1.1       misho    2772:           int next_state_offset;
                   2773: 
                   2774:           do { end_subpattern += GET(end_subpattern, 1); }
                   2775:             while (*end_subpattern == OP_ALT);
                   2776:           next_state_offset =
                   2777:             (int)(end_subpattern - start_code + LINK_SIZE + 1);
                   2778: 
                   2779:           /* Optimization: if there are no more active states, and there
                   2780:           are no new states yet set up, then skip over the subject string
                   2781:           right here, to save looping. Otherwise, set up the new state to swing
                   2782:           into action when the end of the matched substring is reached. */
                   2783: 
                   2784:           if (i + 1 >= active_count && new_count == 0)
                   2785:             {
                   2786:             ptr = local_ptr;
                   2787:             clen = 0;
                   2788:             ADD_NEW(next_state_offset, 0);
                   2789:             }
                   2790:           else
                   2791:             {
1.1.1.2 ! misho    2792:             const pcre_uchar *p = ptr;
        !          2793:             const pcre_uchar *pp = local_ptr;
1.1       misho    2794:             charcount = (int)(pp - p);
1.1.1.2 ! misho    2795: #ifdef SUPPORT_UTF
        !          2796:             while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
        !          2797: #endif
1.1       misho    2798:             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
                   2799:             }
                   2800:           }
                   2801:         }
                   2802:       break;
                   2803: 
                   2804:       /*-----------------------------------------------------------------*/
                   2805:       case OP_ONCE:
                   2806:       case OP_ONCE_NC:
                   2807:         {
                   2808:         int local_offsets[2];
                   2809:         int local_workspace[1000];
                   2810: 
                   2811:         int rc = internal_dfa_exec(
                   2812:           md,                                   /* fixed match data */
                   2813:           code,                                 /* this subexpression's code */
                   2814:           ptr,                                  /* where we currently are */
                   2815:           (int)(ptr - start_subject),           /* start offset */
                   2816:           local_offsets,                        /* offset vector */
                   2817:           sizeof(local_offsets)/sizeof(int),    /* size of same */
                   2818:           local_workspace,                      /* workspace vector */
                   2819:           sizeof(local_workspace)/sizeof(int),  /* size of same */
                   2820:           rlevel);                              /* function recursion level */
                   2821: 
                   2822:         if (rc >= 0)
                   2823:           {
1.1.1.2 ! misho    2824:           const pcre_uchar *end_subpattern = code;
1.1       misho    2825:           int charcount = local_offsets[1] - local_offsets[0];
                   2826:           int next_state_offset, repeat_state_offset;
                   2827: 
                   2828:           do { end_subpattern += GET(end_subpattern, 1); }
                   2829:             while (*end_subpattern == OP_ALT);
                   2830:           next_state_offset =
                   2831:             (int)(end_subpattern - start_code + LINK_SIZE + 1);
                   2832: 
                   2833:           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
                   2834:           arrange for the repeat state also to be added to the relevant list.
                   2835:           Calculate the offset, or set -1 for no repeat. */
                   2836: 
                   2837:           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
                   2838:                                  *end_subpattern == OP_KETRMIN)?
                   2839:             (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
                   2840: 
                   2841:           /* If we have matched an empty string, add the next state at the
                   2842:           current character pointer. This is important so that the duplicate
                   2843:           checking kicks in, which is what breaks infinite loops that match an
                   2844:           empty string. */
                   2845: 
                   2846:           if (charcount == 0)
                   2847:             {
                   2848:             ADD_ACTIVE(next_state_offset, 0);
                   2849:             }
                   2850: 
                   2851:           /* Optimization: if there are no more active states, and there
                   2852:           are no new states yet set up, then skip over the subject string
                   2853:           right here, to save looping. Otherwise, set up the new state to swing
                   2854:           into action when the end of the matched substring is reached. */
                   2855: 
                   2856:           else if (i + 1 >= active_count && new_count == 0)
                   2857:             {
                   2858:             ptr += charcount;
                   2859:             clen = 0;
                   2860:             ADD_NEW(next_state_offset, 0);
                   2861: 
                   2862:             /* If we are adding a repeat state at the new character position,
                   2863:             we must fudge things so that it is the only current state.
                   2864:             Otherwise, it might be a duplicate of one we processed before, and
                   2865:             that would cause it to be skipped. */
                   2866: 
                   2867:             if (repeat_state_offset >= 0)
                   2868:               {
                   2869:               next_active_state = active_states;
                   2870:               active_count = 0;
                   2871:               i = -1;
                   2872:               ADD_ACTIVE(repeat_state_offset, 0);
                   2873:               }
                   2874:             }
                   2875:           else
                   2876:             {
1.1.1.2 ! misho    2877: #ifdef SUPPORT_UTF
        !          2878:             const pcre_uchar *p = start_subject + local_offsets[0];
        !          2879:             const pcre_uchar *pp = start_subject + local_offsets[1];
        !          2880:             while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
        !          2881: #endif
1.1       misho    2882:             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
                   2883:             if (repeat_state_offset >= 0)
                   2884:               { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
                   2885:             }
                   2886:           }
                   2887:         else if (rc != PCRE_ERROR_NOMATCH) return rc;
                   2888:         }
                   2889:       break;
                   2890: 
                   2891: 
                   2892: /* ========================================================================== */
                   2893:       /* Handle callouts */
                   2894: 
                   2895:       case OP_CALLOUT:
                   2896:       rrc = 0;
1.1.1.2 ! misho    2897:       if (PUBL(callout) != NULL)
1.1       misho    2898:         {
1.1.1.2 ! misho    2899:         PUBL(callout_block) cb;
1.1       misho    2900:         cb.version          = 1;   /* Version 1 of the callout block */
                   2901:         cb.callout_number   = code[1];
                   2902:         cb.offset_vector    = offsets;
1.1.1.2 ! misho    2903: #ifdef COMPILE_PCRE8
1.1       misho    2904:         cb.subject          = (PCRE_SPTR)start_subject;
1.1.1.2 ! misho    2905: #else
        !          2906:         cb.subject          = (PCRE_SPTR16)start_subject;
        !          2907: #endif
1.1       misho    2908:         cb.subject_length   = (int)(end_subject - start_subject);
                   2909:         cb.start_match      = (int)(current_subject - start_subject);
                   2910:         cb.current_position = (int)(ptr - start_subject);
                   2911:         cb.pattern_position = GET(code, 2);
                   2912:         cb.next_item_length = GET(code, 2 + LINK_SIZE);
                   2913:         cb.capture_top      = 1;
                   2914:         cb.capture_last     = -1;
                   2915:         cb.callout_data     = md->callout_data;
                   2916:         cb.mark             = NULL;   /* No (*MARK) support */
1.1.1.2 ! misho    2917:         if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
1.1       misho    2918:         }
                   2919:       if (rrc == 0)
1.1.1.2 ! misho    2920:         { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
1.1       misho    2921:       break;
                   2922: 
                   2923: 
                   2924: /* ========================================================================== */
                   2925:       default:        /* Unsupported opcode */
                   2926:       return PCRE_ERROR_DFA_UITEM;
                   2927:       }
                   2928: 
                   2929:     NEXT_ACTIVE_STATE: continue;
                   2930: 
                   2931:     }      /* End of loop scanning active states */
                   2932: 
                   2933:   /* We have finished the processing at the current subject character. If no
                   2934:   new states have been set for the next character, we have found all the
                   2935:   matches that we are going to find. If we are at the top level and partial
                   2936:   matching has been requested, check for appropriate conditions.
                   2937: 
                   2938:   The "forced_fail" variable counts the number of (*F) encountered for the
                   2939:   character. If it is equal to the original active_count (saved in
                   2940:   workspace[1]) it means that (*F) was found on every active state. In this
                   2941:   case we don't want to give a partial match.
                   2942: 
                   2943:   The "could_continue" variable is true if a state could have continued but
                   2944:   for the fact that the end of the subject was reached. */
                   2945: 
                   2946:   if (new_count <= 0)
                   2947:     {
                   2948:     if (rlevel == 1 &&                               /* Top level, and */
                   2949:         could_continue &&                            /* Some could go on */
                   2950:         forced_fail != workspace[1] &&               /* Not all forced fail & */
                   2951:         (                                            /* either... */
                   2952:         (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
                   2953:         ||                                           /* or... */
                   2954:         ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
                   2955:          match_count < 0)                            /* no matches */
                   2956:         ) &&                                         /* And... */
                   2957:         ptr >= end_subject &&                  /* Reached end of subject */
                   2958:         ptr > md->start_used_ptr)              /* Inspected non-empty string */
                   2959:       {
                   2960:       if (offsetcount >= 2)
                   2961:         {
                   2962:         offsets[0] = (int)(md->start_used_ptr - start_subject);
                   2963:         offsets[1] = (int)(end_subject - start_subject);
                   2964:         }
                   2965:       match_count = PCRE_ERROR_PARTIAL;
                   2966:       }
                   2967: 
                   2968:     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
                   2969:       "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
                   2970:       rlevel*2-2, SP));
                   2971:     break;        /* In effect, "return", but see the comment below */
                   2972:     }
                   2973: 
                   2974:   /* One or more states are active for the next character. */
                   2975: 
                   2976:   ptr += clen;    /* Advance to next subject character */
                   2977:   }               /* Loop to move along the subject string */
                   2978: 
                   2979: /* Control gets here from "break" a few lines above. We do it this way because
                   2980: if we use "return" above, we have compiler trouble. Some compilers warn if
                   2981: there's nothing here because they think the function doesn't return a value. On
                   2982: the other hand, if we put a dummy statement here, some more clever compilers
                   2983: complain that it can't be reached. Sigh. */
                   2984: 
                   2985: return match_count;
                   2986: }
                   2987: 
                   2988: 
                   2989: 
                   2990: 
                   2991: /*************************************************
                   2992: *    Execute a Regular Expression - DFA engine   *
                   2993: *************************************************/
                   2994: 
                   2995: /* This external function applies a compiled re to a subject string using a DFA
                   2996: engine. This function calls the internal function multiple times if the pattern
                   2997: is not anchored.
                   2998: 
                   2999: Arguments:
                   3000:   argument_re     points to the compiled expression
                   3001:   extra_data      points to extra data or is NULL
                   3002:   subject         points to the subject string
                   3003:   length          length of subject string (may contain binary zeros)
                   3004:   start_offset    where to start in the subject string
                   3005:   options         option bits
                   3006:   offsets         vector of match offsets
                   3007:   offsetcount     size of same
                   3008:   workspace       workspace vector
                   3009:   wscount         size of same
                   3010: 
                   3011: Returns:          > 0 => number of match offset pairs placed in offsets
                   3012:                   = 0 => offsets overflowed; longest matches are present
                   3013:                    -1 => failed to match
                   3014:                  < -1 => some kind of unexpected problem
                   3015: */
                   3016: 
1.1.1.2 ! misho    3017: #ifdef COMPILE_PCRE8
1.1       misho    3018: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
                   3019: pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
                   3020:   const char *subject, int length, int start_offset, int options, int *offsets,
                   3021:   int offsetcount, int *workspace, int wscount)
1.1.1.2 ! misho    3022: #else
        !          3023: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
        !          3024: pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
        !          3025:   PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
        !          3026:   int offsetcount, int *workspace, int wscount)
        !          3027: #endif
1.1       misho    3028: {
1.1.1.2 ! misho    3029: REAL_PCRE *re = (REAL_PCRE *)argument_re;
1.1       misho    3030: dfa_match_data match_block;
                   3031: dfa_match_data *md = &match_block;
1.1.1.2 ! misho    3032: BOOL utf, anchored, startline, firstline;
        !          3033: const pcre_uchar *current_subject, *end_subject;
1.1       misho    3034: const pcre_study_data *study = NULL;
                   3035: 
1.1.1.2 ! misho    3036: const pcre_uchar *req_char_ptr;
        !          3037: const pcre_uint8 *start_bits = NULL;
        !          3038: BOOL has_first_char = FALSE;
        !          3039: BOOL has_req_char = FALSE;
        !          3040: pcre_uchar first_char = 0;
        !          3041: pcre_uchar first_char2 = 0;
        !          3042: pcre_uchar req_char = 0;
        !          3043: pcre_uchar req_char2 = 0;
1.1       misho    3044: int newline;
                   3045: 
                   3046: /* Plausibility checks */
                   3047: 
                   3048: if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
                   3049: if (re == NULL || subject == NULL || workspace == NULL ||
                   3050:    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
                   3051: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
                   3052: if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
                   3053: if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
                   3054: 
                   3055: /* We need to find the pointer to any study data before we test for byte
                   3056: flipping, so we scan the extra_data block first. This may set two fields in the
                   3057: match block, so we must initialize them beforehand. However, the other fields
                   3058: in the match block must not be set until after the byte flipping. */
                   3059: 
                   3060: md->tables = re->tables;
                   3061: md->callout_data = NULL;
                   3062: 
                   3063: if (extra_data != NULL)
                   3064:   {
                   3065:   unsigned int flags = extra_data->flags;
                   3066:   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
                   3067:     study = (const pcre_study_data *)extra_data->study_data;
                   3068:   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
                   3069:   if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
                   3070:     return PCRE_ERROR_DFA_UMLIMIT;
                   3071:   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
                   3072:     md->callout_data = extra_data->callout_data;
                   3073:   if ((flags & PCRE_EXTRA_TABLES) != 0)
                   3074:     md->tables = extra_data->tables;
                   3075:   }
                   3076: 
                   3077: /* Check that the first field in the block is the magic number. If it is not,
1.1.1.2 ! misho    3078: return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
        !          3079: REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
        !          3080: means that the pattern is likely compiled with different endianness. */
1.1       misho    3081: 
                   3082: if (re->magic_number != MAGIC_NUMBER)
1.1.1.2 ! misho    3083:   return re->magic_number == REVERSED_MAGIC_NUMBER?
        !          3084:     PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
        !          3085: if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
1.1       misho    3086: 
                   3087: /* Set some local values */
                   3088: 
1.1.1.2 ! misho    3089: current_subject = (const pcre_uchar *)subject + start_offset;
        !          3090: end_subject = (const pcre_uchar *)subject + length;
        !          3091: req_char_ptr = current_subject - 1;
        !          3092: 
        !          3093: #ifdef SUPPORT_UTF
        !          3094: /* PCRE_UTF16 has the same value as PCRE_UTF8. */
        !          3095: utf = (re->options & PCRE_UTF8) != 0;
1.1       misho    3096: #else
1.1.1.2 ! misho    3097: utf = FALSE;
1.1       misho    3098: #endif
                   3099: 
                   3100: anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
                   3101:   (re->options & PCRE_ANCHORED) != 0;
                   3102: 
                   3103: /* The remaining fixed data for passing around. */
                   3104: 
1.1.1.2 ! misho    3105: md->start_code = (const pcre_uchar *)argument_re +
1.1       misho    3106:     re->name_table_offset + re->name_count * re->name_entry_size;
1.1.1.2 ! misho    3107: md->start_subject = (const pcre_uchar *)subject;
1.1       misho    3108: md->end_subject = end_subject;
                   3109: md->start_offset = start_offset;
                   3110: md->moptions = options;
                   3111: md->poptions = re->options;
                   3112: 
                   3113: /* If the BSR option is not set at match time, copy what was set
                   3114: at compile time. */
                   3115: 
                   3116: if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
                   3117:   {
                   3118:   if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
                   3119:     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
                   3120: #ifdef BSR_ANYCRLF
                   3121:   else md->moptions |= PCRE_BSR_ANYCRLF;
                   3122: #endif
                   3123:   }
                   3124: 
                   3125: /* Handle different types of newline. The three bits give eight cases. If
                   3126: nothing is set at run time, whatever was used at compile time applies. */
                   3127: 
                   3128: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
                   3129:          PCRE_NEWLINE_BITS)
                   3130:   {
                   3131:   case 0: newline = NEWLINE; break;   /* Compile-time default */
                   3132:   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
                   3133:   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
                   3134:   case PCRE_NEWLINE_CR+
                   3135:        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
                   3136:   case PCRE_NEWLINE_ANY: newline = -1; break;
                   3137:   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
                   3138:   default: return PCRE_ERROR_BADNEWLINE;
                   3139:   }
                   3140: 
                   3141: if (newline == -2)
                   3142:   {
                   3143:   md->nltype = NLTYPE_ANYCRLF;
                   3144:   }
                   3145: else if (newline < 0)
                   3146:   {
                   3147:   md->nltype = NLTYPE_ANY;
                   3148:   }
                   3149: else
                   3150:   {
                   3151:   md->nltype = NLTYPE_FIXED;
                   3152:   if (newline > 255)
                   3153:     {
                   3154:     md->nllen = 2;
                   3155:     md->nl[0] = (newline >> 8) & 255;
                   3156:     md->nl[1] = newline & 255;
                   3157:     }
                   3158:   else
                   3159:     {
                   3160:     md->nllen = 1;
                   3161:     md->nl[0] = newline;
                   3162:     }
                   3163:   }
                   3164: 
                   3165: /* Check a UTF string if required. For an invalid string, pass back the error
                   3166: offset and error code in the first two offsets if there is room for them. */
                   3167: 
1.1.1.2 ! misho    3168: #ifdef SUPPORT_UTF
        !          3169: if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
1.1       misho    3170:   {
                   3171:   int erroroffset;
1.1.1.2 ! misho    3172:   int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
1.1       misho    3173:   if (errorcode != 0)
                   3174:     {
                   3175:     if (offsetcount >= 2)
                   3176:       {
                   3177:       offsets[0] = erroroffset;
                   3178:       offsets[1] = errorcode;
                   3179:       }
                   3180:     return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
                   3181:       PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
                   3182:     }
                   3183:   if (start_offset > 0 && start_offset < length &&
1.1.1.2 ! misho    3184:         NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
1.1       misho    3185:     return PCRE_ERROR_BADUTF8_OFFSET;
                   3186:   }
                   3187: #endif
                   3188: 
                   3189: /* If the exec call supplied NULL for tables, use the inbuilt ones. This
                   3190: is a feature that makes it possible to save compiled regex and re-use them
                   3191: in other programs later. */
                   3192: 
1.1.1.2 ! misho    3193: if (md->tables == NULL) md->tables = PRIV(default_tables);
1.1       misho    3194: 
1.1.1.2 ! misho    3195: /* The "must be at the start of a line" flags are used in a loop when finding
        !          3196: where to start. */
1.1       misho    3197: 
                   3198: startline = (re->flags & PCRE_STARTLINE) != 0;
                   3199: firstline = (re->options & PCRE_FIRSTLINE) != 0;
                   3200: 
                   3201: /* Set up the first character to match, if available. The first_char value is
                   3202: never set for an anchored regular expression, but the anchoring may be forced
                   3203: at run time, so we have to test for anchoring. The first char may be unset for
                   3204: an unanchored pattern, of course. If there's no first char and the pattern was
                   3205: studied, there may be a bitmap of possible first characters. */
                   3206: 
                   3207: if (!anchored)
                   3208:   {
                   3209:   if ((re->flags & PCRE_FIRSTSET) != 0)
                   3210:     {
1.1.1.2 ! misho    3211:     has_first_char = TRUE;
        !          3212:     first_char = first_char2 = (pcre_uchar)(re->first_char);
        !          3213:     if ((re->flags & PCRE_FCH_CASELESS) != 0)
        !          3214:       {
        !          3215:       first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
        !          3216: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
        !          3217:       if (utf && first_char > 127)
        !          3218:         first_char2 = UCD_OTHERCASE(first_char);
        !          3219: #endif
        !          3220:       }
1.1       misho    3221:     }
                   3222:   else
                   3223:     {
                   3224:     if (!startline && study != NULL &&
                   3225:          (study->flags & PCRE_STUDY_MAPPED) != 0)
                   3226:       start_bits = study->start_bits;
                   3227:     }
                   3228:   }
                   3229: 
                   3230: /* For anchored or unanchored matches, there may be a "last known required
                   3231: character" set. */
                   3232: 
                   3233: if ((re->flags & PCRE_REQCHSET) != 0)
                   3234:   {
1.1.1.2 ! misho    3235:   has_req_char = TRUE;
        !          3236:   req_char = req_char2 = (pcre_uchar)(re->req_char);
        !          3237:   if ((re->flags & PCRE_RCH_CASELESS) != 0)
        !          3238:     {
        !          3239:     req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
        !          3240: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
        !          3241:     if (utf && req_char > 127)
        !          3242:       req_char2 = UCD_OTHERCASE(req_char);
        !          3243: #endif
        !          3244:     }
1.1       misho    3245:   }
                   3246: 
                   3247: /* Call the main matching function, looping for a non-anchored regex after a
                   3248: failed match. If not restarting, perform certain optimizations at the start of
                   3249: a match. */
                   3250: 
                   3251: for (;;)
                   3252:   {
                   3253:   int rc;
                   3254: 
                   3255:   if ((options & PCRE_DFA_RESTART) == 0)
                   3256:     {
1.1.1.2 ! misho    3257:     const pcre_uchar *save_end_subject = end_subject;
1.1       misho    3258: 
                   3259:     /* If firstline is TRUE, the start of the match is constrained to the first
                   3260:     line of a multiline string. Implement this by temporarily adjusting
                   3261:     end_subject so that we stop scanning at a newline. If the match fails at
                   3262:     the newline, later code breaks this loop. */
                   3263: 
                   3264:     if (firstline)
                   3265:       {
1.1.1.2 ! misho    3266:       PCRE_PUCHAR t = current_subject;
        !          3267: #ifdef SUPPORT_UTF
        !          3268:       if (utf)
1.1       misho    3269:         {
                   3270:         while (t < md->end_subject && !IS_NEWLINE(t))
                   3271:           {
                   3272:           t++;
1.1.1.2 ! misho    3273:           ACROSSCHAR(t < end_subject, *t, t++);
1.1       misho    3274:           }
                   3275:         }
                   3276:       else
                   3277: #endif
                   3278:       while (t < md->end_subject && !IS_NEWLINE(t)) t++;
                   3279:       end_subject = t;
                   3280:       }
                   3281: 
                   3282:     /* There are some optimizations that avoid running the match if a known
                   3283:     starting point is not found. However, there is an option that disables
                   3284:     these, for testing and for ensuring that all callouts do actually occur.
                   3285:     The option can be set in the regex by (*NO_START_OPT) or passed in
                   3286:     match-time options. */
                   3287: 
                   3288:     if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
                   3289:       {
1.1.1.2 ! misho    3290:       /* Advance to a known first char. */
1.1       misho    3291: 
1.1.1.2 ! misho    3292:       if (has_first_char)
1.1       misho    3293:         {
1.1.1.2 ! misho    3294:         if (first_char != first_char2)
1.1       misho    3295:           while (current_subject < end_subject &&
1.1.1.2 ! misho    3296:               *current_subject != first_char && *current_subject != first_char2)
1.1       misho    3297:             current_subject++;
                   3298:         else
                   3299:           while (current_subject < end_subject &&
1.1.1.2 ! misho    3300:                  *current_subject != first_char)
1.1       misho    3301:             current_subject++;
                   3302:         }
                   3303: 
                   3304:       /* Or to just after a linebreak for a multiline match if possible */
                   3305: 
                   3306:       else if (startline)
                   3307:         {
                   3308:         if (current_subject > md->start_subject + start_offset)
                   3309:           {
1.1.1.2 ! misho    3310: #ifdef SUPPORT_UTF
        !          3311:           if (utf)
1.1       misho    3312:             {
                   3313:             while (current_subject < end_subject &&
                   3314:                    !WAS_NEWLINE(current_subject))
                   3315:               {
                   3316:               current_subject++;
1.1.1.2 ! misho    3317:               ACROSSCHAR(current_subject < end_subject, *current_subject,
        !          3318:                 current_subject++);
1.1       misho    3319:               }
                   3320:             }
                   3321:           else
                   3322: #endif
                   3323:           while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
                   3324:             current_subject++;
                   3325: 
                   3326:           /* If we have just passed a CR and the newline option is ANY or
                   3327:           ANYCRLF, and we are now at a LF, advance the match position by one
                   3328:           more character. */
                   3329: 
                   3330:           if (current_subject[-1] == CHAR_CR &&
                   3331:                (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
                   3332:                current_subject < end_subject &&
                   3333:                *current_subject == CHAR_NL)
                   3334:             current_subject++;
                   3335:           }
                   3336:         }
                   3337: 
                   3338:       /* Or to a non-unique first char after study */
                   3339: 
                   3340:       else if (start_bits != NULL)
                   3341:         {
                   3342:         while (current_subject < end_subject)
                   3343:           {
                   3344:           register unsigned int c = *current_subject;
1.1.1.2 ! misho    3345: #ifndef COMPILE_PCRE8
        !          3346:           if (c > 255) c = 255;
        !          3347: #endif
1.1       misho    3348:           if ((start_bits[c/8] & (1 << (c&7))) == 0)
                   3349:             {
                   3350:             current_subject++;
1.1.1.2 ! misho    3351: #if defined SUPPORT_UTF && defined COMPILE_PCRE8
        !          3352:             /* In non 8-bit mode, the iteration will stop for
        !          3353:             characters > 255 at the beginning or not stop at all. */
        !          3354:             if (utf)
        !          3355:               ACROSSCHAR(current_subject < end_subject, *current_subject,
        !          3356:                 current_subject++);
1.1       misho    3357: #endif
                   3358:             }
                   3359:           else break;
                   3360:           }
                   3361:         }
                   3362:       }
                   3363: 
                   3364:     /* Restore fudged end_subject */
                   3365: 
                   3366:     end_subject = save_end_subject;
                   3367: 
                   3368:     /* The following two optimizations are disabled for partial matching or if
                   3369:     disabling is explicitly requested (and of course, by the test above, this
                   3370:     code is not obeyed when restarting after a partial match). */
                   3371: 
                   3372:     if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
                   3373:         (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
                   3374:       {
                   3375:       /* If the pattern was studied, a minimum subject length may be set. This
                   3376:       is a lower bound; no actual string of that length may actually match the
                   3377:       pattern. Although the value is, strictly, in characters, we treat it as
                   3378:       bytes to avoid spending too much time in this optimization. */
                   3379: 
                   3380:       if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
                   3381:           (pcre_uint32)(end_subject - current_subject) < study->minlength)
                   3382:         return PCRE_ERROR_NOMATCH;
                   3383: 
1.1.1.2 ! misho    3384:       /* If req_char is set, we know that that character must appear in the
        !          3385:       subject for the match to succeed. If the first character is set, req_char
1.1       misho    3386:       must be later in the subject; otherwise the test starts at the match
                   3387:       point. This optimization can save a huge amount of work in patterns with
                   3388:       nested unlimited repeats that aren't going to match. Writing separate
                   3389:       code for cased/caseless versions makes it go faster, as does using an
                   3390:       autoincrement and backing off on a match.
                   3391: 
                   3392:       HOWEVER: when the subject string is very, very long, searching to its end
                   3393:       can take a long time, and give bad performance on quite ordinary
                   3394:       patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
                   3395:       string... so we don't do this when the string is sufficiently long. */
                   3396: 
1.1.1.2 ! misho    3397:       if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
1.1       misho    3398:         {
1.1.1.2 ! misho    3399:         register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
1.1       misho    3400: 
                   3401:         /* We don't need to repeat the search if we haven't yet reached the
                   3402:         place we found it at last time. */
                   3403: 
1.1.1.2 ! misho    3404:         if (p > req_char_ptr)
1.1       misho    3405:           {
1.1.1.2 ! misho    3406:           if (req_char != req_char2)
1.1       misho    3407:             {
                   3408:             while (p < end_subject)
                   3409:               {
                   3410:               register int pp = *p++;
1.1.1.2 ! misho    3411:               if (pp == req_char || pp == req_char2) { p--; break; }
1.1       misho    3412:               }
                   3413:             }
                   3414:           else
                   3415:             {
                   3416:             while (p < end_subject)
                   3417:               {
1.1.1.2 ! misho    3418:               if (*p++ == req_char) { p--; break; }
1.1       misho    3419:               }
                   3420:             }
                   3421: 
                   3422:           /* If we can't find the required character, break the matching loop,
                   3423:           which will cause a return of PCRE_ERROR_NOMATCH. */
                   3424: 
                   3425:           if (p >= end_subject) break;
                   3426: 
                   3427:           /* If we have found the required character, save the point where we
                   3428:           found it, so that we don't search again next time round the loop if
                   3429:           the start hasn't passed this character yet. */
                   3430: 
1.1.1.2 ! misho    3431:           req_char_ptr = p;
1.1       misho    3432:           }
                   3433:         }
                   3434:       }
                   3435:     }   /* End of optimizations that are done when not restarting */
                   3436: 
                   3437:   /* OK, now we can do the business */
                   3438: 
                   3439:   md->start_used_ptr = current_subject;
                   3440:   md->recursive = NULL;
                   3441: 
                   3442:   rc = internal_dfa_exec(
                   3443:     md,                                /* fixed match data */
                   3444:     md->start_code,                    /* this subexpression's code */
                   3445:     current_subject,                   /* where we currently are */
                   3446:     start_offset,                      /* start offset in subject */
                   3447:     offsets,                           /* offset vector */
                   3448:     offsetcount,                       /* size of same */
                   3449:     workspace,                         /* workspace vector */
                   3450:     wscount,                           /* size of same */
                   3451:     0);                                /* function recurse level */
                   3452: 
                   3453:   /* Anything other than "no match" means we are done, always; otherwise, carry
                   3454:   on only if not anchored. */
                   3455: 
                   3456:   if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
                   3457: 
                   3458:   /* Advance to the next subject character unless we are at the end of a line
                   3459:   and firstline is set. */
                   3460: 
                   3461:   if (firstline && IS_NEWLINE(current_subject)) break;
                   3462:   current_subject++;
1.1.1.2 ! misho    3463: #ifdef SUPPORT_UTF
        !          3464:   if (utf)
1.1       misho    3465:     {
1.1.1.2 ! misho    3466:     ACROSSCHAR(current_subject < end_subject, *current_subject,
        !          3467:       current_subject++);
1.1       misho    3468:     }
1.1.1.2 ! misho    3469: #endif
1.1       misho    3470:   if (current_subject > end_subject) break;
                   3471: 
                   3472:   /* If we have just passed a CR and we are now at a LF, and the pattern does
                   3473:   not contain any explicit matches for \r or \n, and the newline option is CRLF
                   3474:   or ANY or ANYCRLF, advance the match position by one more character. */
                   3475: 
                   3476:   if (current_subject[-1] == CHAR_CR &&
                   3477:       current_subject < end_subject &&
                   3478:       *current_subject == CHAR_NL &&
                   3479:       (re->flags & PCRE_HASCRORLF) == 0 &&
                   3480:         (md->nltype == NLTYPE_ANY ||
                   3481:          md->nltype == NLTYPE_ANYCRLF ||
                   3482:          md->nllen == 2))
                   3483:     current_subject++;
                   3484: 
                   3485:   }   /* "Bumpalong" loop */
                   3486: 
                   3487: return PCRE_ERROR_NOMATCH;
                   3488: }
                   3489: 
                   3490: /* End of pcre_dfa_exec.c */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>