File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / pcre / pcre_compile.c
Revision 1.1.1.5 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Sun Jun 15 19:46:03 2014 UTC (10 years ago) by misho
Branches: pcre, MAIN
CVS tags: v8_34, HEAD
pcre 8.34

    1: /*************************************************
    2: *      Perl-Compatible Regular Expressions       *
    3: *************************************************/
    4: 
    5: /* PCRE is a library of functions to support regular expressions whose syntax
    6: and semantics are as close as possible to those of the Perl 5 language.
    7: 
    8:                        Written by Philip Hazel
    9:            Copyright (c) 1997-2013 University of Cambridge
   10: 
   11: -----------------------------------------------------------------------------
   12: Redistribution and use in source and binary forms, with or without
   13: modification, are permitted provided that the following conditions are met:
   14: 
   15:     * Redistributions of source code must retain the above copyright notice,
   16:       this list of conditions and the following disclaimer.
   17: 
   18:     * Redistributions in binary form must reproduce the above copyright
   19:       notice, this list of conditions and the following disclaimer in the
   20:       documentation and/or other materials provided with the distribution.
   21: 
   22:     * Neither the name of the University of Cambridge nor the names of its
   23:       contributors may be used to endorse or promote products derived from
   24:       this software without specific prior written permission.
   25: 
   26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
   30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   36: POSSIBILITY OF SUCH DAMAGE.
   37: -----------------------------------------------------------------------------
   38: */
   39: 
   40: 
   41: /* This module contains the external function pcre_compile(), along with
   42: supporting internal functions that are not used by other modules. */
   43: 
   44: 
   45: #ifdef HAVE_CONFIG_H
   46: #include "config.h"
   47: #endif
   48: 
   49: #define NLBLOCK cd             /* Block containing newline information */
   50: #define PSSTART start_pattern  /* Field containing processed string start */
   51: #define PSEND   end_pattern    /* Field containing processed string end */
   52: 
   53: #include "pcre_internal.h"
   54: 
   55: 
   56: /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
   57: is also used by pcretest. PCRE_DEBUG is not defined when building a production
   58: library. We do not need to select pcre16_printint.c specially, because the
   59: COMPILE_PCREx macro will already be appropriately set. */
   60: 
   61: #ifdef PCRE_DEBUG
   62: /* pcre_printint.c should not include any headers */
   63: #define PCRE_INCLUDED
   64: #include "pcre_printint.c"
   65: #undef PCRE_INCLUDED
   66: #endif
   67: 
   68: 
   /* Macro for setting individual bits in class bitmaps: sets bit number b in
   the bitmap a, which is addressed in 8-bit units. Both arguments and the whole
   expansion are parenthesized so that the macro is safe with expression
   arguments and inside larger expressions, and the shift is done on an unsigned
   value. Note that b is evaluated twice. */

   #define SETBIT(a,b) ((a)[(b)/8] |= (1u << ((b)&7)))
   72: 
   73: /* Maximum length value to check against when making sure that the integer that
   74: holds the compiled pattern length does not overflow. We make it a bit less than
   75: INT_MAX to allow for adding in group terminating bytes, so that we don't have
   76: to check them every time. */
   77: 
   78: #define OFLOW_MAX (INT_MAX - 20)  /* 20 = headroom kept for group terminating bytes */
   79: 
   80: /* Definitions to allow mutual recursion */
   81: 
   82: static int
   83:   add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
   84:     const pcre_uint32 *, unsigned int);  /* prototype only; the body is not in this part of the file */
   85: 
   86: static BOOL
   87:   compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
   88:     pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
   89:     compile_data *, int *);  /* prototype only; the body is not in this part of the file */
   90: 
   91: 
   92: 
   93: /*************************************************
   94: *      Code parameters and static tables         *
   95: *************************************************/
   96: 
   97: /* This value specifies the size of stack workspace that is used during the
   98: first pre-compile phase that determines how much memory is required. The regex
   99: is partly compiled into this space, but the compiled parts are discarded as
  100: soon as they can be, so that hopefully there will never be an overrun. The code
  101: does, however, check for an overrun. The largest amount I've seen used is 218,
  102: so this number is very generous.
  103: 
  104: The same workspace is used during the second, actual compile phase for
  105: remembering forward references to groups so that they can be filled in at the
  106: end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
  107: is 4 there is plenty of room for most patterns. However, the memory can get
  108: filled up by repetitions of forward references, for example patterns like
  109: /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
  110: that the workspace is expanded using malloc() in this situation. The value
  111: below is therefore a minimum, and we put a maximum on it for safety. The
  112: minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
  113: kicks in at the same number of forward references in all cases. */
  114: 
  115: #define COMPILE_WORK_SIZE (2048*LINK_SIZE)  /* the minimum; scales with LINK_SIZE */
  116: #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)  /* cap on malloc() growth: 100 times the minimum */
  117: 
  118: /* This value determines the size of the initial vector that is used for
  119: remembering named groups during the pre-compile. It is allocated on the stack,
  120: but if it is too small, it is expanded using malloc(), in a similar way to the
  121: workspace. The value is the number of slots in the list. */
  122: 
  123: #define NAMED_GROUP_LIST_SIZE  20  /* counted in list slots, not bytes */
  124: 
  125: /* The overrun tests check for a slightly smaller size so that they detect the
  126: overrun before it actually does run off the end of the data block. */
  127: 
  128: #define WORK_SIZE_SAFETY_MARGIN (100)  /* how far short of the real size the overrun tests check */
  129: 
  130: /* Private flags added to firstchar and reqchar. */
  131: 
  132: #define REQ_CASELESS    (1 << 0)        /* Bit 0: indicates caselessness */
  133: #define REQ_VARY        (1 << 1)        /* Bit 1: reqchar followed non-literal item */
  134: /* Negative values for the firstchar and reqchar flags */
  135: #define REQ_UNSET       (-2)            /* negative, so never equal to a combination of the bits above */
  136: #define REQ_NONE        (-1)            /* negative, and distinct from REQ_UNSET */
  137: 
  138: /* Repeated character flags. */
  139: 
  #define UTF_LENGTH     0x10000000L      /* Flag bit: the char contains its length. Upper-case L suffix so it cannot be misread as the digit 1. */
  141: 
  142: /* Table for handling escaped characters in the range '0'-'z'. Positive returns
  143: are simple data values; negative values are for special things like \d and so
  144: on. Zero means further processing is needed (for things like \x), or the escape
  145: is invalid. */
  146: 
  147: #ifndef EBCDIC
  148: 
  149: /* This is the "normal" table for ASCII systems or for EBCDIC systems running
  150: in UTF-8 mode. */
  151: 
  152: static const short int escapes[] = {  /* entry i belongs to the character '0' + i */
  153:      0,                       0,  /* '0' '1' */
  154:      0,                       0,  /* '2' '3' */
  155:      0,                       0,  /* '4' '5' */
  156:      0,                       0,  /* '6' '7' */
  157:      0,                       0,  /* '8' '9' */
  158:      CHAR_COLON,              CHAR_SEMICOLON,  /* ':' ';' */
  159:      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,  /* '<' '=' */
  160:      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,  /* '>' '?' */
  161:      CHAR_COMMERCIAL_AT,      -ESC_A,  /* '@' 'A' */
  162:      -ESC_B,                  -ESC_C,  /* 'B' 'C' */
  163:      -ESC_D,                  -ESC_E,  /* 'D' 'E' */
  164:      0,                       -ESC_G,  /* 'F' 'G' */
  165:      -ESC_H,                  0,  /* 'H' 'I' */
  166:      0,                       -ESC_K,  /* 'J' 'K' */
  167:      0,                       0,  /* 'L' 'M' */
  168:      -ESC_N,                  0,  /* 'N' 'O' */
  169:      -ESC_P,                  -ESC_Q,  /* 'P' 'Q' */
  170:      -ESC_R,                  -ESC_S,  /* 'R' 'S' */
  171:      0,                       0,  /* 'T' 'U' */
  172:      -ESC_V,                  -ESC_W,  /* 'V' 'W' */
  173:      -ESC_X,                  0,  /* 'X' 'Y' */
  174:      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,  /* 'Z' '[' */
  175:      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,  /* '\\' ']' */
  176:      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,  /* '^' '_' */
  177:      CHAR_GRAVE_ACCENT,       7,  /* '`' 'a' */
  178:      -ESC_b,                  0,  /* 'b' 'c' */
  179:      -ESC_d,                  ESC_e,  /* 'd' 'e' */
  180:      ESC_f,                   0,  /* 'f' 'g' */
  181:      -ESC_h,                  0,  /* 'h' 'i' */
  182:      0,                       -ESC_k,  /* 'j' 'k' */
  183:      0,                       0,  /* 'l' 'm' */
  184:      ESC_n,                   0,  /* 'n' 'o' */
  185:      -ESC_p,                  0,  /* 'p' 'q' */
  186:      ESC_r,                   -ESC_s,  /* 'r' 's' */
  187:      ESC_tee,                 0,  /* 't' 'u' */
  188:      -ESC_v,                  -ESC_w,  /* 'v' 'w' */
  189:      0,                       0,  /* 'x' 'y' */
  190:      -ESC_z  /* 'z' */
  191: };
  192: 
  193: #else
  194: 
  195: /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
  196: 
  197: static const short int escapes[] = {  /* row labels are hex character codes; covers 0x48 through 0xFF */
  198: /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
  199: /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
  200: /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
  201: /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
  202: /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
  203: /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
  204: /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
  205: /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
  206: /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
  207: /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
  208: /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
  209: /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
  210: /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
  211: /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
  212: /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
  213: /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
  214: /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
  215: /*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
  216: /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
  217: /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
  218: /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
  219: /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
  220: /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
  221: };
  222: #endif
  223: 
  224: 
  225: /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
  226: searched linearly. Put all the names into a single string, in order to reduce
  227: the number of relocations when a shared library is dynamically linked. The
  228: string is built from string macros so that it works in UTF-8 mode on EBCDIC
  229: platforms. */
  230: 
  231: typedef struct verbitem {   /* one row of verbs[], describing one verb name */
  232:   int   len;                 /* Length of verb name */
  233:   int   op;                  /* Op when no arg, or -1 if arg mandatory */
  234:   int   op_arg;              /* Op when arg present, or -1 if not allowed */
  235: } verbitem;
  236: 
  237: static const char verbnames[] =   /* names in the same order as the rows of verbs[] */
  238:   "\0"                       /* Empty name is a shorthand for MARK */
  239:   STRING_MARK0
  240:   STRING_ACCEPT0
  241:   STRING_COMMIT0
  242:   STRING_F0
  243:   STRING_FAIL0
  244:   STRING_PRUNE0
  245:   STRING_SKIP0
  246:   STRING_THEN;
  247: 
  248: static const verbitem verbs[] = {   /* { name length, op without arg, op with arg } */
  249:   { 0, -1,        OP_MARK },        /* empty name */
  250:   { 4, -1,        OP_MARK },        /* MARK */
  251:   { 6, OP_ACCEPT, -1 },             /* ACCEPT */
  252:   { 6, OP_COMMIT, -1 },             /* COMMIT */
  253:   { 1, OP_FAIL,   -1 },             /* F */
  254:   { 4, OP_FAIL,   -1 },             /* FAIL */
  255:   { 5, OP_PRUNE,  OP_PRUNE_ARG },   /* PRUNE */
  256:   { 4, OP_SKIP,   OP_SKIP_ARG  },   /* SKIP */
  257:   { 4, OP_THEN,   OP_THEN_ARG  }    /* THEN */
  258: };
  259: 
  260: static const int verbcount = sizeof(verbs)/sizeof(verbitem);
  261: 
  262: 
  263: /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
  264: another regex library. */
  265: 
  266: static const pcre_uchar sub_start_of_word[] = {   /* spells \b(?=\w) */
  267:   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
  268:   CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
  269: 
  270: static const pcre_uchar sub_end_of_word[] = {   /* spells \b(?<=\w) */
  271:   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
  272:   CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
  273:   CHAR_RIGHT_PARENTHESIS, '\0' };
  274: 
  275: 
  276: /* Tables of names of POSIX character classes and their lengths. The names are
  277: now all in a single string, to reduce the number of relocations when a shared
  278: library is dynamically loaded. The list of lengths is terminated by a zero
  279: length entry. The first three must be alpha, lower, upper, as this is assumed
  280: for handling case independence. The indices for graph, print, and punct are
  281: needed, so identify them. */
  282: 
  283: static const char posix_names[] =   /* names packed back to back; lengths are in posix_name_lengths */
  284:   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
  285:   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
  286:   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
  287:   STRING_word0  STRING_xdigit;
  288: 
  289: static const pcre_uint8 posix_name_lengths[] = {   /* one length per name, in posix_names order */
  290:   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };   /* word is 4, xdigit is 6; the trailing 0 ends the list */
  291: 
  292: #define PC_GRAPH  8  /* position of graph in posix_names, counting from 0 */
  293: #define PC_PRINT  9  /* position of print */
  294: #define PC_PUNCT 10  /* position of punct */
  295: 
  296: 
  297: /* Table of class bit maps for each POSIX class. Each class is formed from a
  298: base map, with an optional addition or removal of another map. Then, for some
  299: classes, there is some additional tweaking: for [:blank:] the vertical space
  300: characters are removed, and for [:alpha:] and [:alnum:] the underscore
  301: character is removed. The triples in the table consist of the base map offset,
  302: second map offset or -1 if no second map, and a non-negative value for map
  303: addition or a negative value for map subtraction (if there are two maps). The
  304: absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
  305: remove vertical space characters, 2 => remove underscore. */
  306: 
  307: static const int posix_class_maps[] = {   /* three ints per class, classes in posix_names order */
  308:   cbit_word,  cbit_digit, -2,             /* alpha */
  309:   cbit_lower, -1,          0,             /* lower */
  310:   cbit_upper, -1,          0,             /* upper */
  311:   cbit_word,  -1,          2,             /* alnum - word without underscore */
  312:   cbit_print, cbit_cntrl,  0,             /* ascii */
  313:   cbit_space, -1,          1,             /* blank - a GNU extension */
  314:   cbit_cntrl, -1,          0,             /* cntrl */
  315:   cbit_digit, -1,          0,             /* digit */
  316:   cbit_graph, -1,          0,             /* graph */
  317:   cbit_print, -1,          0,             /* print */
  318:   cbit_punct, -1,          0,             /* punct */
  319:   cbit_space, -1,          0,             /* space */
  320:   cbit_word,  -1,          0,             /* word - a Perl extension */
  321:   cbit_xdigit,-1,          0              /* xdigit */
  322: };
  323: 
  324: /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
  325: Unicode property escapes. */
  326: 
  327: #ifdef SUPPORT_UCP
  328: static const pcre_uchar string_PNd[]  = {   /* \P{Nd} */
  329:   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  330:   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  331: static const pcre_uchar string_pNd[]  = {   /* \p{Nd} */
  332:   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  333:   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  334: static const pcre_uchar string_PXsp[] = {   /* \P{Xsp} */
  335:   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  336:   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  337: static const pcre_uchar string_pXsp[] = {   /* \p{Xsp} */
  338:   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  339:   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  340: static const pcre_uchar string_PXwd[] = {   /* \P{Xwd} */
  341:   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  342:   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  343: static const pcre_uchar string_pXwd[] = {   /* \p{Xwd} */
  344:   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  345:   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  346: 
  347: static const pcre_uchar *substitutes[] = {   /* in each pair the negated (upper-case) escape comes first */
  348:   string_PNd,           /* \D */
  349:   string_pNd,           /* \d */
  350:   string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
  351:   string_pXsp,          /* \s */   /* space and POSIX space are the same. */
  352:   string_PXwd,          /* \W */
  353:   string_pXwd           /* \w */
  354: };
  355: 
  356: /* The POSIX class substitutes must be in the order of the POSIX class names,
  357: defined above, and there are both positive and negative cases. NULL means no
  358: general substitute of a Unicode property escape (\p or \P). However, for some
  359: POSIX classes (e.g. graph, print, punct) a special property code is compiled
  360: directly. */
  361: 
  362: static const pcre_uchar string_pL[] =   {   /* \p{L} */
  363:   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  364:   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  365: static const pcre_uchar string_pLl[] =  {   /* \p{Ll} */
  366:   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  367:   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  368: static const pcre_uchar string_pLu[] =  {   /* \p{Lu} */
  369:   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  370:   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  371: static const pcre_uchar string_pXan[] = {   /* \p{Xan} */
  372:   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  373:   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  374: static const pcre_uchar string_h[] =    {   /* \h */
  375:   CHAR_BACKSLASH, CHAR_h, '\0' };
  376: static const pcre_uchar string_pXps[] = {   /* \p{Xps} */
  377:   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  378:   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  379: static const pcre_uchar string_PL[] =   {   /* \P{L} */
  380:   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  381:   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  382: static const pcre_uchar string_PLl[] =  {   /* \P{Ll} */
  383:   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  384:   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  385: static const pcre_uchar string_PLu[] =  {   /* \P{Lu} */
  386:   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  387:   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  388: static const pcre_uchar string_PXan[] = {   /* \P{Xan} */
  389:   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  390:   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  391: static const pcre_uchar string_H[] =    {   /* \H */
  392:   CHAR_BACKSLASH, CHAR_H, '\0' };
  393: static const pcre_uchar string_PXps[] = {   /* \P{Xps} */
  394:   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  395:   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  396: 
  397: static const pcre_uchar *posix_substitutes[] = {   /* 14 positive entries, then the 14 negated ones */
  398:   string_pL,            /* alpha */
  399:   string_pLl,           /* lower */
  400:   string_pLu,           /* upper */
  401:   string_pXan,          /* alnum */
  402:   NULL,                 /* ascii */
  403:   string_h,             /* blank */
  404:   NULL,                 /* cntrl */
  405:   string_pNd,           /* digit */
  406:   NULL,                 /* graph */
  407:   NULL,                 /* print */
  408:   NULL,                 /* punct */
  409:   string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
  410:   string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
  411:   NULL,                 /* xdigit */
  412:   /* Negated cases */
  413:   string_PL,            /* ^alpha */
  414:   string_PLl,           /* ^lower */
  415:   string_PLu,           /* ^upper */
  416:   string_PXan,          /* ^alnum */
  417:   NULL,                 /* ^ascii */
  418:   string_H,             /* ^blank */
  419:   NULL,                 /* ^cntrl */
  420:   string_PNd,           /* ^digit */
  421:   NULL,                 /* ^graph */
  422:   NULL,                 /* ^print */
  423:   NULL,                 /* ^punct */
  424:   string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
  425:   string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
  426:   NULL                  /* ^xdigit */
  427: };
  /* Number of entries in posix_substitutes. Dividing by the size of the array's
  own first element keeps the count right whatever the element type is. */
  #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(posix_substitutes[0]))
  429: #endif
  430: 
  431: #define STRING(a)  # a  /* turns the argument, exactly as written, into a string literal */
  432: #define XSTRING(s) STRING(s)  /* extra level so that a macro argument is expanded before being stringified */
  433: 
  434: /* The texts of compile-time error messages. These are "char *" because they
  435: are passed to the outside world. Do not ever re-use any error number, because
  436: they are documented. Always add a new error instead. Messages marked DEAD below
  437: are no longer used. This used to be a table of strings, but in order to reduce
  438: the number of relocations needed when a shared library is loaded dynamically,
  439: it is now one long string. We cannot use a table of offsets, because the
  440: lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
  441: simply count through to the one we want - this isn't a performance issue
  442: because these strings are used only when there is a compilation error.
  443: 
  444: Each substring ends with \0 to insert a null character. This includes the final
  445: substring, so that the whole string ends with \0\0, which can be detected when
  446: counting through. */
  447: 
  448: static const char error_texts[] =  /* message n is the n-th NUL-terminated substring, counting from 0 */
  449:   "no error\0"
  450:   "\\ at end of pattern\0"
  451:   "\\c at end of pattern\0"
  452:   "unrecognized character follows \\\0"
  453:   "numbers out of order in {} quantifier\0"
  454:   /* 5 */
  455:   "number too big in {} quantifier\0"
  456:   "missing terminating ] for character class\0"
  457:   "invalid escape sequence in character class\0"
  458:   "range out of order in character class\0"
  459:   "nothing to repeat\0"
  460:   /* 10 */
  461:   "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
  462:   "internal error: unexpected repeat\0"
  463:   "unrecognized character after (? or (?-\0"
  464:   "POSIX named classes are supported only within a class\0"
  465:   "missing )\0"
  466:   /* 15 */
  467:   "reference to non-existent subpattern\0"
  468:   "erroffset passed as NULL\0"
  469:   "unknown option bit(s) set\0"
  470:   "missing ) after comment\0"
  471:   "parentheses nested too deeply\0"  /** DEAD **/
  472:   /* 20 */
  473:   "regular expression is too large\0"
  474:   "failed to get memory\0"
  475:   "unmatched parentheses\0"
  476:   "internal error: code overflow\0"
  477:   "unrecognized character after (?<\0"
  478:   /* 25 */
  479:   "lookbehind assertion is not fixed length\0"
  480:   "malformed number or name after (?(\0"
  481:   "conditional group contains more than two branches\0"
  482:   "assertion expected after (?(\0"
  483:   "(?R or (?[+-]digits must be followed by )\0"
  484:   /* 30 */
  485:   "unknown POSIX class name\0"
  486:   "POSIX collating elements are not supported\0"
  487:   "this version of PCRE is compiled without UTF support\0"
  488:   "spare error\0"  /** DEAD **/
  489:   "character value in \\x{} or \\o{} is too large\0"
  490:   /* 35 */
  491:   "invalid condition (?(0)\0"
  492:   "\\C not allowed in lookbehind assertion\0"
  493:   "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
  494:   "number after (?C is > 255\0"
  495:   "closing ) for (?C expected\0"
  496:   /* 40 */
  497:   "recursive call could loop indefinitely\0"
  498:   "unrecognized character after (?P\0"
  499:   "syntax error in subpattern name (missing terminator)\0"
  500:   "two named subpatterns have the same name\0"
  501:   "invalid UTF-8 string\0"
  502:   /* 45 */
  503:   "support for \\P, \\p, and \\X has not been compiled\0"
  504:   "malformed \\P or \\p sequence\0"
  505:   "unknown property name after \\P or \\p\0"
  506:   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  507:   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  508:   /* 50 */
  509:   "repeated subpattern is too long\0"    /** DEAD **/
  510:   "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
  511:   "internal error: overran compiling workspace\0"
  512:   "internal error: previously-checked referenced subpattern not found\0"
  513:   "DEFINE group contains more than one branch\0"
  514:   /* 55 */
  515:   "repeating a DEFINE group is not allowed\0"  /** DEAD **/
  516:   "inconsistent NEWLINE options\0"
  517:   "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
  518:   "a numbered reference must not be zero\0"
  519:   "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
  520:   /* 60 */
  521:   "(*VERB) not recognized or malformed\0"
  522:   "number is too big\0"
  523:   "subpattern name expected\0"
  524:   "digit expected after (?+\0"
  525:   "] is an invalid data character in JavaScript compatibility mode\0"
  526:   /* 65 */
  527:   "different names for subpatterns of the same number are not allowed\0"
  528:   "(*MARK) must have an argument\0"
  529:   "this version of PCRE is not compiled with Unicode property support\0"
  530:   "\\c must be followed by an ASCII character\0"
  531:   "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
  532:   /* 70 */
  533:   "internal error: unknown opcode in find_fixedlength()\0"
  534:   "\\N is not supported in a class\0"
  535:   "too many forward references\0"
  536:   "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
  537:   "invalid UTF-16 string\0"
  538:   /* 75 */
  539:   "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
  540:   "character value in \\u.... sequence is too large\0"
  541:   "invalid UTF-32 string\0"
  542:   "setting UTF is disabled by the application\0"
  543:   "non-hex character in \\x{} (closing brace missing?)\0"
  544:   /* 80 */
  545:   "non-octal character in \\o{} (closing brace missing?)\0"
  546:   "missing opening brace after \\o\0"
  547:   "parentheses are too deeply nested\0"
  548:   "invalid range in character class\0"
  549:   "group name must start with a non-digit\0"
  550:   ;  /* the literal's implicit NUL is the second of the two NULs that mark the end */
  551: 
  552: /* Table to identify digits and hex digits. This is used when compiling
  553: patterns. Note that the tables in chartables are dependent on the locale, and
  554: may mark arbitrary characters as digits - but the PCRE compiling code expects
  555: to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
  556: a private table here. It costs 256 bytes, but it is a lot faster than doing
  557: character value tests (at least in some simple cases I timed), and in some
  558: applications one wants PCRE to compile efficiently as well as match
  559: efficiently.
  560: 
  561: For convenience, we use the same bit definitions as in chartables:
  562: 
  563:   0x04   decimal digit
  564:   0x08   hexadecimal digit
  565: 
  566: Then we can use ctype_digit and ctype_xdigit in the code. */
  567: 
  568: /* Using a simple comparison for decimal numbers rather than a memory read
  569: is much faster, and the resulting code is simpler (the compiler turns it
  570: into a subtraction and unsigned comparison). */
  571: 
  572: #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)  /* note: evaluates x twice */
  573: 
  574: #ifndef EBCDIC
  575: 
  576: /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
  577: UTF-8 mode. */
  578: 
  579: static const pcre_uint8 digitab[] =  /* 256 entries: 0x0c for 0-9, 0x08 for A-F and a-f, otherwise 0 */
  580:   {
  581:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  582:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  583:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  584:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  585:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  586:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  587:   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  588:   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  589:   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  590:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  591:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  592:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  593:   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  594:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  595:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  596:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  597:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  598:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  599:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  600:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  601:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  602:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  603:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  604:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  605:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  606:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  607:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  608:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  609:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  610:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  611:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  612:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
  613: 
  614: #else
  615: 
  616: /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
  617: 
  618: static const pcre_uint8 digitab[] =  /* 256 entries; last column of each row comment is the hex code of its first entry */
  619:   {
  620:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  621:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  622:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  623:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  624:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  625:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  626:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  627:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  628:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  629:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  630:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  631:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  632:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  633:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  634:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  635:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  636:   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  637:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  638:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  639:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  640:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  641:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  642:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  643:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  644:   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  645:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  646:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  647:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  648:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  649:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  650:   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  651:   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
  652: /* Flags by EBCDIC code; check_escape() takes (entry & 0x0E) == 0 as "not alphanumeric". */
  653: static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
  654:   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  655:   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  656:   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  657:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  658:   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  659:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  660:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  661:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  662:   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  663:   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  664:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  665:   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  666:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  667:   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  668:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  669:   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  670:   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  671:   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  672:   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  673:   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  674:   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  675:   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  676:   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  677:   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  678:   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  679:   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  680:   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  681:   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  682:   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  683:   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  684:   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  685:   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
  686: #endif
  687: 
  688: 
  689: /* This table is used to check whether auto-possessification is possible
  690: between adjacent character-type opcodes. The left-hand (repeated) opcode is
  691: used to select the row, and the right-hand opcode is used to select the column.
  692: A value of 1 means that auto-possessification is OK. For example, the second
  693: value in the first row means that \D+\d can be turned into \D++\d.
  694: 
  695: The Unicode property types (\P and \p) have to be present to fill out the table
  696: because of what their opcode values are, but the table values should always be
  697: zero because property types are handled separately in the code. The last four
  698: columns apply to items that cannot be repeated, so there is no need to have
  699: rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
  700: *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
  701: 
  702: #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)   /* row count */
  703: #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)  /* column count */
  704: /* 17 rows (\D..\X) of 21 flags (\D..$M); presumably indexed by opcode - FIRST_AUTOTAB_OP. */
  705: static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {   /* [left][right] */
  706: /* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
  707:   { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
  708:   { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
  709:   { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
  710:   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
  711:   { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
  712:   { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
  713:   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
  714:   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
  715:   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
  716:   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
  717:   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
  718:   { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
  719:   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
  720:   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
  721:   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
  722:   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
  723:   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
  724: };
  725: 
  726: 
  727: /* This table is used to check whether auto-possessification is possible
  728: between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
  729: left-hand (repeated) opcode is used to select the row, and the right-hand
  730: opcode is used to select the column. The values are as follows:
  731: 
  732:   0   Always return FALSE (never auto-possessify)
  733:   1   Character groups are distinct (possessify if both are OP_PROP)
  734:   2   Check character categories in the same group (general or particular)
  735:   3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
  736: 
  737:   4   Check left general category vs right particular category
  738:   5   Check right general category vs left particular category
  739: 
  740:   6   Left alphanum vs right general category
  741:   7   Left space vs right general category
  742:   8   Left word vs right general category
  743: 
  744:   9   Right alphanum vs left general category
  745:  10   Right space vs left general category
  746:  11   Right word vs left general category
  747: 
  748:  12   Left alphanum vs right particular category
  749:  13   Left space vs right particular category
  750:  14   Left word vs right particular category
  751: 
  752:  15   Right alphanum vs left particular category
  753:  16   Right space vs left particular category
  754:  17   Right word vs left particular category
  755: */
  756: 
  757: static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {  /* [left type][right type] -> action code */
  758: /* ANY LAMP GC  PC  SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
  759:   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_ANY */
  760:   { 0,  3,  0,  0,  0,    3,    1,      1,   0,    0,   0 },  /* PT_LAMP */
  761:   { 0,  0,  2,  4,  0,    9,   10,     10,  11,    0,   0 },  /* PT_GC */
  762:   { 0,  0,  5,  2,  0,   15,   16,     16,  17,    0,   0 },  /* PT_PC */
  763:   { 0,  0,  0,  0,  2,    0,    0,      0,   0,    0,   0 },  /* PT_SC */
  764:   { 0,  3,  6, 12,  0,    3,    1,      1,   0,    0,   0 },  /* PT_ALNUM */
  765:   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_SPACE */
  766:   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_PXSPACE (row identical to PT_SPACE) */
  767:   { 0,  0,  8, 14,  0,    0,    1,      1,   3,    0,   0 },  /* PT_WORD */
  768:   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_CLIST */
  769:   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   3 }   /* PT_UCNC */
  770: };
  771: 
  772: /* This table is used to check whether auto-possessification is possible
  773: between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
  774: specifies a general category and the other specifies a particular category. The
  775: row is selected by the general category and the column by the particular
  776: category. The value is 1 if the particular category is not part of the general
  777: category. */
  778: 
  779: static const pcre_uint8 catposstab[7][30] = {  /* [general category][particular category]; 1 = not a member */
  780: /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
  781:   { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
  782:   { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
  783:   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
  784:   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
  785:   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
  786:   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
  787:   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
  788: };
  789: 
  790: /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
  791: a general or particular category. The properties in each row are those
  792: that apply to the character set in question. Duplication means that a little
  793: unnecessary work is done when checking, but this keeps things much simpler
  794: because they can all use the same code. For more details see the comment where
  795: this table is used.
  796: 
  797: Note: SPACE and PXSPACE used to be different because Perl excluded VT from
  798: "space", but from Perl 5.18 it's included, so both categories are treated the
  799: same here. */
  800: 
  801: static const pcre_uint8 posspropstab[3][4] = {  /* one row of four ucp_ values per character set */
  802:   { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
  803:   { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
  804:   { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
  805: };  /* NOTE(review): what each column position means is explained where the table is used, outside this view */
  806: 
  807: /* This table is used when converting repeating opcodes into possessified
  808: versions as a result of an explicit possessive quantifier such as ++. A zero
  809: value means there is no possessified version - in those cases the item in
  810: question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
  811: because all relevant opcodes are less than that. */
  812: 
  813: static const pcre_uint8 opcode_possessify[] = {  /* repeat opcode -> its possessive opcode, or 0 if none */
  814:   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
  815:   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
  816: 
  817:   0,                       /* NOTI */
  818:   OP_POSSTAR, 0,           /* STAR, MINSTAR */
  819:   OP_POSPLUS, 0,           /* PLUS, MINPLUS */
  820:   OP_POSQUERY, 0,          /* QUERY, MINQUERY */
  821:   OP_POSUPTO, 0,           /* UPTO, MINUPTO */
  822:   0,                       /* EXACT */
  823:   0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
  824: 
  825:   OP_POSSTARI, 0,          /* STARI, MINSTARI */
  826:   OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
  827:   OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
  828:   OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
  829:   0,                       /* EXACTI */
  830:   0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
  831: 
  832:   OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
  833:   OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
  834:   OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
  835:   OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
  836:   0,                       /* NOTEXACT */
  837:   0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
  838: 
  839:   OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
  840:   OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
  841:   OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
  842:   OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
  843:   0,                       /* NOTEXACTI */
  844:   0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
  845: 
  846:   OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
  847:   OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
  848:   OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
  849:   OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
  850:   0,                       /* TYPEEXACT */
  851:   0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
  852: 
  853:   OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
  854:   OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
  855:   OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
  856:   OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
  857:   0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
  858: 
  859:   0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
  860:   0, 0,                    /* REF, REFI */
  861:   0, 0,                    /* DNREF, DNREFI */
  862:   0, 0                     /* RECURSE, CALLOUT */
  863: };  /* NOTE(review): entry order must match the opcode numbering, which is defined outside this view */
  864: 
  865: 
  866: 
  867: /*************************************************
  868: *            Find an error text                  *
  869: *************************************************/
  870: 
  871: /* The error texts are now all in one long string, to save on relocations. As
  872: some of the text is of unknown length, we can't use a table of offsets.
  873: Instead, just count through the strings. This is not a performance issue
  874: because it happens only when there has been a compilation error.
  875: 
  876: Argument:   the error number
  877: Returns:    pointer to the error string
  878: */
  879: 
  880: static const char *
  881: find_error_text(int n)
  882: {
  883: const char *s = error_texts;
  884: for (; n > 0; n--)
  885:   {
  886:   while (*s++ != CHAR_NULL) {};
  887:   if (*s == CHAR_NULL) return "Error text not found (please report)";
  888:   }
  889: return s;
  890: }
  891: 
  892: 
  893: 
  894: /*************************************************
  895: *           Expand the workspace                 *
  896: *************************************************/
  897: 
  898: /* Called in the second compile phase when forward references have filled the
  899: workspace, which starts out as a block on the stack. The size is doubled,
  900: capped at COMPILE_WORK_SIZE_MAX, and the replacement comes from malloc().
  901: 
  902: Argument: pointer to the compile data block
  903: Returns:  0 if all went well; ERR72 if the limit has been reached or the
  904:           gain would be too small; ERR21 if malloc() fails
  905: */
  906: 
  907: static int
  908: expand_workspace(compile_data *cd)
  909: {
  910: int newsize = cd->workspace_size * 2;
  911: pcre_uchar *bigger;
  912: 
  913: /* Refuse when already at the limit, or when the capped size gains too little */
  914: if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX) return ERR72;
  915: if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
  916: if (newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN) return ERR72;
  917: 
  918: bigger = (PUBL(malloc))(IN_UCHARS(newsize));
  919: if (bigger == NULL) return ERR21;
  920: /* Copy the old contents and re-base hwm; the old block is freed only when
  921: its size exceeds COMPILE_WORK_SIZE */
  922: memcpy(bigger, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
  923: cd->hwm = bigger + (cd->hwm - cd->start_workspace);
  924: if (cd->workspace_size > COMPILE_WORK_SIZE) (PUBL(free))((void *)cd->start_workspace);
  925: cd->start_workspace = bigger;
  926: cd->workspace_size = newsize;
  927: return 0; }
  928: 
  929: 
  930: 
  931: /*************************************************
  932: *            Check for counted repeat            *
  933: *************************************************/
  934: 
  935: /* Called when '{' turns up where a quantifier could begin, to look ahead and
  936: decide whether it really starts one. Only the forms {ddd}, {ddd,} and
  937: {ddd,ddd} qualify, where each ddd is one or more digits.
  938: 
  939: Arguments:
  940:   p         pointer to the first char after '{'
  941: 
  942: Returns:    TRUE or FALSE
  943: */
  944: 
  945: static BOOL
  946: is_counted_repeat(const pcre_uchar *p)
  947: {
  948: /* A first number is mandatory */
  949: if (!IS_DIGIT(*p)) return FALSE;
  950: do p++; while (IS_DIGIT(*p));
  951: 
  952: /* {ddd} ends here; otherwise only a comma may follow */
  953: if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
  954: if (*p != CHAR_COMMA) return FALSE;
  955: p++;
  956: 
  957: /* {ddd,} or {ddd,ddd} */
  958: if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
  959: if (!IS_DIGIT(*p)) return FALSE;
  960: do p++; while (IS_DIGIT(*p));
  961: return (*p == CHAR_RIGHT_CURLY_BRACKET);
  962: }
  963: 
  964: 
  965: 
  966: /*************************************************
  967: *            Handle escapes                      *
  968: *************************************************/
  969: 
  970: /* This function is called when a \ has been encountered. It either returns a
  971: positive value for a simple escape such as \n, or 0 for a data character which
  972: will be placed in chptr. A backreference to group n is returned as negative n.
  973: When UTF-8 is enabled, a positive value greater than 255 may be returned in
  974: chptr. On entry, ptr is pointing at the \. On exit, it is on the final
  975: character of the escape sequence.
  976: 
  977: Arguments:
  978:   ptrptr         points to the pattern position pointer
  979:   chptr          points to a returned data character
  980:   errorcodeptr   points to the errorcode variable
  981:   bracount       number of previous extracting brackets
  982:   options        the options bits
  983:   isclass        TRUE if inside a character class
  984: 
  985: Returns:         zero => a data character
  986:                  positive => a special escape sequence
  987:                  negative => a back reference
  988:                  on error, errorcodeptr is set
  989: */
  990: 
  991: static int
  992: check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
  993:   int bracount, int options, BOOL isclass)
  994: {
  995: /* PCRE_UTF16 has the same value as PCRE_UTF8. */
  996: BOOL utf = (options & PCRE_UTF8) != 0;
  997: const pcre_uchar *ptr = *ptrptr + 1;
  998: pcre_uint32 c;
  999: int escape = 0;
 1000: int i;
 1001: 
 1002: GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
 1003: ptr--;                            /* Set pointer back to the last byte */
 1004: 
 1005: /* If backslash is at the end of the pattern, it's an error. */
 1006: 
 1007: if (c == CHAR_NULL) *errorcodeptr = ERR1;
 1008: 
 1009: /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
 1010: in a table. A non-zero result is something that can be returned immediately.
 1011: Otherwise further processing may be required. */
 1012: 
 1013: #ifndef EBCDIC  /* ASCII/UTF-8 coding */
 1014: /* Not alphanumeric */
 1015: else if (c < CHAR_0 || c > CHAR_z) {}
 1016: else if ((i = escapes[c - CHAR_0]) != 0)
 1017:   { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
 1018: 
 1019: #else           /* EBCDIC coding */
 1020: /* Not alphanumeric */
 1021: else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
 1022: else if ((i = escapes[c - 0x48]) != 0)  { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
 1023: #endif
 1024: 
 1025: /* Escapes that need further processing, or are illegal. */
 1026: 
 1027: else
 1028:   {
 1029:   const pcre_uchar *oldptr;
 1030:   BOOL braced, negated, overflow;
 1031:   int s;
 1032: 
 1033:   switch (c)
 1034:     {
 1035:     /* A number of Perl escapes are not handled by PCRE. We give an explicit
 1036:     error. */
 1037: 
 1038:     case CHAR_l:
 1039:     case CHAR_L:
 1040:     *errorcodeptr = ERR37;
 1041:     break;
 1042: 
 1043:     case CHAR_u:
 1044:     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
 1045:       {
 1046:       /* In JavaScript, \u must be followed by four hexadecimal numbers.
 1047:       Otherwise it is a lowercase u letter. */
 1048:       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
 1049:         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
 1050:         && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
 1051:         && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
 1052:         {
 1053:         c = 0;
 1054:         for (i = 0; i < 4; ++i)
 1055:           {
 1056:           register pcre_uint32 cc = *(++ptr);
 1057: #ifndef EBCDIC  /* ASCII/UTF-8 coding */
 1058:           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
 1059:           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
 1060: #else           /* EBCDIC coding */
 1061:           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
 1062:           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
 1063: #endif
 1064:           }
 1065: 
 1066: #if defined COMPILE_PCRE8
 1067:         if (c > (utf ? 0x10ffffU : 0xffU))
 1068: #elif defined COMPILE_PCRE16
 1069:         if (c > (utf ? 0x10ffffU : 0xffffU))
 1070: #elif defined COMPILE_PCRE32
 1071:         if (utf && c > 0x10ffffU)
 1072: #endif
 1073:           {
 1074:           *errorcodeptr = ERR76;
 1075:           }
 1076:         else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
 1077:         }
 1078:       }
 1079:     else
 1080:       *errorcodeptr = ERR37;
 1081:     break;
 1082: 
 1083:     case CHAR_U:
 1084:     /* In JavaScript, \U is an uppercase U letter. */
 1085:     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
 1086:     break;
 1087: 
 1088:     /* In a character class, \g is just a literal "g". Outside a character
 1089:     class, \g must be followed by one of a number of specific things:
 1090: 
 1091:     (1) A number, either plain or braced. If positive, it is an absolute
 1092:     backreference. If negative, it is a relative backreference. This is a Perl
 1093:     5.10 feature.
 1094: 
 1095:     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
 1096:     is part of Perl's movement towards a unified syntax for back references. As
 1097:     this is synonymous with \k{name}, we fudge it up by pretending it really
 1098:     was \k.
 1099: 
 1100:     (3) For Oniguruma compatibility we also support \g followed by a name or a
 1101:     number either in angle brackets or in single quotes. However, these are
 1102:     (possibly recursive) subroutine calls, _not_ backreferences. Just return
 1103:     the ESC_g code (cf \k). */
 1104: 
 1105:     case CHAR_g:
 1106:     if (isclass) break;
 1107:     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
 1108:       {
 1109:       escape = ESC_g;
 1110:       break;
 1111:       }
 1112: 
 1113:     /* Handle the Perl-compatible cases */
 1114: 
 1115:     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
 1116:       {
 1117:       const pcre_uchar *p;
 1118:       for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
 1119:         if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
 1120:       if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
 1121:         {
 1122:         escape = ESC_k;
 1123:         break;
 1124:         }
 1125:       braced = TRUE;
 1126:       ptr++;
 1127:       }
 1128:     else braced = FALSE;
 1129: 
 1130:     if (ptr[1] == CHAR_MINUS)
 1131:       {
 1132:       negated = TRUE;
 1133:       ptr++;
 1134:       }
 1135:     else negated = FALSE;
 1136: 
 1137:     /* The integer range is limited by the machine's int representation. */
 1138:     s = 0;
 1139:     overflow = FALSE;
 1140:     while (IS_DIGIT(ptr[1]))
 1141:       {
 1142:       if (s > INT_MAX / 10 - 1) /* Integer overflow */
 1143:         {
 1144:         overflow = TRUE;
 1145:         break;
 1146:         }
 1147:       s = s * 10 + (int)(*(++ptr) - CHAR_0);
 1148:       }
 1149:     if (overflow) /* Integer overflow */
 1150:       {
 1151:       while (IS_DIGIT(ptr[1]))
 1152:         ptr++;
 1153:       *errorcodeptr = ERR61;
 1154:       break;
 1155:       }
 1156: 
 1157:     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
 1158:       {
 1159:       *errorcodeptr = ERR57;
 1160:       break;
 1161:       }
 1162: 
 1163:     if (s == 0)
 1164:       {
 1165:       *errorcodeptr = ERR58;
 1166:       break;
 1167:       }
 1168: 
 1169:     if (negated)
 1170:       {
 1171:       if (s > bracount)
 1172:         {
 1173:         *errorcodeptr = ERR15;
 1174:         break;
 1175:         }
 1176:       s = bracount - (s - 1);
 1177:       }
 1178: 
 1179:     escape = -s;
 1180:     break;
 1181: 
 1182:     /* The handling of escape sequences consisting of a string of digits
 1183:     starting with one that is not zero is not straightforward. Perl has changed
 1184:     over the years. Nowadays \g{} for backreferences and \o{} for octal are
 1185:     recommended to avoid the ambiguities in the old syntax.
 1186: 
 1187:     Outside a character class, the digits are read as a decimal number. If the
 1188:     number is less than 8 (used to be 10), or if there are that many previous
 1189:     extracting left brackets, then it is a back reference. Otherwise, up to
 1190:     three octal digits are read to form an escaped byte. Thus \123 is likely to
 1191:     be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
 1192:     the octal value is greater than 377, the least significant 8 bits are
 1193:     taken. \8 and \9 are treated as the literal characters 8 and 9.
 1194: 
 1195:     Inside a character class, \ followed by a digit is always either a literal
 1196:     8 or 9 or an octal number. */
 1197: 
 1198:     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
 1199:     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
 1200: 
 1201:     if (!isclass)
 1202:       {
 1203:       oldptr = ptr;
 1204:       /* The integer range is limited by the machine's int representation. */
 1205:       s = (int)(c -CHAR_0);
 1206:       overflow = FALSE;
 1207:       while (IS_DIGIT(ptr[1]))
 1208:         {
 1209:         if (s > INT_MAX / 10 - 1) /* Integer overflow */
 1210:           {
 1211:           overflow = TRUE;
 1212:           break;
 1213:           }
 1214:         s = s * 10 + (int)(*(++ptr) - CHAR_0);
 1215:         }
 1216:       if (overflow) /* Integer overflow */
 1217:         {
 1218:         while (IS_DIGIT(ptr[1]))
 1219:           ptr++;
 1220:         *errorcodeptr = ERR61;
 1221:         break;
 1222:         }
 1223:       if (s < 8 || s <= bracount)  /* Check for back reference */
 1224:         {
 1225:         escape = -s;
 1226:         break;
 1227:         }
 1228:       ptr = oldptr;      /* Put the pointer back and fall through */
 1229:       }
 1230: 
 1231:     /* Handle a digit following \ when the number is not a back reference. If
 1232:     the first digit is 8 or 9, Perl used to generate a binary zero byte and
 1233:     then treat the digit as a following literal. At least by Perl 5.18 this
 1234:     changed so as not to insert the binary zero. */
 1235: 
 1236:     if ((c = *ptr) >= CHAR_8) break;
 1237: 
 1238:     /* Fall through with a digit less than 8 */
 1239: 
 1240:     /* \0 always starts an octal number, but we may drop through to here with a
 1241:     larger first octal digit. The original code used just to take the least
 1242:     significant 8 bits of octal numbers (I think this is what early Perls used
 1243:     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
 1244:     but no more than 3 octal digits. */
 1245: 
 1246:     case CHAR_0:
 1247:     c -= CHAR_0;
 1248:     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
 1249:         c = c * 8 + *(++ptr) - CHAR_0;
 1250: #ifdef COMPILE_PCRE8
 1251:     if (!utf && c > 0xff) *errorcodeptr = ERR51;
 1252: #endif
 1253:     break;
 1254: 
 1255:     /* \o is a relatively new Perl feature, supporting a more general way of
 1256:     specifying character codes in octal. The only supported form is \o{ddd}. */
 1257: 
 1258:     case CHAR_o:
 1259:     if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
 1260:       {
 1261:       ptr += 2;
 1262:       c = 0;
 1263:       overflow = FALSE;
 1264:       while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
 1265:         {
 1266:         register pcre_uint32 cc = *ptr++;
 1267:         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
 1268: #ifdef COMPILE_PCRE32
 1269:         if (c >= 0x20000000l) { overflow = TRUE; break; }
 1270: #endif
 1271:         c = (c << 3) + cc - CHAR_0 ;
 1272: #if defined COMPILE_PCRE8
 1273:         if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
 1274: #elif defined COMPILE_PCRE16
 1275:         if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
 1276: #elif defined COMPILE_PCRE32
 1277:         if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
 1278: #endif
 1279:         }
 1280:       if (overflow)
 1281:         {
 1282:         while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
 1283:         *errorcodeptr = ERR34;
 1284:         }
 1285:       else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
 1286:         {
 1287:         if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
 1288:         }
 1289:       else *errorcodeptr = ERR80;
 1290:       }
 1291:     break;
 1292: 
 1293:     /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
 1294:     numbers. Otherwise it is a lowercase x letter. */
 1295: 
 1296:     case CHAR_x:
 1297:     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
 1298:       {
 1299:       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
 1300:         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
 1301:         {
 1302:         c = 0;
 1303:         for (i = 0; i < 2; ++i)
 1304:           {
 1305:           register pcre_uint32 cc = *(++ptr);
 1306: #ifndef EBCDIC  /* ASCII/UTF-8 coding */
 1307:           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
 1308:           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
 1309: #else           /* EBCDIC coding */
 1310:           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
 1311:           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
 1312: #endif
 1313:           }
 1314:         }
 1315:       }    /* End JavaScript handling */
 1316: 
 1317:     /* Handle \x in Perl's style. \x{ddd} is a character number which can be
 1318:     greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
 1319:     digits. If not, { used to be treated as a data character. However, Perl
 1320:     seems to read hex digits up to the first non-such, and ignore the rest, so
 1321:     that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
 1322:     now gives an error. */
 1323: 
 1324:     else
 1325:       {
 1326:       if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
 1327:         {
 1328:         ptr += 2;
 1329:         c = 0;
 1330:         overflow = FALSE;
 1331:         while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
 1332:           {
 1333:           register pcre_uint32 cc = *ptr++;
 1334:           if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
 1335: 
 1336: #ifdef COMPILE_PCRE32
 1337:           if (c >= 0x10000000l) { overflow = TRUE; break; }
 1338: #endif
 1339: 
 1340: #ifndef EBCDIC  /* ASCII/UTF-8 coding */
 1341:           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
 1342:           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
 1343: #else           /* EBCDIC coding */
 1344:           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
 1345:           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
 1346: #endif
 1347: 
 1348: #if defined COMPILE_PCRE8
 1349:           if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
 1350: #elif defined COMPILE_PCRE16
 1351:           if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
 1352: #elif defined COMPILE_PCRE32
 1353:           if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
 1354: #endif
 1355:           }
 1356: 
 1357:         if (overflow)
 1358:           {
 1359:           while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
 1360:           *errorcodeptr = ERR34;
 1361:           }
 1362: 
 1363:         else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
 1364:           {
 1365:           if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
 1366:           }
 1367: 
 1368:         /* If the sequence of hex digits does not end with '}', give an error.
 1369:         We used just to recognize this construct and fall through to the normal
 1370:         \x handling, but nowadays Perl gives an error, which seems much more
 1371:         sensible, so we do too. */
 1372: 
 1373:         else *errorcodeptr = ERR79;
 1374:         }   /* End of \x{} processing */
 1375: 
 1376:       /* Read a single-byte hex-defined char (up to two hex digits after \x) */
 1377: 
 1378:       else
 1379:         {
 1380:         c = 0;
 1381:         while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
 1382:           {
 1383:           pcre_uint32 cc;                          /* Some compilers don't like */
 1384:           cc = *(++ptr);                           /* ++ in initializers */
 1385: #ifndef EBCDIC  /* ASCII/UTF-8 coding */
 1386:           if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
 1387:           c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
 1388: #else           /* EBCDIC coding */
 1389:           if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
 1390:           c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
 1391: #endif
 1392:           }
 1393:         }     /* End of \xdd handling */
 1394:       }       /* End of Perl-style \x handling */
 1395:     break;
 1396: 
 1397:     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
 1398:     An error is given if the byte following \c is not an ASCII character. This
 1399:     coding is ASCII-specific, but then the whole concept of \cx is
 1400:     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
 1401: 
 1402:     case CHAR_c:
 1403:     c = *(++ptr);
 1404:     if (c == CHAR_NULL)
 1405:       {
 1406:       *errorcodeptr = ERR2;
 1407:       break;
 1408:       }
 1409: #ifndef EBCDIC    /* ASCII/UTF-8 coding */
 1410:     if (c > 127)  /* Excludes all non-ASCII in either mode */
 1411:       {
 1412:       *errorcodeptr = ERR68;
 1413:       break;
 1414:       }
 1415:     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
 1416:     c ^= 0x40;
 1417: #else             /* EBCDIC coding */
 1418:     if (c >= CHAR_a && c <= CHAR_z) c += 64;
 1419:     c ^= 0xC0;
 1420: #endif
 1421:     break;
 1422: 
 1423:     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
 1424:     other alphanumeric following \ is an error if PCRE_EXTRA was set;
 1425:     otherwise, for Perl compatibility, it is a literal. This code looks a bit
 1426:     odd, but there used to be some cases other than the default, and there may
 1427:     be again in future, so I haven't "optimized" it. */
 1428: 
 1429:     default:
 1430:     if ((options & PCRE_EXTRA) != 0) switch(c)
 1431:       {
 1432:       default:
 1433:       *errorcodeptr = ERR3;
 1434:       break;
 1435:       }
 1436:     break;
 1437:     }
 1438:   }
 1439: 
 1440: /* Perl supports \N{name} for character names, as well as plain \N for "not
 1441: newline". PCRE does not support \N{name}. However, it does support
 1442: quantification such as \N{2,3}. */
 1443: 
 1444: if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
 1445:      !is_counted_repeat(ptr+2))
 1446:   *errorcodeptr = ERR37;
 1447: 
 1448: /* If PCRE_UCP is set, we change the values for \d etc. */
 1449: 
 1450: if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
 1451:   escape += (ESC_DU - ESC_D);
 1452: 
 1453: /* Set the pointer to the final character before returning. */
 1454: 
 1455: *ptrptr = ptr;
 1456: *chptr = c;
 1457: return escape;
 1458: }
 1459: 
 1460: 
 1461: 
 1462: #ifdef SUPPORT_UCP
 1463: /*************************************************
 1464: *               Handle \P and \p                 *
 1465: *************************************************/
 1466: 
 1467: /* This function is called after \P or \p has been encountered, provided that
 1468: PCRE is compiled with support for Unicode properties. On entry, ptrptr is
 1469: pointing at the P or p. On exit, it is pointing at the final character of the
 1470: escape sequence.
 1471: 
 1472: Argument:
 1473:   ptrptr         points to the pattern position pointer
 1474:   negptr         points to a boolean that is set TRUE for negation else FALSE
 1475:   ptypeptr       points to an unsigned int that is set to the type value
 1476:   pdataptr       points to an unsigned int that is set to the detailed property value
 1477:   errorcodeptr   points to the error code variable
 1478: 
 1479: Returns:         TRUE if the type value was found, or FALSE for an invalid type
 1480: */
 1481: 
 1482: static BOOL
 1483: get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
 1484:   unsigned int *pdataptr, int *errorcodeptr)
 1485: {
 1486: pcre_uchar c;
 1487: int i, bot, top;
 1488: const pcre_uchar *ptr = *ptrptr;
 1489: pcre_uchar name[32];
 1490: 
 1491: c = *(++ptr);
 1492: if (c == CHAR_NULL) goto ERROR_RETURN;
 1493: 
 1494: *negptr = FALSE;
 1495: 
 1496: /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
 1497: negation. */
 1498: 
 1499: if (c == CHAR_LEFT_CURLY_BRACKET)
 1500:   {
 1501:   if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
 1502:     {
 1503:     *negptr = TRUE;
 1504:     ptr++;
 1505:     }
 1506:   for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
 1507:     {
 1508:     c = *(++ptr);
 1509:     if (c == CHAR_NULL) goto ERROR_RETURN;
 1510:     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
 1511:     name[i] = c;
 1512:     }
 1513:   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
 1514:   name[i] = 0;
 1515:   }
 1516: 
 1517: /* Otherwise there is just one following character */
 1518: 
 1519: else
 1520:   {
 1521:   name[0] = c;
 1522:   name[1] = 0;
 1523:   }
 1524: 
 1525: *ptrptr = ptr;
 1526: 
 1527: /* Search for a recognized property name using binary chop */
 1528: 
 1529: bot = 0;
 1530: top = PRIV(utt_size);
 1531: 
 1532: while (bot < top)
 1533:   {
 1534:   int r;
 1535:   i = (bot + top) >> 1;
 1536:   r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
 1537:   if (r == 0)
 1538:     {
 1539:     *ptypeptr = PRIV(utt)[i].type;
 1540:     *pdataptr = PRIV(utt)[i].value;
 1541:     return TRUE;
 1542:     }
 1543:   if (r > 0) bot = i + 1; else top = i;
 1544:   }
 1545: 
 1546: *errorcodeptr = ERR47;
 1547: *ptrptr = ptr;
 1548: return FALSE;
 1549: 
 1550: ERROR_RETURN:
 1551: *errorcodeptr = ERR46;
 1552: *ptrptr = ptr;
 1553: return FALSE;
 1554: }
 1555: #endif
 1556: 
 1557: 
 1558: 
 1559: /*************************************************
 1560: *         Read repeat counts                     *
 1561: *************************************************/
 1562: 
 1563: /* Read an item of the form {n,m} and return the values. This is called only
 1564: after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
 1565: so the syntax is guaranteed to be correct, but we need to check the values.
 1566: 
 1567: Arguments:
 1568:   p              pointer to first char after '{'
 1569:   minp           pointer to int for min
 1570:   maxp           pointer to int for max
 1571:                  returned as -1 if no max
 1572:   errorcodeptr   points to error code variable
 1573: 
 1574: Returns:         pointer to '}' on success;
 1575:                  current ptr on error, with errorcodeptr set non-zero
 1576: */
 1577: 
 1578: static const pcre_uchar *
 1579: read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
 1580: {
 1581: int min = 0;
 1582: int max = -1;
 1583: 
 1584: /* Read the minimum value and do a paranoid check: a negative value indicates
 1585: an integer overflow. */
 1586: 
 1587: while (IS_DIGIT(*p)) min = min * 10 + (int)(*p++ - CHAR_0);
 1588: if (min < 0 || min > 65535)
 1589:   {
 1590:   *errorcodeptr = ERR5;
 1591:   return p;
 1592:   }
 1593: 
 1594: /* Read the maximum value if there is one, and again do a paranoid on its size.
 1595: Also, max must not be less than min. */
 1596: 
 1597: if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
 1598:   {
 1599:   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
 1600:     {
 1601:     max = 0;
 1602:     while(IS_DIGIT(*p)) max = max * 10 + (int)(*p++ - CHAR_0);
 1603:     if (max < 0 || max > 65535)
 1604:       {
 1605:       *errorcodeptr = ERR5;
 1606:       return p;
 1607:       }
 1608:     if (max < min)
 1609:       {
 1610:       *errorcodeptr = ERR4;
 1611:       return p;
 1612:       }
 1613:     }
 1614:   }
 1615: 
 1616: /* Fill in the required variables, and pass back the pointer to the terminating
 1617: '}'. */
 1618: 
 1619: *minp = min;
 1620: *maxp = max;
 1621: return p;
 1622: }
 1623: 
 1624: 
 1625: 
 1626: /*************************************************
 1627: *      Find first significant op code            *
 1628: *************************************************/
 1629: 
 1630: /* This is called by several functions that scan a compiled expression looking
 1631: for a fixed first character, or an anchoring op code etc. It skips over things
 1632: that do not influence this. For some calls, it makes sense to skip negative
 1633: forward and all backward assertions, and also the \b assertion; for others it
 1634: does not.
 1635: 
 1636: Arguments:
 1637:   code         pointer to the start of the group
 1638:   skipassert   TRUE if certain assertions are to be skipped
 1639: 
 1640: Returns:       pointer to the first significant opcode
 1641: */
 1642: 
 1643: static const pcre_uchar*
 1644: first_significant_code(const pcre_uchar *code, BOOL skipassert)
 1645: {
 1646: for (;;)
 1647:   {
 1648:   switch ((int)*code)
 1649:     {
 1650:     case OP_ASSERT_NOT:
 1651:     case OP_ASSERTBACK:
 1652:     case OP_ASSERTBACK_NOT:
 1653:     if (!skipassert) return code;
 1654:     do code += GET(code, 1); while (*code == OP_ALT);
 1655:     code += PRIV(OP_lengths)[*code];
 1656:     break;
 1657: 
 1658:     case OP_WORD_BOUNDARY:
 1659:     case OP_NOT_WORD_BOUNDARY:
 1660:     if (!skipassert) return code;
 1661:     /* Fall through */
 1662: 
 1663:     case OP_CALLOUT:
 1664:     case OP_CREF:
 1665:     case OP_DNCREF:
 1666:     case OP_RREF:
 1667:     case OP_DNRREF:
 1668:     case OP_DEF:
 1669:     code += PRIV(OP_lengths)[*code];
 1670:     break;
 1671: 
 1672:     default:
 1673:     return code;
 1674:     }
 1675:   }
 1676: /* Control never reaches here */
 1677: }
 1678: 
 1679: 
 1680: 
 1681: /*************************************************
 1682: *        Find the fixed length of a branch       *
 1683: *************************************************/
 1684: 
 1685: /* Scan a branch and compute the fixed length of subject that will match it,
 1686: if the length is fixed. This is needed for dealing with backward assertions.
 1687: In UTF8 mode, the result is in characters rather than bytes. The branch is
 1688: temporarily terminated with OP_END when this function is called.
 1689: 
 1690: This function is called when a backward assertion is encountered, so that if it
 1691: fails, the error message can point to the correct place in the pattern.
 1692: However, we cannot do this when the assertion contains subroutine calls,
 1693: because they can be forward references. We solve this by remembering this case
 1694: and doing the check at the end; a flag specifies which mode we are running in.
 1695: 
 1696: Arguments:
 1697:   code     points to the start of the pattern (the bracket)
 1698:   utf      TRUE in UTF-8 / UTF-16 / UTF-32 mode
 1699:   atend    TRUE if called when the pattern is complete
 1700:   cd       the "compile data" structure
 1701: 
 1702: Returns:   the fixed length,
 1703:              or -1 if there is no fixed length,
 1704:              or -2 if \C was encountered (in UTF-8 mode only)
 1705:              or -3 if an OP_RECURSE item was encountered and atend is FALSE
 1706:              or -4 if an unknown opcode was encountered (internal error)
 1707: */
 1708: 
 1709: static int
 1710: find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
 1711: {
 1712: int length = -1;
 1713: 
 1714: register int branchlength = 0;
 1715: register pcre_uchar *cc = code + 1 + LINK_SIZE;
 1716: 
 1717: /* Scan along the opcodes for this branch. If we get to the end of the
 1718: branch, check the length against that of the other branches. */
 1719: 
 1720: for (;;)
 1721:   {
 1722:   int d;
 1723:   pcre_uchar *ce, *cs;
 1724:   register pcre_uchar op = *cc;
 1725: 
 1726:   switch (op)
 1727:     {
 1728:     /* We only need to continue for OP_CBRA (normal capturing bracket) and
 1729:     OP_BRA (normal non-capturing bracket) because the other variants of these
 1730:     opcodes are all concerned with unlimited repeated groups, which of course
 1731:     are not of fixed length. */
 1732: 
 1733:     case OP_CBRA:
 1734:     case OP_BRA:
 1735:     case OP_ONCE:
 1736:     case OP_ONCE_NC:
 1737:     case OP_COND:
 1738:     d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
 1739:     if (d < 0) return d;
 1740:     branchlength += d;
 1741:     do cc += GET(cc, 1); while (*cc == OP_ALT);
 1742:     cc += 1 + LINK_SIZE;
 1743:     break;
 1744: 
 1745:     /* Reached end of a branch; if it's a ket it is the end of a nested call.
 1746:     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
 1747:     an ALT. If it is END it's the end of the outer call. All can be handled by
 1748:     the same code. Note that we must not include the OP_KETRxxx opcodes here,
 1749:     because they all imply an unlimited repeat. */
 1750: 
 1751:     case OP_ALT:
 1752:     case OP_KET:
 1753:     case OP_END:
 1754:     case OP_ACCEPT:
 1755:     case OP_ASSERT_ACCEPT:
 1756:     if (length < 0) length = branchlength;
 1757:       else if (length != branchlength) return -1;
 1758:     if (*cc != OP_ALT) return length;
 1759:     cc += 1 + LINK_SIZE;
 1760:     branchlength = 0;
 1761:     break;
 1762: 
 1763:     /* A true recursion implies not fixed length, but a subroutine call may
 1764:     be OK. If the subroutine is a forward reference, we can't deal with
 1765:     it until the end of the pattern, so return -3. */
 1766: 
 1767:     case OP_RECURSE:
 1768:     if (!atend) return -3;
 1769:     cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
 1770:     do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
 1771:     if (cc > cs && cc < ce) return -1;                    /* Recursion */
 1772:     d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
 1773:     if (d < 0) return d;
 1774:     branchlength += d;
 1775:     cc += 1 + LINK_SIZE;
 1776:     break;
 1777: 
 1778:     /* Skip over assertive subpatterns */
 1779: 
 1780:     case OP_ASSERT:
 1781:     case OP_ASSERT_NOT:
 1782:     case OP_ASSERTBACK:
 1783:     case OP_ASSERTBACK_NOT:
 1784:     do cc += GET(cc, 1); while (*cc == OP_ALT);
 1785:     cc += PRIV(OP_lengths)[*cc];
 1786:     break;
 1787: 
 1788:     /* Skip over things that don't match chars */
 1789: 
 1790:     case OP_MARK:
 1791:     case OP_PRUNE_ARG:
 1792:     case OP_SKIP_ARG:
 1793:     case OP_THEN_ARG:
 1794:     cc += cc[1] + PRIV(OP_lengths)[*cc];
 1795:     break;
 1796: 
 1797:     case OP_CALLOUT:
 1798:     case OP_CIRC:
 1799:     case OP_CIRCM:
 1800:     case OP_CLOSE:
 1801:     case OP_COMMIT:
 1802:     case OP_CREF:
 1803:     case OP_DEF:
 1804:     case OP_DNCREF:
 1805:     case OP_DNRREF:
 1806:     case OP_DOLL:
 1807:     case OP_DOLLM:
 1808:     case OP_EOD:
 1809:     case OP_EODN:
 1810:     case OP_FAIL:
 1811:     case OP_NOT_WORD_BOUNDARY:
 1812:     case OP_PRUNE:
 1813:     case OP_REVERSE:
 1814:     case OP_RREF:
 1815:     case OP_SET_SOM:
 1816:     case OP_SKIP:
 1817:     case OP_SOD:
 1818:     case OP_SOM:
 1819:     case OP_THEN:
 1820:     case OP_WORD_BOUNDARY:
 1821:     cc += PRIV(OP_lengths)[*cc];
 1822:     break;
 1823: 
 1824:     /* Handle literal characters */
 1825: 
 1826:     case OP_CHAR:
 1827:     case OP_CHARI:
 1828:     case OP_NOT:
 1829:     case OP_NOTI:
 1830:     branchlength++;
 1831:     cc += 2;
 1832: #ifdef SUPPORT_UTF
 1833:     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 1834: #endif
 1835:     break;
 1836: 
 1837:     /* Handle exact repetitions. The count is already in characters, but we
 1838:     need to skip over a multibyte character in UTF8 mode.  */
 1839: 
 1840:     case OP_EXACT:
 1841:     case OP_EXACTI:
 1842:     case OP_NOTEXACT:
 1843:     case OP_NOTEXACTI:
 1844:     branchlength += (int)GET2(cc,1);
 1845:     cc += 2 + IMM2_SIZE;
 1846: #ifdef SUPPORT_UTF
 1847:     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 1848: #endif
 1849:     break;
 1850: 
 1851:     case OP_TYPEEXACT:
 1852:     branchlength += GET2(cc,1);
 1853:     if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
 1854:       cc += 2;
 1855:     cc += 1 + IMM2_SIZE + 1;
 1856:     break;
 1857: 
 1858:     /* Handle single-char matchers */
 1859: 
 1860:     case OP_PROP:
 1861:     case OP_NOTPROP:
 1862:     cc += 2;
 1863:     /* Fall through */
 1864: 
 1865:     case OP_HSPACE:
 1866:     case OP_VSPACE:
 1867:     case OP_NOT_HSPACE:
 1868:     case OP_NOT_VSPACE:
 1869:     case OP_NOT_DIGIT:
 1870:     case OP_DIGIT:
 1871:     case OP_NOT_WHITESPACE:
 1872:     case OP_WHITESPACE:
 1873:     case OP_NOT_WORDCHAR:
 1874:     case OP_WORDCHAR:
 1875:     case OP_ANY:
 1876:     case OP_ALLANY:
 1877:     branchlength++;
 1878:     cc++;
 1879:     break;
 1880: 
 1881:     /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
 1882:     otherwise \C is coded as OP_ALLANY. */
 1883: 
 1884:     case OP_ANYBYTE:
 1885:     return -2;
 1886: 
 1887:     /* Check a class for variable quantification */
 1888: 
 1889:     case OP_CLASS:
 1890:     case OP_NCLASS:
 1891: #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
 1892:     case OP_XCLASS:
 1893:     /* The original code caused an unsigned overflow in 64 bit systems,
 1894:     so now we use a conditional statement. */
 1895:     if (op == OP_XCLASS)
 1896:       cc += GET(cc, 1);
 1897:     else
 1898:       cc += PRIV(OP_lengths)[OP_CLASS];
 1899: #else
 1900:     cc += PRIV(OP_lengths)[OP_CLASS];
 1901: #endif
 1902: 
 1903:     switch (*cc)
 1904:       {
 1905:       case OP_CRSTAR:
 1906:       case OP_CRMINSTAR:
 1907:       case OP_CRPLUS:
 1908:       case OP_CRMINPLUS:
 1909:       case OP_CRQUERY:
 1910:       case OP_CRMINQUERY:
 1911:       case OP_CRPOSSTAR:
 1912:       case OP_CRPOSPLUS:
 1913:       case OP_CRPOSQUERY:
 1914:       return -1;
 1915: 
 1916:       case OP_CRRANGE:
 1917:       case OP_CRMINRANGE:
 1918:       case OP_CRPOSRANGE:
 1919:       if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
 1920:       branchlength += (int)GET2(cc,1);
 1921:       cc += 1 + 2 * IMM2_SIZE;
 1922:       break;
 1923: 
 1924:       default:
 1925:       branchlength++;
 1926:       }
 1927:     break;
 1928: 
 1929:     /* Anything else is variable length */
 1930: 
 1931:     case OP_ANYNL:
 1932:     case OP_BRAMINZERO:
 1933:     case OP_BRAPOS:
 1934:     case OP_BRAPOSZERO:
 1935:     case OP_BRAZERO:
 1936:     case OP_CBRAPOS:
 1937:     case OP_EXTUNI:
 1938:     case OP_KETRMAX:
 1939:     case OP_KETRMIN:
 1940:     case OP_KETRPOS:
 1941:     case OP_MINPLUS:
 1942:     case OP_MINPLUSI:
 1943:     case OP_MINQUERY:
 1944:     case OP_MINQUERYI:
 1945:     case OP_MINSTAR:
 1946:     case OP_MINSTARI:
 1947:     case OP_MINUPTO:
 1948:     case OP_MINUPTOI:
 1949:     case OP_NOTMINPLUS:
 1950:     case OP_NOTMINPLUSI:
 1951:     case OP_NOTMINQUERY:
 1952:     case OP_NOTMINQUERYI:
 1953:     case OP_NOTMINSTAR:
 1954:     case OP_NOTMINSTARI:
 1955:     case OP_NOTMINUPTO:
 1956:     case OP_NOTMINUPTOI:
 1957:     case OP_NOTPLUS:
 1958:     case OP_NOTPLUSI:
 1959:     case OP_NOTPOSPLUS:
 1960:     case OP_NOTPOSPLUSI:
 1961:     case OP_NOTPOSQUERY:
 1962:     case OP_NOTPOSQUERYI:
 1963:     case OP_NOTPOSSTAR:
 1964:     case OP_NOTPOSSTARI:
 1965:     case OP_NOTPOSUPTO:
 1966:     case OP_NOTPOSUPTOI:
 1967:     case OP_NOTQUERY:
 1968:     case OP_NOTQUERYI:
 1969:     case OP_NOTSTAR:
 1970:     case OP_NOTSTARI:
 1971:     case OP_NOTUPTO:
 1972:     case OP_NOTUPTOI:
 1973:     case OP_PLUS:
 1974:     case OP_PLUSI:
 1975:     case OP_POSPLUS:
 1976:     case OP_POSPLUSI:
 1977:     case OP_POSQUERY:
 1978:     case OP_POSQUERYI:
 1979:     case OP_POSSTAR:
 1980:     case OP_POSSTARI:
 1981:     case OP_POSUPTO:
 1982:     case OP_POSUPTOI:
 1983:     case OP_QUERY:
 1984:     case OP_QUERYI:
 1985:     case OP_REF:
 1986:     case OP_REFI:
 1987:     case OP_DNREF:
 1988:     case OP_DNREFI:
 1989:     case OP_SBRA:
 1990:     case OP_SBRAPOS:
 1991:     case OP_SCBRA:
 1992:     case OP_SCBRAPOS:
 1993:     case OP_SCOND:
 1994:     case OP_SKIPZERO:
 1995:     case OP_STAR:
 1996:     case OP_STARI:
 1997:     case OP_TYPEMINPLUS:
 1998:     case OP_TYPEMINQUERY:
 1999:     case OP_TYPEMINSTAR:
 2000:     case OP_TYPEMINUPTO:
 2001:     case OP_TYPEPLUS:
 2002:     case OP_TYPEPOSPLUS:
 2003:     case OP_TYPEPOSQUERY:
 2004:     case OP_TYPEPOSSTAR:
 2005:     case OP_TYPEPOSUPTO:
 2006:     case OP_TYPEQUERY:
 2007:     case OP_TYPESTAR:
 2008:     case OP_TYPEUPTO:
 2009:     case OP_UPTO:
 2010:     case OP_UPTOI:
 2011:     return -1;
 2012: 
 2013:     /* Catch unrecognized opcodes so that when new ones are added they
 2014:     are not forgotten, as has happened in the past. */
 2015: 
 2016:     default:
 2017:     return -4;
 2018:     }
 2019:   }
 2020: /* Control never gets here */
 2021: }
 2022: 
 2023: 
 2024: 
 2025: /*************************************************
 2026: *    Scan compiled regex for specific bracket    *
 2027: *************************************************/
 2028: 
 2029: /* This little function scans through a compiled pattern until it finds a
 2030: capturing bracket with the given number, or, if the number is negative, an
 2031: instance of OP_REVERSE for a lookbehind. The function is global in the C sense
 2032: so that it can be called from pcre_study() when finding the minimum matching
 2033: length.
 2034: 
 2035: Arguments:
 2036:   code        points to start of expression
 2037:   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
 2038:   number      the required bracket number or negative to find a lookbehind
 2039: 
 2040: Returns:      pointer to the opcode for the bracket, or NULL if not found
 2041: */
 2042: 
 2043: const pcre_uchar *
 2044: PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
 2045: {
 2046: for (;;)
 2047:   {
 2048:   register pcre_uchar c = *code;
 2049: 
 2050:   if (c == OP_END) return NULL;
 2051: 
 2052:   /* XCLASS is used for classes that cannot be represented just by a bit
 2053:   map. This includes negated single high-valued characters. The length in
 2054:   the table is zero; the actual length is stored in the compiled code. */
 2055: 
 2056:   if (c == OP_XCLASS) code += GET(code, 1);
 2057: 
 2058:   /* Handle recursion */
 2059: 
 2060:   else if (c == OP_REVERSE)
 2061:     {
 2062:     if (number < 0) return (pcre_uchar *)code;
 2063:     code += PRIV(OP_lengths)[c];
 2064:     }
 2065: 
 2066:   /* Handle capturing bracket */
 2067: 
 2068:   else if (c == OP_CBRA || c == OP_SCBRA ||
 2069:            c == OP_CBRAPOS || c == OP_SCBRAPOS)
 2070:     {
 2071:     int n = (int)GET2(code, 1+LINK_SIZE);
 2072:     if (n == number) return (pcre_uchar *)code;
 2073:     code += PRIV(OP_lengths)[c];
 2074:     }
 2075: 
 2076:   /* Otherwise, we can get the item's length from the table, except that for
 2077:   repeated character types, we have to test for \p and \P, which have an extra
 2078:   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
 2079:   must add in its length. */
 2080: 
 2081:   else
 2082:     {
 2083:     switch(c)
 2084:       {
 2085:       case OP_TYPESTAR:
 2086:       case OP_TYPEMINSTAR:
 2087:       case OP_TYPEPLUS:
 2088:       case OP_TYPEMINPLUS:
 2089:       case OP_TYPEQUERY:
 2090:       case OP_TYPEMINQUERY:
 2091:       case OP_TYPEPOSSTAR:
 2092:       case OP_TYPEPOSPLUS:
 2093:       case OP_TYPEPOSQUERY:
 2094:       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
 2095:       break;
 2096: 
 2097:       case OP_TYPEUPTO:
 2098:       case OP_TYPEMINUPTO:
 2099:       case OP_TYPEEXACT:
 2100:       case OP_TYPEPOSUPTO:
 2101:       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
 2102:         code += 2;
 2103:       break;
 2104: 
 2105:       case OP_MARK:
 2106:       case OP_PRUNE_ARG:
 2107:       case OP_SKIP_ARG:
 2108:       case OP_THEN_ARG:
 2109:       code += code[1];
 2110:       break;
 2111:       }
 2112: 
 2113:     /* Add in the fixed length from the table */
 2114: 
 2115:     code += PRIV(OP_lengths)[c];
 2116: 
 2117:   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
 2118:   a multi-byte character. The length in the table is a minimum, so we have to
 2119:   arrange to skip the extra bytes. */
 2120: 
 2121: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
 2122:     if (utf) switch(c)
 2123:       {
 2124:       case OP_CHAR:
 2125:       case OP_CHARI:
 2126:       case OP_EXACT:
 2127:       case OP_EXACTI:
 2128:       case OP_UPTO:
 2129:       case OP_UPTOI:
 2130:       case OP_MINUPTO:
 2131:       case OP_MINUPTOI:
 2132:       case OP_POSUPTO:
 2133:       case OP_POSUPTOI:
 2134:       case OP_STAR:
 2135:       case OP_STARI:
 2136:       case OP_MINSTAR:
 2137:       case OP_MINSTARI:
 2138:       case OP_POSSTAR:
 2139:       case OP_POSSTARI:
 2140:       case OP_PLUS:
 2141:       case OP_PLUSI:
 2142:       case OP_MINPLUS:
 2143:       case OP_MINPLUSI:
 2144:       case OP_POSPLUS:
 2145:       case OP_POSPLUSI:
 2146:       case OP_QUERY:
 2147:       case OP_QUERYI:
 2148:       case OP_MINQUERY:
 2149:       case OP_MINQUERYI:
 2150:       case OP_POSQUERY:
 2151:       case OP_POSQUERYI:
 2152:       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
 2153:       break;
 2154:       }
 2155: #else
 2156:     (void)(utf);  /* Keep compiler happy by referencing function argument */
 2157: #endif
 2158:     }
 2159:   }
 2160: }
 2161: 
 2162: 
 2163: 
 2164: /*************************************************
 2165: *   Scan compiled regex for recursion reference  *
 2166: *************************************************/
 2167: 
 2168: /* This little function scans through a compiled pattern until it finds an
 2169: instance of OP_RECURSE.
 2170: 
 2171: Arguments:
 2172:   code        points to start of expression
 2173:   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
 2174: 
 2175: Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
 2176: */
 2177: 
 2178: static const pcre_uchar *
 2179: find_recurse(const pcre_uchar *code, BOOL utf)
 2180: {
 2181: for (;;)
 2182:   {
 2183:   register pcre_uchar c = *code;
 2184:   if (c == OP_END) return NULL;
 2185:   if (c == OP_RECURSE) return code;
 2186: 
 2187:   /* XCLASS is used for classes that cannot be represented just by a bit
 2188:   map. This includes negated single high-valued characters. The length in
 2189:   the table is zero; the actual length is stored in the compiled code. */
 2190: 
 2191:   if (c == OP_XCLASS) code += GET(code, 1);
 2192: 
 2193:   /* Otherwise, we can get the item's length from the table, except that for
 2194:   repeated character types, we have to test for \p and \P, which have an extra
 2195:   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
 2196:   must add in its length. */
 2197: 
 2198:   else
 2199:     {
 2200:     switch(c)
 2201:       {
 2202:       case OP_TYPESTAR:
 2203:       case OP_TYPEMINSTAR:
 2204:       case OP_TYPEPLUS:
 2205:       case OP_TYPEMINPLUS:
 2206:       case OP_TYPEQUERY:
 2207:       case OP_TYPEMINQUERY:
 2208:       case OP_TYPEPOSSTAR:
 2209:       case OP_TYPEPOSPLUS:
 2210:       case OP_TYPEPOSQUERY:
 2211:       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
 2212:       break;
 2213: 
 2214:       case OP_TYPEPOSUPTO:
 2215:       case OP_TYPEUPTO:
 2216:       case OP_TYPEMINUPTO:
 2217:       case OP_TYPEEXACT:
 2218:       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
 2219:         code += 2;
 2220:       break;
 2221: 
 2222:       case OP_MARK:
 2223:       case OP_PRUNE_ARG:
 2224:       case OP_SKIP_ARG:
 2225:       case OP_THEN_ARG:
 2226:       code += code[1];
 2227:       break;
 2228:       }
 2229: 
 2230:     /* Add in the fixed length from the table */
 2231: 
 2232:     code += PRIV(OP_lengths)[c];
 2233: 
 2234:     /* In UTF-8 mode, opcodes that are followed by a character may be followed
 2235:     by a multi-byte character. The length in the table is a minimum, so we have
 2236:     to arrange to skip the extra bytes. */
 2237: 
 2238: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
 2239:     if (utf) switch(c)
 2240:       {
 2241:       case OP_CHAR:
 2242:       case OP_CHARI:
 2243:       case OP_NOT:
 2244:       case OP_NOTI:
 2245:       case OP_EXACT:
 2246:       case OP_EXACTI:
 2247:       case OP_NOTEXACT:
 2248:       case OP_NOTEXACTI:
 2249:       case OP_UPTO:
 2250:       case OP_UPTOI:
 2251:       case OP_NOTUPTO:
 2252:       case OP_NOTUPTOI:
 2253:       case OP_MINUPTO:
 2254:       case OP_MINUPTOI:
 2255:       case OP_NOTMINUPTO:
 2256:       case OP_NOTMINUPTOI:
 2257:       case OP_POSUPTO:
 2258:       case OP_POSUPTOI:
 2259:       case OP_NOTPOSUPTO:
 2260:       case OP_NOTPOSUPTOI:
 2261:       case OP_STAR:
 2262:       case OP_STARI:
 2263:       case OP_NOTSTAR:
 2264:       case OP_NOTSTARI:
 2265:       case OP_MINSTAR:
 2266:       case OP_MINSTARI:
 2267:       case OP_NOTMINSTAR:
 2268:       case OP_NOTMINSTARI:
 2269:       case OP_POSSTAR:
 2270:       case OP_POSSTARI:
 2271:       case OP_NOTPOSSTAR:
 2272:       case OP_NOTPOSSTARI:
 2273:       case OP_PLUS:
 2274:       case OP_PLUSI:
 2275:       case OP_NOTPLUS:
 2276:       case OP_NOTPLUSI:
 2277:       case OP_MINPLUS:
 2278:       case OP_MINPLUSI:
 2279:       case OP_NOTMINPLUS:
 2280:       case OP_NOTMINPLUSI:
 2281:       case OP_POSPLUS:
 2282:       case OP_POSPLUSI:
 2283:       case OP_NOTPOSPLUS:
 2284:       case OP_NOTPOSPLUSI:
 2285:       case OP_QUERY:
 2286:       case OP_QUERYI:
 2287:       case OP_NOTQUERY:
 2288:       case OP_NOTQUERYI:
 2289:       case OP_MINQUERY:
 2290:       case OP_MINQUERYI:
 2291:       case OP_NOTMINQUERY:
 2292:       case OP_NOTMINQUERYI:
 2293:       case OP_POSQUERY:
 2294:       case OP_POSQUERYI:
 2295:       case OP_NOTPOSQUERY:
 2296:       case OP_NOTPOSQUERYI:
 2297:       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
 2298:       break;
 2299:       }
 2300: #else
 2301:     (void)(utf);  /* Keep compiler happy by referencing function argument */
 2302: #endif
 2303:     }
 2304:   }
 2305: }
 2306: 
 2307: 
 2308: 
 2309: /*************************************************
 2310: *    Scan compiled branch for non-emptiness      *
 2311: *************************************************/
 2312: 
 2313: /* This function scans through a branch of a compiled pattern to see whether it
 2314: can match the empty string or not. It is called from could_be_empty()
 2315: below and from compile_branch() when checking for an unlimited repeat of a
 2316: group that can match nothing. Note that first_significant_code() skips over
 2317: backward and negative forward assertions when its final argument is TRUE. If we
 2318: hit an unclosed bracket, we return "empty" - this means we've struck an inner
 2319: bracket whose current branch will already have been scanned.
 2320: 
 2321: Arguments:
 2322:   code        points to start of search
 2323:   endcode     points to where to stop
 2324:   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
 2325:   cd          contains pointers to tables etc.
 2326:   recurses    chain of recurse_check to catch mutual recursion
 2327: 
 2328: Returns:      TRUE if what is matched could be empty
 2329: */
 2330: 
 2331: typedef struct recurse_check {
 2332:   struct recurse_check *prev;
 2333:   const pcre_uchar *group;
 2334: } recurse_check;
 2335: 
 2336: static BOOL
 2337: could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
 2338:   BOOL utf, compile_data *cd, recurse_check *recurses)
 2339: {
 2340: register pcre_uchar c;
 2341: recurse_check this_recurse;
 2342: 
 2343: for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
 2344:      code < endcode;
 2345:      code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
 2346:   {
 2347:   const pcre_uchar *ccode;
 2348: 
 2349:   c = *code;
 2350: 
 2351:   /* Skip over forward assertions; the other assertions are skipped by
 2352:   first_significant_code() with a TRUE final argument. */
 2353: 
 2354:   if (c == OP_ASSERT)
 2355:     {
 2356:     do code += GET(code, 1); while (*code == OP_ALT);
 2357:     c = *code;
 2358:     continue;
 2359:     }
 2360: 
 2361:   /* For a recursion/subroutine call, if its end has been reached, which
 2362:   implies a backward reference subroutine call, we can scan it. If it's a
 2363:   forward reference subroutine call, we can't. To detect forward reference
 2364:   we have to scan up the list that is kept in the workspace. This function is
 2365:   called only when doing the real compile, not during the pre-compile that
 2366:   measures the size of the compiled pattern. */
 2367: 
 2368:   if (c == OP_RECURSE)
 2369:     {
 2370:     const pcre_uchar *scode = cd->start_code + GET(code, 1);
 2371:     BOOL empty_branch;
 2372: 
 2373:     /* Test for forward reference or uncompleted reference. This is disabled
 2374:     when called to scan a completed pattern by setting cd->start_workspace to
 2375:     NULL. */
 2376: 
 2377:     if (cd->start_workspace != NULL)
 2378:       {
 2379:       const pcre_uchar *tcode;
 2380:       for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
 2381:         if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
 2382:       if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
 2383:       }
 2384: 
 2385:     /* If we are scanning a completed pattern, there are no forward references
 2386:     and all groups are complete. We need to detect whether this is a recursive
 2387:     call, as otherwise there will be an infinite loop. If it is a recursion,
 2388:     just skip over it. Simple recursions are easily detected. For mutual
 2389:     recursions we keep a chain on the stack. */
 2390: 
 2391:     else
 2392:       {
 2393:       recurse_check *r = recurses;
 2394:       const pcre_uchar *endgroup = scode;
 2395: 
 2396:       do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
 2397:       if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
 2398: 
 2399:       for (r = recurses; r != NULL; r = r->prev)
 2400:         if (r->group == scode) break;
 2401:       if (r != NULL) continue;   /* Mutual recursion */
 2402:       }
 2403: 
 2404:     /* Completed reference; scan the referenced group, remembering it on the
 2405:     stack chain to detect mutual recursions. */
 2406: 
 2407:     empty_branch = FALSE;
 2408:     this_recurse.prev = recurses;
 2409:     this_recurse.group = scode;
 2410: 
 2411:     do
 2412:       {
 2413:       if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
 2414:         {
 2415:         empty_branch = TRUE;
 2416:         break;
 2417:         }
 2418:       scode += GET(scode, 1);
 2419:       }
 2420:     while (*scode == OP_ALT);
 2421: 
 2422:     if (!empty_branch) return FALSE;  /* All branches are non-empty */
 2423:     continue;
 2424:     }
 2425: 
 2426:   /* Groups with zero repeats can of course be empty; skip them. */
 2427: 
 2428:   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
 2429:       c == OP_BRAPOSZERO)
 2430:     {
 2431:     code += PRIV(OP_lengths)[c];
 2432:     do code += GET(code, 1); while (*code == OP_ALT);
 2433:     c = *code;
 2434:     continue;
 2435:     }
 2436: 
 2437:   /* A nested group that is already marked as "could be empty" can just be
 2438:   skipped. */
 2439: 
 2440:   if (c == OP_SBRA  || c == OP_SBRAPOS ||
 2441:       c == OP_SCBRA || c == OP_SCBRAPOS)
 2442:     {
 2443:     do code += GET(code, 1); while (*code == OP_ALT);
 2444:     c = *code;
 2445:     continue;
 2446:     }
 2447: 
 2448:   /* For other groups, scan the branches. */
 2449: 
 2450:   if (c == OP_BRA  || c == OP_BRAPOS ||
 2451:       c == OP_CBRA || c == OP_CBRAPOS ||
 2452:       c == OP_ONCE || c == OP_ONCE_NC ||
 2453:       c == OP_COND)
 2454:     {
 2455:     BOOL empty_branch;
 2456:     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
 2457: 
 2458:     /* If a conditional group has only one branch, there is a second, implied,
 2459:     empty branch, so just skip over the conditional, because it could be empty.
 2460:     Otherwise, scan the individual branches of the group. */
 2461: 
 2462:     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
 2463:       code += GET(code, 1);
 2464:     else
 2465:       {
 2466:       empty_branch = FALSE;
 2467:       do
 2468:         {
 2469:         if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, NULL))
 2470:           empty_branch = TRUE;
 2471:         code += GET(code, 1);
 2472:         }
 2473:       while (*code == OP_ALT);
 2474:       if (!empty_branch) return FALSE;   /* All branches are non-empty */
 2475:       }
 2476: 
 2477:     c = *code;
 2478:     continue;
 2479:     }
 2480: 
 2481:   /* Handle the other opcodes */
 2482: 
 2483:   switch (c)
 2484:     {
 2485:     /* Check for quantifiers after a class. XCLASS is used for classes that
 2486:     cannot be represented just by a bit map. This includes negated single
 2487:     high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
 2488:     actual length is stored in the compiled code, so we must update "code"
 2489:     here. */
 2490: 
 2491: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 2492:     case OP_XCLASS:
 2493:     ccode = code += GET(code, 1);
 2494:     goto CHECK_CLASS_REPEAT;
 2495: #endif
 2496: 
 2497:     case OP_CLASS:
 2498:     case OP_NCLASS:
 2499:     ccode = code + PRIV(OP_lengths)[OP_CLASS];
 2500: 
 2501: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 2502:     CHECK_CLASS_REPEAT:
 2503: #endif
 2504: 
 2505:     switch (*ccode)
 2506:       {
 2507:       case OP_CRSTAR:            /* These could be empty; continue */
 2508:       case OP_CRMINSTAR:
 2509:       case OP_CRQUERY:
 2510:       case OP_CRMINQUERY:
 2511:       case OP_CRPOSSTAR:
 2512:       case OP_CRPOSQUERY:
 2513:       break;
 2514: 
 2515:       default:                   /* Non-repeat => class must match */
 2516:       case OP_CRPLUS:            /* These repeats aren't empty */
 2517:       case OP_CRMINPLUS:
 2518:       case OP_CRPOSPLUS:
 2519:       return FALSE;
 2520: 
 2521:       case OP_CRRANGE:
 2522:       case OP_CRMINRANGE:
 2523:       case OP_CRPOSRANGE:
 2524:       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
 2525:       break;
 2526:       }
 2527:     break;
 2528: 
 2529:     /* Opcodes that must match a character */
 2530: 
 2531:     case OP_ANY:
 2532:     case OP_ALLANY:
 2533:     case OP_ANYBYTE:
 2534: 
 2535:     case OP_PROP:
 2536:     case OP_NOTPROP:
 2537:     case OP_ANYNL:
 2538: 
 2539:     case OP_NOT_HSPACE:
 2540:     case OP_HSPACE:
 2541:     case OP_NOT_VSPACE:
 2542:     case OP_VSPACE:
 2543:     case OP_EXTUNI:
 2544: 
 2545:     case OP_NOT_DIGIT:
 2546:     case OP_DIGIT:
 2547:     case OP_NOT_WHITESPACE:
 2548:     case OP_WHITESPACE:
 2549:     case OP_NOT_WORDCHAR:
 2550:     case OP_WORDCHAR:
 2551: 
 2552:     case OP_CHAR:
 2553:     case OP_CHARI:
 2554:     case OP_NOT:
 2555:     case OP_NOTI:
 2556: 
 2557:     case OP_PLUS:
 2558:     case OP_PLUSI:
 2559:     case OP_MINPLUS:
 2560:     case OP_MINPLUSI:
 2561: 
 2562:     case OP_NOTPLUS:
 2563:     case OP_NOTPLUSI:
 2564:     case OP_NOTMINPLUS:
 2565:     case OP_NOTMINPLUSI:
 2566: 
 2567:     case OP_POSPLUS:
 2568:     case OP_POSPLUSI:
 2569:     case OP_NOTPOSPLUS:
 2570:     case OP_NOTPOSPLUSI:
 2571: 
 2572:     case OP_EXACT:
 2573:     case OP_EXACTI:
 2574:     case OP_NOTEXACT:
 2575:     case OP_NOTEXACTI:
 2576: 
 2577:     case OP_TYPEPLUS:
 2578:     case OP_TYPEMINPLUS:
 2579:     case OP_TYPEPOSPLUS:
 2580:     case OP_TYPEEXACT:
 2581: 
 2582:     return FALSE;
 2583: 
 2584:     /* These are going to continue, as they may be empty, but we have to
 2585:     fudge the length for the \p and \P cases. */
 2586: 
 2587:     case OP_TYPESTAR:
 2588:     case OP_TYPEMINSTAR:
 2589:     case OP_TYPEPOSSTAR:
 2590:     case OP_TYPEQUERY:
 2591:     case OP_TYPEMINQUERY:
 2592:     case OP_TYPEPOSQUERY:
 2593:     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
 2594:     break;
 2595: 
 2596:     /* Same for these */
 2597: 
 2598:     case OP_TYPEUPTO:
 2599:     case OP_TYPEMINUPTO:
 2600:     case OP_TYPEPOSUPTO:
 2601:     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
 2602:       code += 2;
 2603:     break;
 2604: 
 2605:     /* End of branch */
 2606: 
 2607:     case OP_KET:
 2608:     case OP_KETRMAX:
 2609:     case OP_KETRMIN:
 2610:     case OP_KETRPOS:
 2611:     case OP_ALT:
 2612:     return TRUE;
 2613: 
 2614:     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
 2615:     MINUPTO, and POSUPTO and their caseless and negative versions may be
 2616:     followed by a multibyte character. */
 2617: 
 2618: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
 2619:     case OP_STAR:
 2620:     case OP_STARI:
 2621:     case OP_NOTSTAR:
 2622:     case OP_NOTSTARI:
 2623: 
 2624:     case OP_MINSTAR:
 2625:     case OP_MINSTARI:
 2626:     case OP_NOTMINSTAR:
 2627:     case OP_NOTMINSTARI:
 2628: 
 2629:     case OP_POSSTAR:
 2630:     case OP_POSSTARI:
 2631:     case OP_NOTPOSSTAR:
 2632:     case OP_NOTPOSSTARI:
 2633: 
 2634:     case OP_QUERY:
 2635:     case OP_QUERYI:
 2636:     case OP_NOTQUERY:
 2637:     case OP_NOTQUERYI:
 2638: 
 2639:     case OP_MINQUERY:
 2640:     case OP_MINQUERYI:
 2641:     case OP_NOTMINQUERY:
 2642:     case OP_NOTMINQUERYI:
 2643: 
 2644:     case OP_POSQUERY:
 2645:     case OP_POSQUERYI:
 2646:     case OP_NOTPOSQUERY:
 2647:     case OP_NOTPOSQUERYI:
 2648: 
 2649:     if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
 2650:     break;
 2651: 
 2652:     case OP_UPTO:
 2653:     case OP_UPTOI:
 2654:     case OP_NOTUPTO:
 2655:     case OP_NOTUPTOI:
 2656: 
 2657:     case OP_MINUPTO:
 2658:     case OP_MINUPTOI:
 2659:     case OP_NOTMINUPTO:
 2660:     case OP_NOTMINUPTOI:
 2661: 
 2662:     case OP_POSUPTO:
 2663:     case OP_POSUPTOI:
 2664:     case OP_NOTPOSUPTO:
 2665:     case OP_NOTPOSUPTOI:
 2666: 
 2667:     if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
 2668:     break;
 2669: #endif
 2670: 
 2671:     /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
 2672:     string. */
 2673: 
 2674:     case OP_MARK:
 2675:     case OP_PRUNE_ARG:
 2676:     case OP_SKIP_ARG:
 2677:     case OP_THEN_ARG:
 2678:     code += code[1];
 2679:     break;
 2680: 
 2681:     /* None of the remaining opcodes are required to match a character. */
 2682: 
 2683:     default:
 2684:     break;
 2685:     }
 2686:   }
 2687: 
 2688: return TRUE;
 2689: }
 2690: 
 2691: 
 2692: 
 2693: /*************************************************
 2694: *    Scan compiled regex for non-emptiness       *
 2695: *************************************************/
 2696: 
 2697: /* This function is called to check for left recursive calls. We want to check
 2698: the current branch of the current pattern to see if it could match the empty
 2699: string. If it could, we must look outwards for branches at other levels,
 2700: stopping when we pass beyond the bracket which is the subject of the recursion.
 2701: This function is called only during the real compile, not during the
 2702: pre-compile.
 2703: 
 2704: Arguments:
 2705:   code        points to start of the recursion
 2706:   endcode     points to where to stop (current RECURSE item)
 2707:   bcptr       points to the chain of current (unclosed) branch starts
 2708:   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
 2709:   cd          pointers to tables etc
 2710: 
 2711: Returns:      TRUE if what is matched could be empty
 2712: */
 2713: 
 2714: static BOOL
 2715: could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
 2716:   branch_chain *bcptr, BOOL utf, compile_data *cd)
 2717: {
 2718: while (bcptr != NULL && bcptr->current_branch >= code)
 2719:   {
 2720:   if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
 2721:     return FALSE;
 2722:   bcptr = bcptr->outer;
 2723:   }
 2724: return TRUE;
 2725: }
 2726: 
 2727: 
 2728: 
 2729: /*************************************************
 2730: *        Base opcode of repeated opcodes         *
 2731: *************************************************/
 2732: 
 2733: /* Returns the base opcode for repeated single character type opcodes. If the
 2734: opcode is not a repeated character type, it returns with the original value.
 2735: 
 2736: Arguments:  c opcode
 2737: Returns:    base opcode for the type
 2738: */
 2739: 
 2740: static pcre_uchar
 2741: get_repeat_base(pcre_uchar c)
 2742: {
 2743: return (c > OP_TYPEPOSUPTO)? c :
 2744:        (c >= OP_TYPESTAR)?   OP_TYPESTAR :
 2745:        (c >= OP_NOTSTARI)?   OP_NOTSTARI :
 2746:        (c >= OP_NOTSTAR)?    OP_NOTSTAR :
 2747:        (c >= OP_STARI)?      OP_STARI :
 2748:                              OP_STAR;
 2749: }
 2750: 
 2751: 
 2752: 
 2753: #ifdef SUPPORT_UCP
 2754: /*************************************************
 2755: *        Check a character and a property        *
 2756: *************************************************/
 2757: 
 2758: /* This function is called by check_auto_possessive() when a property item
 2759: is adjacent to a fixed character.
 2760: 
 2761: Arguments:
 2762:   c            the character
 2763:   ptype        the property type
 2764:   pdata        the data for the type
 2765:   negated      TRUE if it's a negated property (\P or \p{^)
 2766: 
 2767: Returns:       TRUE if auto-possessifying is OK
 2768: */
 2769: 
 2770: static BOOL
 2771: check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
 2772:   BOOL negated)
 2773: {
 2774: const pcre_uint32 *p;
 2775: const ucd_record *prop = GET_UCD(c);
 2776: 
 2777: switch(ptype)
 2778:   {
 2779:   case PT_LAMP:
 2780:   return (prop->chartype == ucp_Lu ||
 2781:           prop->chartype == ucp_Ll ||
 2782:           prop->chartype == ucp_Lt) == negated;
 2783: 
 2784:   case PT_GC:
 2785:   return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
 2786: 
 2787:   case PT_PC:
 2788:   return (pdata == prop->chartype) == negated;
 2789: 
 2790:   case PT_SC:
 2791:   return (pdata == prop->script) == negated;
 2792: 
 2793:   /* These are specials */
 2794: 
 2795:   case PT_ALNUM:
 2796:   return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
 2797:           PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
 2798: 
 2799:   /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
 2800:   means that Perl space and POSIX space are now identical. PCRE was changed
 2801:   at release 8.34. */
 2802: 
 2803:   case PT_SPACE:    /* Perl space */
 2804:   case PT_PXSPACE:  /* POSIX space */
 2805:   switch(c)
 2806:     {
 2807:     HSPACE_CASES:
 2808:     VSPACE_CASES:
 2809:     return negated;
 2810: 
 2811:     default:
 2812:     return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
 2813:     }
 2814:   break;  /* Control never reaches here */
 2815: 
 2816:   case PT_WORD:
 2817:   return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
 2818:           PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
 2819:           c == CHAR_UNDERSCORE) == negated;
 2820: 
 2821:   case PT_CLIST:
 2822:   p = PRIV(ucd_caseless_sets) + prop->caseset;
 2823:   for (;;)
 2824:     {
 2825:     if (c < *p) return !negated;
 2826:     if (c == *p++) return negated;
 2827:     }
 2828:   break;  /* Control never reaches here */
 2829:   }
 2830: 
 2831: return FALSE;
 2832: }
 2833: #endif  /* SUPPORT_UCP */
 2834: 
 2835: 
 2836: 
 2837: /*************************************************
 2838: *        Fill the character property list        *
 2839: *************************************************/
 2840: 
 2841: /* Checks whether the code points to an opcode that can take part in auto-
 2842: possessification, and if so, fills a list with its properties.
 2843: 
 2844: Arguments:
 2845:   code        points to start of expression
 2846:   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
 2847:   fcc         points to case-flipping table
 2848:   list        points to output list
 2849:               list[0] will be filled with the opcode
 2850:               list[1] will be non-zero if this opcode
 2851:                 can match an empty character string
 2852:               list[2..7] depends on the opcode
 2853: 
 2854: Returns:      points to the start of the next opcode if *code is accepted
 2855:               NULL if *code is not accepted
 2856: */
 2857: 
 2858: static const pcre_uchar *
 2859: get_chr_property_list(const pcre_uchar *code, BOOL utf,
 2860:   const pcre_uint8 *fcc, pcre_uint32 *list)
 2861: {
 2862: pcre_uchar c = *code;
 2863: pcre_uchar base;
 2864: const pcre_uchar *end;
 2865: pcre_uint32 chr;
 2866: 
 2867: #ifdef SUPPORT_UCP
 2868: pcre_uint32 *clist_dest;
 2869: const pcre_uint32 *clist_src;
 2870: #else
 2871: utf = utf;  /* Suppress "unused parameter" compiler warning */
 2872: #endif
 2873: 
 2874: list[0] = c;
 2875: list[1] = FALSE;
 2876: code++;
 2877: 
 2878: if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
 2879:   {
 2880:   base = get_repeat_base(c);
 2881:   c -= (base - OP_STAR);
 2882: 
 2883:   if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
 2884:     code += IMM2_SIZE;
 2885: 
 2886:   list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
 2887: 
 2888:   switch(base)
 2889:     {
 2890:     case OP_STAR:
 2891:     list[0] = OP_CHAR;
 2892:     break;
 2893: 
 2894:     case OP_STARI:
 2895:     list[0] = OP_CHARI;
 2896:     break;
 2897: 
 2898:     case OP_NOTSTAR:
 2899:     list[0] = OP_NOT;
 2900:     break;
 2901: 
 2902:     case OP_NOTSTARI:
 2903:     list[0] = OP_NOTI;
 2904:     break;
 2905: 
 2906:     case OP_TYPESTAR:
 2907:     list[0] = *code;
 2908:     code++;
 2909:     break;
 2910:     }
 2911:   c = list[0];
 2912:   }
 2913: 
 2914: switch(c)
 2915:   {
 2916:   case OP_NOT_DIGIT:
 2917:   case OP_DIGIT:
 2918:   case OP_NOT_WHITESPACE:
 2919:   case OP_WHITESPACE:
 2920:   case OP_NOT_WORDCHAR:
 2921:   case OP_WORDCHAR:
 2922:   case OP_ANY:
 2923:   case OP_ALLANY:
 2924:   case OP_ANYNL:
 2925:   case OP_NOT_HSPACE:
 2926:   case OP_HSPACE:
 2927:   case OP_NOT_VSPACE:
 2928:   case OP_VSPACE:
 2929:   case OP_EXTUNI:
 2930:   case OP_EODN:
 2931:   case OP_EOD:
 2932:   case OP_DOLL:
 2933:   case OP_DOLLM:
 2934:   return code;
 2935: 
 2936:   case OP_CHAR:
 2937:   case OP_NOT:
 2938:   GETCHARINCTEST(chr, code);
 2939:   list[2] = chr;
 2940:   list[3] = NOTACHAR;
 2941:   return code;
 2942: 
 2943:   case OP_CHARI:
 2944:   case OP_NOTI:
 2945:   list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
 2946:   GETCHARINCTEST(chr, code);
 2947:   list[2] = chr;
 2948: 
 2949: #ifdef SUPPORT_UCP
 2950:   if (chr < 128 || (chr < 256 && !utf))
 2951:     list[3] = fcc[chr];
 2952:   else
 2953:     list[3] = UCD_OTHERCASE(chr);
 2954: #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
 2955:   list[3] = (chr < 256) ? fcc[chr] : chr;
 2956: #else
 2957:   list[3] = fcc[chr];
 2958: #endif
 2959: 
 2960:   /* The othercase might be the same value. */
 2961: 
 2962:   if (chr == list[3])
 2963:     list[3] = NOTACHAR;
 2964:   else
 2965:     list[4] = NOTACHAR;
 2966:   return code;
 2967: 
 2968: #ifdef SUPPORT_UCP
 2969:   case OP_PROP:
 2970:   case OP_NOTPROP:
 2971:   if (code[0] != PT_CLIST)
 2972:     {
 2973:     list[2] = code[0];
 2974:     list[3] = code[1];
 2975:     return code + 2;
 2976:     }
 2977: 
 2978:   /* Convert only if we have enough space. */
 2979: 
 2980:   clist_src = PRIV(ucd_caseless_sets) + code[1];
 2981:   clist_dest = list + 2;
 2982:   code += 2;
 2983: 
 2984:   do {
 2985:      if (clist_dest >= list + 8)
 2986:        {
 2987:        /* Early return if there is not enough space. This should never
 2988:        happen, since all clists are shorter than 5 character now. */
 2989:        list[2] = code[0];
 2990:        list[3] = code[1];
 2991:        return code;
 2992:        }
 2993:      *clist_dest++ = *clist_src;
 2994:      }
 2995:   while(*clist_src++ != NOTACHAR);
 2996: 
 2997:   /* All characters are stored. The terminating NOTACHAR
 2998:   is copied form the clist itself. */
 2999: 
 3000:   list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
 3001:   return code;
 3002: #endif
 3003: 
 3004:   case OP_NCLASS:
 3005:   case OP_CLASS:
 3006: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 3007:   case OP_XCLASS:
 3008:   if (c == OP_XCLASS)
 3009:     end = code + GET(code, 0) - 1;
 3010:   else
 3011: #endif
 3012:     end = code + 32 / sizeof(pcre_uchar);
 3013: 
 3014:   switch(*end)
 3015:     {
 3016:     case OP_CRSTAR:
 3017:     case OP_CRMINSTAR:
 3018:     case OP_CRQUERY:
 3019:     case OP_CRMINQUERY:
 3020:     case OP_CRPOSSTAR:
 3021:     case OP_CRPOSQUERY:
 3022:     list[1] = TRUE;
 3023:     end++;
 3024:     break;
 3025: 
 3026:     case OP_CRPLUS:
 3027:     case OP_CRMINPLUS:
 3028:     case OP_CRPOSPLUS:
 3029:     end++;
 3030:     break;
 3031: 
 3032:     case OP_CRRANGE:
 3033:     case OP_CRMINRANGE:
 3034:     case OP_CRPOSRANGE:
 3035:     list[1] = (GET2(end, 1) == 0);
 3036:     end += 1 + 2 * IMM2_SIZE;
 3037:     break;
 3038:     }
 3039:   list[2] = end - code;
 3040:   return end;
 3041:   }
 3042: return NULL;    /* Opcode not accepted */
 3043: }
 3044: 
 3045: 
 3046: 
 3047: /*************************************************
 3048: *    Scan further character sets for match       *
 3049: *************************************************/
 3050: 
 3051: /* Checks whether the base and the current opcode have a common character, in
 3052: which case the base cannot be possessified.
 3053: 
 3054: Arguments:
 3055:   code        points to the byte code
 3056:   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
 3057:   cd          static compile data
 3058:   base_list   the data list of the base opcode
 3059: 
 3060: Returns:      TRUE if the auto-possessification is possible
 3061: */
 3062: 
 3063: static BOOL
 3064: compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
 3065:   const pcre_uint32 *base_list, const pcre_uchar *base_end)
 3066: {
 3067: pcre_uchar c;
 3068: pcre_uint32 list[8];
 3069: const pcre_uint32 *chr_ptr;
 3070: const pcre_uint32 *ochr_ptr;
 3071: const pcre_uint32 *list_ptr;
 3072: const pcre_uchar *next_code;
 3073: const pcre_uint8 *class_bitset;
 3074: const pcre_uint32 *set1, *set2, *set_end;
 3075: pcre_uint32 chr;
 3076: BOOL accepted, invert_bits;
 3077: 
 3078: /* Note: the base_list[1] contains whether the current opcode has greedy
 3079: (represented by a non-zero value) quantifier. This is a different from
 3080: other character type lists, which stores here that the character iterator
 3081: matches to an empty string (also represented by a non-zero value). */
 3082: 
 3083: for(;;)
 3084:   {
 3085:   /* All operations move the code pointer forward.
 3086:   Therefore infinite recursions are not possible. */
 3087: 
 3088:   c = *code;
 3089: 
 3090:   /* Skip over callouts */
 3091: 
 3092:   if (c == OP_CALLOUT)
 3093:     {
 3094:     code += PRIV(OP_lengths)[c];
 3095:     continue;
 3096:     }
 3097: 
 3098:   if (c == OP_ALT)
 3099:     {
 3100:     do code += GET(code, 1); while (*code == OP_ALT);
 3101:     c = *code;
 3102:     }
 3103: 
 3104:   switch(c)
 3105:     {
 3106:     case OP_END:
 3107:     case OP_KETRPOS:
 3108:     /* TRUE only in greedy case. The non-greedy case could be replaced by
 3109:     an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
 3110:     uses more memory, which we cannot get at this stage.) */
 3111: 
 3112:     return base_list[1] != 0;
 3113: 
 3114:     case OP_KET:
 3115:     /* If the bracket is capturing, and referenced by an OP_RECURSE, or
 3116:     it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
 3117:     cannot be converted to a possessive form. */
 3118: 
 3119:     if (base_list[1] == 0) return FALSE;
 3120: 
 3121:     switch(*(code - GET(code, 1)))
 3122:       {
 3123:       case OP_ASSERT:
 3124:       case OP_ASSERT_NOT:
 3125:       case OP_ASSERTBACK:
 3126:       case OP_ASSERTBACK_NOT:
 3127:       case OP_ONCE:
 3128:       case OP_ONCE_NC:
 3129:       /* Atomic sub-patterns and assertions can always auto-possessify their
 3130:       last iterator. */
 3131:       return TRUE;
 3132:       }
 3133: 
 3134:     code += PRIV(OP_lengths)[c];
 3135:     continue;
 3136: 
 3137:     case OP_ONCE:
 3138:     case OP_ONCE_NC:
 3139:     case OP_BRA:
 3140:     case OP_CBRA:
 3141:     next_code = code + GET(code, 1);
 3142:     code += PRIV(OP_lengths)[c];
 3143: 
 3144:     while (*next_code == OP_ALT)
 3145:       {
 3146:       if (!compare_opcodes(code, utf, cd, base_list, base_end)) return FALSE;
 3147:       code = next_code + 1 + LINK_SIZE;
 3148:       next_code += GET(next_code, 1);
 3149:       }
 3150:     continue;
 3151: 
 3152:     case OP_BRAZERO:
 3153:     case OP_BRAMINZERO:
 3154: 
 3155:     next_code = code + 1;
 3156:     if (*next_code != OP_BRA && *next_code != OP_CBRA
 3157:         && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
 3158: 
 3159:     do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
 3160: 
 3161:     /* The bracket content will be checked by the
 3162:     OP_BRA/OP_CBRA case above. */
 3163:     next_code += 1 + LINK_SIZE;
 3164:     if (!compare_opcodes(next_code, utf, cd, base_list, base_end))
 3165:       return FALSE;
 3166: 
 3167:     code += PRIV(OP_lengths)[c];
 3168:     continue;
 3169:     }
 3170: 
 3171:   /* Check for a supported opcode, and load its properties. */
 3172: 
 3173:   code = get_chr_property_list(code, utf, cd->fcc, list);
 3174:   if (code == NULL) return FALSE;    /* Unsupported */
 3175: 
 3176:   /* If either opcode is a small character list, set pointers for comparing
 3177:   characters from that list with another list, or with a property. */
 3178: 
 3179:   if (base_list[0] == OP_CHAR)
 3180:     {
 3181:     chr_ptr = base_list + 2;
 3182:     list_ptr = list;
 3183:     }
 3184:   else if (list[0] == OP_CHAR)
 3185:     {
 3186:     chr_ptr = list + 2;
 3187:     list_ptr = base_list;
 3188:     }
 3189: 
 3190:   /* Character bitsets can also be compared to certain opcodes. */
 3191: 
 3192:   else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
 3193: #ifdef COMPILE_PCRE8
 3194:       /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
 3195:       || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
 3196: #endif
 3197:       )
 3198:     {
 3199: #ifdef COMPILE_PCRE8
 3200:     if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
 3201: #else
 3202:     if (base_list[0] == OP_CLASS)
 3203: #endif
 3204:       {
 3205:       set1 = (pcre_uint32 *)(base_end - base_list[2]);
 3206:       list_ptr = list;
 3207:       }
 3208:     else
 3209:       {
 3210:       set1 = (pcre_uint32 *)(code - list[2]);
 3211:       list_ptr = base_list;
 3212:       }
 3213: 
 3214:     invert_bits = FALSE;
 3215:     switch(list_ptr[0])
 3216:       {
 3217:       case OP_CLASS:
 3218:       case OP_NCLASS:
 3219:       set2 = (pcre_uint32 *)
 3220:         ((list_ptr == list ? code : base_end) - list_ptr[2]);
 3221:       break;
 3222: 
 3223:       /* OP_XCLASS cannot be supported here, because its bitset
 3224:       is not necessarily complete. E.g: [a-\0x{200}] is stored
 3225:       as a character range, and the appropriate bits are not set. */
 3226: 
 3227:       case OP_NOT_DIGIT:
 3228:         invert_bits = TRUE;
 3229:         /* Fall through */
 3230:       case OP_DIGIT:
 3231:         set2 = (pcre_uint32 *)(cd->cbits + cbit_digit);
 3232:         break;
 3233: 
 3234:       case OP_NOT_WHITESPACE:
 3235:         invert_bits = TRUE;
 3236:         /* Fall through */
 3237:       case OP_WHITESPACE:
 3238:         set2 = (pcre_uint32 *)(cd->cbits + cbit_space);
 3239:         break;
 3240: 
 3241:       case OP_NOT_WORDCHAR:
 3242:         invert_bits = TRUE;
 3243:         /* Fall through */
 3244:       case OP_WORDCHAR:
 3245:         set2 = (pcre_uint32 *)(cd->cbits + cbit_word);
 3246:         break;
 3247: 
 3248:       default:
 3249:       return FALSE;
 3250:       }
 3251: 
 3252:     /* Compare 4 bytes to improve speed. */
 3253:     set_end = set1 + (32 / 4);
 3254:     if (invert_bits)
 3255:       {
 3256:       do
 3257:         {
 3258:         if ((*set1++ & ~(*set2++)) != 0) return FALSE;
 3259:         }
 3260:       while (set1 < set_end);
 3261:       }
 3262:     else
 3263:       {
 3264:       do
 3265:         {
 3266:         if ((*set1++ & *set2++) != 0) return FALSE;
 3267:         }
 3268:       while (set1 < set_end);
 3269:       }
 3270: 
 3271:     if (list[1] == 0) return TRUE;
 3272:     /* Might be an empty repeat. */
 3273:     continue;
 3274:     }
 3275: 
 3276:   /* Some property combinations also acceptable. Unicode property opcodes are
 3277:   processed specially; the rest can be handled with a lookup table. */
 3278: 
 3279:   else
 3280:     {
 3281:     pcre_uint32 leftop, rightop;
 3282: 
 3283:     leftop = base_list[0];
 3284:     rightop = list[0];
 3285: 
 3286: #ifdef SUPPORT_UCP
 3287:     accepted = FALSE; /* Always set in non-unicode case. */
 3288:     if (leftop == OP_PROP || leftop == OP_NOTPROP)
 3289:       {
 3290:       if (rightop == OP_EOD)
 3291:         accepted = TRUE;
 3292:       else if (rightop == OP_PROP || rightop == OP_NOTPROP)
 3293:         {
 3294:         int n;
 3295:         const pcre_uint8 *p;
 3296:         BOOL same = leftop == rightop;
 3297:         BOOL lisprop = leftop == OP_PROP;
 3298:         BOOL risprop = rightop == OP_PROP;
 3299:         BOOL bothprop = lisprop && risprop;
 3300: 
 3301:         /* There's a table that specifies how each combination is to be
 3302:         processed:
 3303:           0   Always return FALSE (never auto-possessify)
 3304:           1   Character groups are distinct (possessify if both are OP_PROP)
 3305:           2   Check character categories in the same group (general or particular)
 3306:           3   Return TRUE if the two opcodes are not the same
 3307:           ... see comments below
 3308:         */
 3309: 
 3310:         n = propposstab[base_list[2]][list[2]];
 3311:         switch(n)
 3312:           {
 3313:           case 0: break;
 3314:           case 1: accepted = bothprop; break;
 3315:           case 2: accepted = (base_list[3] == list[3]) != same; break;
 3316:           case 3: accepted = !same; break;
 3317: 
 3318:           case 4:  /* Left general category, right particular category */
 3319:           accepted = risprop && catposstab[base_list[3]][list[3]] == same;
 3320:           break;
 3321: 
 3322:           case 5:  /* Right general category, left particular category */
 3323:           accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
 3324:           break;
 3325: 
 3326:           /* This code is logically tricky. Think hard before fiddling with it.
 3327:           The posspropstab table has four entries per row. Each row relates to
 3328:           one of PCRE's special properties such as ALNUM or SPACE or WORD.
 3329:           Only WORD actually needs all four entries, but using repeats for the
 3330:           others means they can all use the same code below.
 3331: 
 3332:           The first two entries in each row are Unicode general categories, and
 3333:           apply always, because all the characters they include are part of the
 3334:           PCRE character set. The third and fourth entries are a general and a
 3335:           particular category, respectively, that include one or more relevant
 3336:           characters. One or the other is used, depending on whether the check
 3337:           is for a general or a particular category. However, in both cases the
 3338:           category contains more characters than the specials that are defined
 3339:           for the property being tested against. Therefore, it cannot be used
 3340:           in a NOTPROP case.
 3341: 
 3342:           Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
 3343:           Underscore is covered by ucp_P or ucp_Po. */
 3344: 
 3345:           case 6:  /* Left alphanum vs right general category */
 3346:           case 7:  /* Left space vs right general category */
 3347:           case 8:  /* Left word vs right general category */
 3348:           p = posspropstab[n-6];
 3349:           accepted = risprop && lisprop ==
 3350:             (list[3] != p[0] &&
 3351:              list[3] != p[1] &&
 3352:             (list[3] != p[2] || !lisprop));
 3353:           break;
 3354: 
 3355:           case 9:   /* Right alphanum vs left general category */
 3356:           case 10:  /* Right space vs left general category */
 3357:           case 11:  /* Right word vs left general category */
 3358:           p = posspropstab[n-9];
 3359:           accepted = lisprop && risprop ==
 3360:             (base_list[3] != p[0] &&
 3361:              base_list[3] != p[1] &&
 3362:             (base_list[3] != p[2] || !risprop));
 3363:           break;
 3364: 
 3365:           case 12:  /* Left alphanum vs right particular category */
 3366:           case 13:  /* Left space vs right particular category */
 3367:           case 14:  /* Left word vs right particular category */
 3368:           p = posspropstab[n-12];
 3369:           accepted = risprop && lisprop ==
 3370:             (catposstab[p[0]][list[3]] &&
 3371:              catposstab[p[1]][list[3]] &&
 3372:             (list[3] != p[3] || !lisprop));
 3373:           break;
 3374: 
 3375:           case 15:  /* Right alphanum vs left particular category */
 3376:           case 16:  /* Right space vs left particular category */
 3377:           case 17:  /* Right word vs left particular category */
 3378:           p = posspropstab[n-15];
 3379:           accepted = lisprop && risprop ==
 3380:             (catposstab[p[0]][base_list[3]] &&
 3381:              catposstab[p[1]][base_list[3]] &&
 3382:             (base_list[3] != p[3] || !risprop));
 3383:           break;
 3384:           }
 3385:         }
 3386:       }
 3387: 
 3388:     else
 3389: #endif  /* SUPPORT_UCP */
 3390: 
 3391:     accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
 3392:            rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
 3393:            autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
 3394: 
 3395:     if (!accepted)
 3396:       return FALSE;
 3397: 
 3398:     if (list[1] == 0) return TRUE;
 3399:     /* Might be an empty repeat. */
 3400:     continue;
 3401:     }
 3402: 
 3403:   /* Control reaches here only if one of the items is a small character list.
 3404:   All characters are checked against the other side. */
 3405: 
 3406:   do
 3407:     {
 3408:     chr = *chr_ptr;
 3409: 
 3410:     switch(list_ptr[0])
 3411:       {
 3412:       case OP_CHAR:
 3413:       ochr_ptr = list_ptr + 2;
 3414:       do
 3415:         {
 3416:         if (chr == *ochr_ptr) return FALSE;
 3417:         ochr_ptr++;
 3418:         }
 3419:       while(*ochr_ptr != NOTACHAR);
 3420:       break;
 3421: 
 3422:       case OP_NOT:
 3423:       ochr_ptr = list_ptr + 2;
 3424:       do
 3425:         {
 3426:         if (chr == *ochr_ptr)
 3427:           break;
 3428:         ochr_ptr++;
 3429:         }
 3430:       while(*ochr_ptr != NOTACHAR);
 3431:       if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
 3432:       break;
 3433: 
 3434:       /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
 3435:       set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
 3436: 
 3437:       case OP_DIGIT:
 3438:       if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
 3439:       break;
 3440: 
 3441:       case OP_NOT_DIGIT:
 3442:       if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
 3443:       break;
 3444: 
 3445:       case OP_WHITESPACE:
 3446:       if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
 3447:       break;
 3448: 
 3449:       case OP_NOT_WHITESPACE:
 3450:       if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
 3451:       break;
 3452: 
 3453:       case OP_WORDCHAR:
 3454:       if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
 3455:       break;
 3456: 
 3457:       case OP_NOT_WORDCHAR:
 3458:       if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
 3459:       break;
 3460: 
 3461:       case OP_HSPACE:
 3462:       switch(chr)
 3463:         {
 3464:         HSPACE_CASES: return FALSE;
 3465:         default: break;
 3466:         }
 3467:       break;
 3468: 
 3469:       case OP_NOT_HSPACE:
 3470:       switch(chr)
 3471:         {
 3472:         HSPACE_CASES: break;
 3473:         default: return FALSE;
 3474:         }
 3475:       break;
 3476: 
 3477:       case OP_ANYNL:
 3478:       case OP_VSPACE:
 3479:       switch(chr)
 3480:         {
 3481:         VSPACE_CASES: return FALSE;
 3482:         default: break;
 3483:         }
 3484:       break;
 3485: 
 3486:       case OP_NOT_VSPACE:
 3487:       switch(chr)
 3488:         {
 3489:         VSPACE_CASES: break;
 3490:         default: return FALSE;
 3491:         }
 3492:       break;
 3493: 
 3494:       case OP_DOLL:
 3495:       case OP_EODN:
 3496:       switch (chr)
 3497:         {
 3498:         case CHAR_CR:
 3499:         case CHAR_LF:
 3500:         case CHAR_VT:
 3501:         case CHAR_FF:
 3502:         case CHAR_NEL:
 3503: #ifndef EBCDIC
 3504:         case 0x2028:
 3505:         case 0x2029:
 3506: #endif  /* Not EBCDIC */
 3507:         return FALSE;
 3508:         }
 3509:       break;
 3510: 
 3511:       case OP_EOD:    /* Can always possessify before \z */
 3512:       break;
 3513: 
 3514: #ifdef SUPPORT_UCP
 3515:       case OP_PROP:
 3516:       case OP_NOTPROP:
 3517:       if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
 3518:             list_ptr[0] == OP_NOTPROP))
 3519:         return FALSE;
 3520:       break;
 3521: #endif
 3522: 
 3523:       case OP_NCLASS:
 3524:       if (chr > 255) return FALSE;
 3525:       /* Fall through */
 3526: 
 3527:       case OP_CLASS:
 3528:       if (chr > 255) break;
 3529:       class_bitset = (pcre_uint8 *)
 3530:         ((list_ptr == list ? code : base_end) - list_ptr[2]);
 3531:       if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
 3532:       break;
 3533: 
 3534: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 3535:       case OP_XCLASS:
 3536:       if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
 3537:           list_ptr[2] + LINK_SIZE, utf)) return FALSE;
 3538:       break;
 3539: #endif
 3540: 
 3541:       default:
 3542:       return FALSE;
 3543:       }
 3544: 
 3545:     chr_ptr++;
 3546:     }
 3547:   while(*chr_ptr != NOTACHAR);
 3548: 
 3549:   /* At least one character must be matched from this opcode. */
 3550: 
 3551:   if (list[1] == 0) return TRUE;
 3552:   }
 3553: 
 3554: return FALSE;
 3555: }
 3556: 
 3557: 
 3558: 
 3559: /*************************************************
 3560: *    Scan compiled regex for auto-possession     *
 3561: *************************************************/
 3562: 
 3563: /* Replaces single character iterations with their possessive alternatives
 3564: if appropriate. This function modifies the compiled opcode!
 3565: 
 3566: Arguments:
 3567:   code        points to start of the byte code
 3568:   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
 3569:   cd          static compile data
 3570: 
 3571: Returns:      nothing
 3572: */
 3573: 
 3574: static void
 3575: auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
 3576: {
 3577: register pcre_uchar c;
 3578: const pcre_uchar *end;
 3579: pcre_uchar *repeat_opcode;
 3580: pcre_uint32 list[8];
 3581: 
 3582: for (;;)
 3583:   {
 3584:   c = *code;
 3585: 
 3586:   if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
 3587:     {
 3588:     c -= get_repeat_base(c) - OP_STAR;
 3589:     end = (c <= OP_MINUPTO) ?
 3590:       get_chr_property_list(code, utf, cd->fcc, list) : NULL;
 3591:     list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
 3592: 
 3593:     if (end != NULL && compare_opcodes(end, utf, cd, list, end))
 3594:       {
 3595:       switch(c)
 3596:         {
 3597:         case OP_STAR:
 3598:         *code += OP_POSSTAR - OP_STAR;
 3599:         break;
 3600: 
 3601:         case OP_MINSTAR:
 3602:         *code += OP_POSSTAR - OP_MINSTAR;
 3603:         break;
 3604: 
 3605:         case OP_PLUS:
 3606:         *code += OP_POSPLUS - OP_PLUS;
 3607:         break;
 3608: 
 3609:         case OP_MINPLUS:
 3610:         *code += OP_POSPLUS - OP_MINPLUS;
 3611:         break;
 3612: 
 3613:         case OP_QUERY:
 3614:         *code += OP_POSQUERY - OP_QUERY;
 3615:         break;
 3616: 
 3617:         case OP_MINQUERY:
 3618:         *code += OP_POSQUERY - OP_MINQUERY;
 3619:         break;
 3620: 
 3621:         case OP_UPTO:
 3622:         *code += OP_POSUPTO - OP_UPTO;
 3623:         break;
 3624: 
 3625:         case OP_MINUPTO:
 3626:         *code += OP_MINUPTO - OP_UPTO;
 3627:         break;
 3628:         }
 3629:       }
 3630:     c = *code;
 3631:     }
 3632:   else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
 3633:     {
 3634: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 3635:     if (c == OP_XCLASS)
 3636:       repeat_opcode = code + GET(code, 1);
 3637:     else
 3638: #endif
 3639:       repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
 3640: 
 3641:     c = *repeat_opcode;
 3642:     if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
 3643:       {
 3644:       /* end must not be NULL. */
 3645:       end = get_chr_property_list(code, utf, cd->fcc, list);
 3646: 
 3647:       list[1] = (c & 1) == 0;
 3648: 
 3649:       if (compare_opcodes(end, utf, cd, list, end))
 3650:         {
 3651:         switch (c)
 3652:           {
 3653:           case OP_CRSTAR:
 3654:           case OP_CRMINSTAR:
 3655:           *repeat_opcode = OP_CRPOSSTAR;
 3656:           break;
 3657: 
 3658:           case OP_CRPLUS:
 3659:           case OP_CRMINPLUS:
 3660:           *repeat_opcode = OP_CRPOSPLUS;
 3661:           break;
 3662: 
 3663:           case OP_CRQUERY:
 3664:           case OP_CRMINQUERY:
 3665:           *repeat_opcode = OP_CRPOSQUERY;
 3666:           break;
 3667: 
 3668:           case OP_CRRANGE:
 3669:           case OP_CRMINRANGE:
 3670:           *repeat_opcode = OP_CRPOSRANGE;
 3671:           break;
 3672:           }
 3673:         }
 3674:       }
 3675:     c = *code;
 3676:     }
 3677: 
 3678:   switch(c)
 3679:     {
 3680:     case OP_END:
 3681:     return;
 3682: 
 3683:     case OP_TYPESTAR:
 3684:     case OP_TYPEMINSTAR:
 3685:     case OP_TYPEPLUS:
 3686:     case OP_TYPEMINPLUS:
 3687:     case OP_TYPEQUERY:
 3688:     case OP_TYPEMINQUERY:
 3689:     case OP_TYPEPOSSTAR:
 3690:     case OP_TYPEPOSPLUS:
 3691:     case OP_TYPEPOSQUERY:
 3692:     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
 3693:     break;
 3694: 
 3695:     case OP_TYPEUPTO:
 3696:     case OP_TYPEMINUPTO:
 3697:     case OP_TYPEEXACT:
 3698:     case OP_TYPEPOSUPTO:
 3699:     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
 3700:       code += 2;
 3701:     break;
 3702: 
 3703: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 3704:     case OP_XCLASS:
 3705:     code += GET(code, 1);
 3706:     break;
 3707: #endif
 3708: 
 3709:     case OP_MARK:
 3710:     case OP_PRUNE_ARG:
 3711:     case OP_SKIP_ARG:
 3712:     case OP_THEN_ARG:
 3713:     code += code[1];
 3714:     break;
 3715:     }
 3716: 
 3717:   /* Add in the fixed length from the table */
 3718: 
 3719:   code += PRIV(OP_lengths)[c];
 3720: 
 3721:   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
 3722:   a multi-byte character. The length in the table is a minimum, so we have to
 3723:   arrange to skip the extra bytes. */
 3724: 
 3725: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
 3726:   if (utf) switch(c)
 3727:     {
 3728:     case OP_CHAR:
 3729:     case OP_CHARI:
 3730:     case OP_NOT:
 3731:     case OP_NOTI:
 3732:     case OP_STAR:
 3733:     case OP_MINSTAR:
 3734:     case OP_PLUS:
 3735:     case OP_MINPLUS:
 3736:     case OP_QUERY:
 3737:     case OP_MINQUERY:
 3738:     case OP_UPTO:
 3739:     case OP_MINUPTO:
 3740:     case OP_EXACT:
 3741:     case OP_POSSTAR:
 3742:     case OP_POSPLUS:
 3743:     case OP_POSQUERY:
 3744:     case OP_POSUPTO:
 3745:     case OP_STARI:
 3746:     case OP_MINSTARI:
 3747:     case OP_PLUSI:
 3748:     case OP_MINPLUSI:
 3749:     case OP_QUERYI:
 3750:     case OP_MINQUERYI:
 3751:     case OP_UPTOI:
 3752:     case OP_MINUPTOI:
 3753:     case OP_EXACTI:
 3754:     case OP_POSSTARI:
 3755:     case OP_POSPLUSI:
 3756:     case OP_POSQUERYI:
 3757:     case OP_POSUPTOI:
 3758:     case OP_NOTSTAR:
 3759:     case OP_NOTMINSTAR:
 3760:     case OP_NOTPLUS:
 3761:     case OP_NOTMINPLUS:
 3762:     case OP_NOTQUERY:
 3763:     case OP_NOTMINQUERY:
 3764:     case OP_NOTUPTO:
 3765:     case OP_NOTMINUPTO:
 3766:     case OP_NOTEXACT:
 3767:     case OP_NOTPOSSTAR:
 3768:     case OP_NOTPOSPLUS:
 3769:     case OP_NOTPOSQUERY:
 3770:     case OP_NOTPOSUPTO:
 3771:     case OP_NOTSTARI:
 3772:     case OP_NOTMINSTARI:
 3773:     case OP_NOTPLUSI:
 3774:     case OP_NOTMINPLUSI:
 3775:     case OP_NOTQUERYI:
 3776:     case OP_NOTMINQUERYI:
 3777:     case OP_NOTUPTOI:
 3778:     case OP_NOTMINUPTOI:
 3779:     case OP_NOTEXACTI:
 3780:     case OP_NOTPOSSTARI:
 3781:     case OP_NOTPOSPLUSI:
 3782:     case OP_NOTPOSQUERYI:
 3783:     case OP_NOTPOSUPTOI:
 3784:     if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
 3785:     break;
 3786:     }
 3787: #else
 3788:   (void)(utf);  /* Keep compiler happy by referencing function argument */
 3789: #endif
 3790:   }
 3791: }
 3792: 
 3793: 
 3794: 
 3795: /*************************************************
 3796: *           Check for POSIX class syntax         *
 3797: *************************************************/
 3798: 
 3799: /* This function is called when the sequence "[:" or "[." or "[=" is
 3800: encountered in a character class. It checks whether this is followed by a
 3801: sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
 3802: reach an unescaped ']' without the special preceding character, return FALSE.
 3803: 
 3804: Originally, this function only recognized a sequence of letters between the
 3805: terminators, but it seems that Perl recognizes any sequence of characters,
 3806: though of course unknown POSIX names are subsequently rejected. Perl gives an
 3807: "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
 3808: didn't consider this to be a POSIX class. Likewise for [:1234:].
 3809: 
 3810: The problem in trying to be exactly like Perl is in the handling of escapes. We
 3811: have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
 3812: class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
 3813: below handles the special case of \], but does not try to do any other escape
 3814: processing. This makes it different from Perl for cases such as [:l\ower:]
 3815: where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
 3816: "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
 3817: I think.
 3818: 
 3819: A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
 3820: It seems that the appearance of a nested POSIX class supersedes an apparent
 3821: external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
 3822: a digit.
 3823: 
 3824: In Perl, unescaped square brackets may also appear as part of class names. For
 3825: example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
 3826: [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
 3827: seem right at all. PCRE does not allow closing square brackets in POSIX class
 3828: names.
 3829: 
 3830: Arguments:
 3831:   ptr      pointer to the initial [
 3832:   endptr   where to return the end pointer
 3833: 
 3834: Returns:   TRUE or FALSE
 3835: */
 3836: 
 3837: static BOOL
 3838: check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
 3839: {
 3840: pcre_uchar terminator;          /* Don't combine these lines; the Solaris cc */
 3841: terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
 3842: for (++ptr; *ptr != CHAR_NULL; ptr++)
 3843:   {
 3844:   if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
 3845:     ptr++;
 3846:   else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
 3847:   else
 3848:     {
 3849:     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
 3850:       {
 3851:       *endptr = ptr;
 3852:       return TRUE;
 3853:       }
 3854:     if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
 3855:          (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
 3856:           ptr[1] == CHAR_EQUALS_SIGN) &&
 3857:         check_posix_syntax(ptr, endptr))
 3858:       return FALSE;
 3859:     }
 3860:   }
 3861: return FALSE;
 3862: }
 3863: 
 3864: 
 3865: 
 3866: 
 3867: /*************************************************
 3868: *          Check POSIX class name                *
 3869: *************************************************/
 3870: 
 3871: /* This function is called to check the name given in a POSIX-style class entry
 3872: such as [:alnum:].
 3873: 
 3874: Arguments:
 3875:   ptr        points to the first letter
 3876:   len        the length of the name
 3877: 
 3878: Returns:     a value representing the name, or -1 if unknown
 3879: */
 3880: 
 3881: static int
 3882: check_posix_name(const pcre_uchar *ptr, int len)
 3883: {
 3884: const char *pn = posix_names;
 3885: register int yield = 0;
 3886: while (posix_name_lengths[yield] != 0)
 3887:   {
 3888:   if (len == posix_name_lengths[yield] &&
 3889:     STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
 3890:   pn += posix_name_lengths[yield] + 1;
 3891:   yield++;
 3892:   }
 3893: return -1;
 3894: }
 3895: 
 3896: 
 3897: /*************************************************
 3898: *    Adjust OP_RECURSE items in repeated group   *
 3899: *************************************************/
 3900: 
 3901: /* OP_RECURSE items contain an offset from the start of the regex to the group
 3902: that is referenced. This means that groups can be replicated for fixed
 3903: repetition simply by copying (because the recursion is allowed to refer to
 3904: earlier groups that are outside the current group). However, when a group is
 3905: optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
 3906: inserted before it, after it has been compiled. This means that any OP_RECURSE
 3907: items within it that refer to the group itself or any contained groups have to
 3908: have their offsets adjusted. That one of the jobs of this function. Before it
 3909: is called, the partially compiled regex must be temporarily terminated with
 3910: OP_END.
 3911: 
 3912: This function has been extended with the possibility of forward references for
 3913: recursions and subroutine calls. It must also check the list of such references
 3914: for the group we are dealing with. If it finds that one of the recursions in
 3915: the current group is on this list, it adjusts the offset in the list, not the
 3916: value in the reference (which is a group number).
 3917: 
 3918: Arguments:
 3919:   group      points to the start of the group
 3920:   adjust     the amount by which the group is to be moved
 3921:   utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode
 3922:   cd         contains pointers to tables etc.
 3923:   save_hwm   the hwm forward reference pointer at the start of the group
 3924: 
 3925: Returns:     nothing
 3926: */
 3927: 
 3928: static void
 3929: adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
 3930:   pcre_uchar *save_hwm)
 3931: {
 3932: pcre_uchar *ptr = group;
 3933: 
 3934: while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
 3935:   {
 3936:   int offset;
 3937:   pcre_uchar *hc;
 3938: 
 3939:   /* See if this recursion is on the forward reference list. If so, adjust the
 3940:   reference. */
 3941: 
 3942:   for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
 3943:     {
 3944:     offset = (int)GET(hc, 0);
 3945:     if (cd->start_code + offset == ptr + 1)
 3946:       {
 3947:       PUT(hc, 0, offset + adjust);
 3948:       break;
 3949:       }
 3950:     }
 3951: 
 3952:   /* Otherwise, adjust the recursion offset if it's after the start of this
 3953:   group. */
 3954: 
 3955:   if (hc >= cd->hwm)
 3956:     {
 3957:     offset = (int)GET(ptr, 1);
 3958:     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
 3959:     }
 3960: 
 3961:   ptr += 1 + LINK_SIZE;
 3962:   }
 3963: }
 3964: 
 3965: 
 3966: 
 3967: /*************************************************
 3968: *        Insert an automatic callout point       *
 3969: *************************************************/
 3970: 
 3971: /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
 3972: callout points before each pattern item.
 3973: 
 3974: Arguments:
 3975:   code           current code pointer
 3976:   ptr            current pattern pointer
 3977:   cd             pointers to tables etc
 3978: 
 3979: Returns:         new code pointer
 3980: */
 3981: 
 3982: static pcre_uchar *
 3983: auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
 3984: {
 3985: *code++ = OP_CALLOUT;
 3986: *code++ = 255;
 3987: PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
 3988: PUT(code, LINK_SIZE, 0);                       /* Default length */
 3989: return code + 2 * LINK_SIZE;
 3990: }
 3991: 
 3992: 
 3993: 
 3994: /*************************************************
 3995: *         Complete a callout item                *
 3996: *************************************************/
 3997: 
 3998: /* A callout item contains the length of the next item in the pattern, which
 3999: we can't fill in till after we have reached the relevant point. This is used
 4000: for both automatic and manual callouts.
 4001: 
 4002: Arguments:
 4003:   previous_callout   points to previous callout item
 4004:   ptr                current pattern pointer
 4005:   cd                 pointers to tables etc
 4006: 
 4007: Returns:             nothing
 4008: */
 4009: 
 4010: static void
 4011: complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
 4012: {
 4013: int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
 4014: PUT(previous_callout, 2 + LINK_SIZE, length);
 4015: }
 4016: 
 4017: 
 4018: 
 4019: #ifdef SUPPORT_UCP
 4020: /*************************************************
 4021: *           Get othercase range                  *
 4022: *************************************************/
 4023: 
 4024: /* This function is passed the start and end of a class range, in UTF-8 mode
 4025: with UCP support. It searches up the characters, looking for ranges of
 4026: characters in the "other" case. Each call returns the next one, updating the
 4027: start address. A character with multiple other cases is returned on its own
 4028: with a special return value.
 4029: 
 4030: Arguments:
 4031:   cptr        points to starting character value; updated
 4032:   d           end value
 4033:   ocptr       where to put start of othercase range
 4034:   odptr       where to put end of othercase range
 4035: 
 4036: Yield:        -1 when no more
 4037:                0 when a range is returned
 4038:               >0 the CASESET offset for char with multiple other cases
 4039:                 in this case, ocptr contains the original
 4040: */
 4041: 
 4042: static int
 4043: get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
 4044:   pcre_uint32 *odptr)
 4045: {
 4046: pcre_uint32 c, othercase, next;
 4047: unsigned int co;
 4048: 
 4049: /* Find the first character that has an other case. If it has multiple other
 4050: cases, return its case offset value. */
 4051: 
 4052: for (c = *cptr; c <= d; c++)
 4053:   {
 4054:   if ((co = UCD_CASESET(c)) != 0)
 4055:     {
 4056:     *ocptr = c++;   /* Character that has the set */
 4057:     *cptr = c;      /* Rest of input range */
 4058:     return (int)co;
 4059:     }
 4060:   if ((othercase = UCD_OTHERCASE(c)) != c) break;
 4061:   }
 4062: 
 4063: if (c > d) return -1;  /* Reached end of range */
 4064: 
 4065: *ocptr = othercase;
 4066: next = othercase + 1;
 4067: 
 4068: for (++c; c <= d; c++)
 4069:   {
 4070:   if (UCD_OTHERCASE(c) != next) break;
 4071:   next++;
 4072:   }
 4073: 
 4074: *odptr = next - 1;     /* End of othercase range */
 4075: *cptr = c;             /* Rest of input range */
 4076: return 0;
 4077: }
 4078: #endif  /* SUPPORT_UCP */
 4079: 
 4080: 
 4081: 
 4082: /*************************************************
 4083: *        Add a character or range to a class     *
 4084: *************************************************/
 4085: 
 4086: /* This function packages up the logic of adding a character or range of
 4087: characters to a class. The character values in the arguments will be within the
 4088: valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
 4089: mutually recursive with the function immediately below.
 4090: 
 4091: Arguments:
 4092:   classbits     the bit map for characters < 256
 4093:   uchardptr     points to the pointer for extra data
 4094:   options       the options word
 4095:   cd            contains pointers to tables etc.
 4096:   start         start of range character
 4097:   end           end of range character
 4098: 
 4099: Returns:        the number of < 256 characters added
 4100:                 the pointer to extra data is updated
 4101: */
 4102: 
 4103: static int
 4104: add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
 4105:   compile_data *cd, pcre_uint32 start, pcre_uint32 end)
 4106: {
 4107: pcre_uint32 c;
 4108: int n8 = 0;
 4109: 
 4110: /* If caseless matching is required, scan the range and process alternate
 4111: cases. In Unicode, there are 8-bit characters that have alternate cases that
 4112: are greater than 255 and vice-versa. Sometimes we can just extend the original
 4113: range. */
 4114: 
 4115: if ((options & PCRE_CASELESS) != 0)
 4116:   {
 4117: #ifdef SUPPORT_UCP
 4118:   if ((options & PCRE_UTF8) != 0)
 4119:     {
 4120:     int rc;
 4121:     pcre_uint32 oc, od;
 4122: 
 4123:     options &= ~PCRE_CASELESS;   /* Remove for recursive calls */
 4124:     c = start;
 4125: 
 4126:     while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
 4127:       {
 4128:       /* Handle a single character that has more than one other case. */
 4129: 
 4130:       if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
 4131:         PRIV(ucd_caseless_sets) + rc, oc);
 4132: 
 4133:       /* Do nothing if the other case range is within the original range. */
 4134: 
 4135:       else if (oc >= start && od <= end) continue;
 4136: 
 4137:       /* Extend the original range if there is overlap, noting that if oc < c, we
 4138:       can't have od > end because a subrange is always shorter than the basic
 4139:       range. Otherwise, use a recursive call to add the additional range. */
 4140: 
 4141:       else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
 4142:       else if (od > end && oc <= end + 1) end = od;       /* Extend upwards */
 4143:       else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
 4144:       }
 4145:     }
 4146:   else
 4147: #endif  /* SUPPORT_UCP */
 4148: 
 4149:   /* Not UTF-mode, or no UCP */
 4150: 
 4151:   for (c = start; c <= end && c < 256; c++)
 4152:     {
 4153:     SETBIT(classbits, cd->fcc[c]);
 4154:     n8++;
 4155:     }
 4156:   }
 4157: 
 4158: /* Now handle the original range. Adjust the final value according to the bit
 4159: length - this means that the same lists of (e.g.) horizontal spaces can be used
 4160: in all cases. */
 4161: 
 4162: #if defined COMPILE_PCRE8
 4163: #ifdef SUPPORT_UTF
 4164:   if ((options & PCRE_UTF8) == 0)
 4165: #endif
 4166:   if (end > 0xff) end = 0xff;
 4167: 
 4168: #elif defined COMPILE_PCRE16
 4169: #ifdef SUPPORT_UTF
 4170:   if ((options & PCRE_UTF16) == 0)
 4171: #endif
 4172:   if (end > 0xffff) end = 0xffff;
 4173: 
 4174: #endif /* COMPILE_PCRE[8|16] */
 4175: 
 4176: /* If all characters are less than 256, use the bit map. Otherwise use extra
 4177: data. */
 4178: 
 4179: if (end < 0x100)
 4180:   {
 4181:   for (c = start; c <= end; c++)
 4182:     {
 4183:     n8++;
 4184:     SETBIT(classbits, c);
 4185:     }
 4186:   }
 4187: 
 4188: else
 4189:   {
 4190:   pcre_uchar *uchardata = *uchardptr;
 4191: 
 4192: #ifdef SUPPORT_UTF
 4193:   if ((options & PCRE_UTF8) != 0)  /* All UTFs use the same flag bit */
 4194:     {
 4195:     if (start < end)
 4196:       {
 4197:       *uchardata++ = XCL_RANGE;
 4198:       uchardata += PRIV(ord2utf)(start, uchardata);
 4199:       uchardata += PRIV(ord2utf)(end, uchardata);
 4200:       }
 4201:     else if (start == end)
 4202:       {
 4203:       *uchardata++ = XCL_SINGLE;
 4204:       uchardata += PRIV(ord2utf)(start, uchardata);
 4205:       }
 4206:     }
 4207:   else
 4208: #endif  /* SUPPORT_UTF */
 4209: 
 4210:   /* Without UTF support, character values are constrained by the bit length,
 4211:   and can only be > 256 for 16-bit and 32-bit libraries. */
 4212: 
 4213: #ifdef COMPILE_PCRE8
 4214:     {}
 4215: #else
 4216:   if (start < end)
 4217:     {
 4218:     *uchardata++ = XCL_RANGE;
 4219:     *uchardata++ = start;
 4220:     *uchardata++ = end;
 4221:     }
 4222:   else if (start == end)
 4223:     {
 4224:     *uchardata++ = XCL_SINGLE;
 4225:     *uchardata++ = start;
 4226:     }
 4227: #endif
 4228: 
 4229:   *uchardptr = uchardata;   /* Updata extra data pointer */
 4230:   }
 4231: 
 4232: return n8;    /* Number of 8-bit characters */
 4233: }
 4234: 
 4235: 
 4236: 
 4237: 
 4238: /*************************************************
 4239: *        Add a list of characters to a class     *
 4240: *************************************************/
 4241: 
 4242: /* This function is used for adding a list of case-equivalent characters to a
 4243: class, and also for adding a list of horizontal or vertical whitespace. Each run
 4244: of consecutive values in the list is handed to add_to_class() as one range, so
 4245: the list should be in ascending order. This function is mutually recursive with
 4246: add_to_class(), the function above.
 4247: 
 4248: Arguments:
 4249:   classbits     the bit map for characters < 256
 4250:   uchardptr     points to the pointer for extra data
 4251:   options       the options word
 4252:   cd            contains pointers to tables etc.
 4253:   p             points to row of 32-bit values, terminated by NOTACHAR
 4254:   except        character to omit; this is used when adding lists of
 4255:                   case-equivalent characters to avoid including the one we
 4256:                   already know about
 4257: 
 4258: Returns:        the number of < 256 characters added
 4259:                 the pointer to extra data is updated
 4260: */
 4261: 
 4262: static int
 4263: add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
 4264:   compile_data *cd, const pcre_uint32 *p, unsigned int except)
 4265: {
 4266: int n8 = 0;                         /* Count of < 256 characters added */
 4267: while (p[0] < NOTACHAR)             /* Stop at the list terminator */
 4268:   {
 4269:   int n = 0;                        /* Offset of the last value in this run */
 4270:   if (p[0] != except)               /* Only a run's first value is checked */
 4271:     {
 4272:     while(p[n+1] == p[0] + n + 1) n++;   /* Extend over consecutive values */
 4273:     n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
 4274:     }
 4275:   p += n + 1;                       /* Skip the run, or the omitted value */
 4276:   }
 4277: return n8;
 4278: }
 4279: 
 4280: 
 4281: 
 4282: /*************************************************
 4283: *    Add characters not in a list to a class     *
 4284: *************************************************/
 4285: 
 4286: /* This function adds the complement of a list of horizontal or vertical
 4287: whitespace to a class, one add_to_class() range per gap. The list must be in order.
 4288: 
 4289: Arguments:
 4290:   classbits     the bit map for characters < 256
 4291:   uchardptr     points to the pointer for extra data
 4292:   options       the options word
 4293:   cd            contains pointers to tables etc.
 4294:   p             points to row of 32-bit values, terminated by NOTACHAR
 4295: 
 4296: Returns:        the number of < 256 characters added
 4297:                 the pointer to extra data is updated
 4298: */
 4299: 
 4300: static int
 4301: add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
 4302:   int options, compile_data *cd, const pcre_uint32 *p)
 4303: {
 4304: BOOL utf = (options & PCRE_UTF8) != 0;   /* Picks 0x10ffff or 0xffffffff as the final upper limit */
 4305: int n8 = 0;                              /* Count of < 256 characters added */
 4306: if (p[0] > 0)                            /* Range below the first list value */
 4307:   n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
 4308: while (p[0] < NOTACHAR)                  /* Until the NOTACHAR terminator */
 4309:   {                                      /* Add the gap that follows each run of listed values */
 4310:   while (p[1] == p[0] + 1) p++;          /* Move to the end of a consecutive run */
 4311:   n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
 4312:     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
 4313:   p++;                                   /* On to the next listed value */
 4314:   }
 4315: return n8;
 4316: }
 4317: 
 4318: 
 4319: 
 4320: /*************************************************
 4321: *           Compile one branch                   *
 4322: *************************************************/
 4323: 
 4324: /* Scan the pattern, compiling it into a vector. If the options are
 4325: changed during the branch, the pointer is used to change the external options
 4326: bits. This function is used during the pre-compile phase when we are trying
 4327: to find out the amount of memory needed, as well as during the real compile
 4328: phase. The value of lengthptr distinguishes the two phases.
 4329: 
 4330: Arguments:
 4331:   optionsptr        pointer to the option bits
 4332:   codeptr           points to the pointer to the current code point
 4333:   ptrptr            points to the current pattern pointer
 4334:   errorcodeptr      points to error code variable
 4335:   firstcharptr      place to put the first required character
 4336:   firstcharflagsptr place to put the first character flags, or a negative number
 4337:   reqcharptr        place to put the last required character
 4338:   reqcharflagsptr   place to put the last required character flags, or a negative number
 4339:   bcptr             points to current branch chain
 4340:   cond_depth        conditional nesting depth
 4341:   cd                contains pointers to tables etc.
 4342:   lengthptr         NULL during the real compile phase
 4343:                     points to length accumulator during pre-compile phase
 4344: 
 4345: Returns:            TRUE on success
 4346:                     FALSE, with *errorcodeptr set non-zero on error
 4347: */
 4348: 
 4349: static BOOL
 4350: compile_branch(int *optionsptr, pcre_uchar **codeptr,
 4351:   const pcre_uchar **ptrptr, int *errorcodeptr,
 4352:   pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
 4353:   pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
 4354:   branch_chain *bcptr, int cond_depth,
 4355:   compile_data *cd, int *lengthptr)
 4356: {
 4357: int repeat_type, op_type;
 4358: int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
 4359: int bravalue = 0;
 4360: int greedy_default, greedy_non_default;
 4361: pcre_uint32 firstchar, reqchar;
 4362: pcre_int32 firstcharflags, reqcharflags;
 4363: pcre_uint32 zeroreqchar, zerofirstchar;
 4364: pcre_int32 zeroreqcharflags, zerofirstcharflags;
 4365: pcre_int32 req_caseopt, reqvary, tempreqvary;
 4366: int options = *optionsptr;               /* May change dynamically */
 4367: int after_manual_callout = 0;
 4368: int length_prevgroup = 0;
 4369: register pcre_uint32 c;
 4370: int escape;
 4371: register pcre_uchar *code = *codeptr;
 4372: pcre_uchar *last_code = code;
 4373: pcre_uchar *orig_code = code;
 4374: pcre_uchar *tempcode;
 4375: BOOL inescq = FALSE;
 4376: BOOL groupsetfirstchar = FALSE;
 4377: const pcre_uchar *ptr = *ptrptr;
 4378: const pcre_uchar *tempptr;
 4379: const pcre_uchar *nestptr = NULL;
 4380: pcre_uchar *previous = NULL;
 4381: pcre_uchar *previous_callout = NULL;
 4382: pcre_uchar *save_hwm = NULL;
 4383: pcre_uint8 classbits[32];
 4384: 
 4385: /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
 4386: must not do this for other options (e.g. PCRE_EXTENDED) because they may change
 4387: dynamically as we process the pattern. */
 4388: 
 4389: #ifdef SUPPORT_UTF
 4390: /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
 4391: BOOL utf = (options & PCRE_UTF8) != 0;
 4392: #ifndef COMPILE_PCRE32
 4393: pcre_uchar utf_chars[6];
 4394: #endif
 4395: #else
 4396: BOOL utf = FALSE;
 4397: #endif
 4398: 
 4399: /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
 4400: class_uchardata always so that it can be passed to add_to_class() always,
 4401: though it will not be used in non-UTF 8-bit cases. This avoids having to supply
 4402: alternative calls for the different cases. */
 4403: 
 4404: pcre_uchar *class_uchardata;
 4405: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 4406: BOOL xclass;
 4407: pcre_uchar *class_uchardata_base;
 4408: #endif
 4409: 
 4410: #ifdef PCRE_DEBUG
 4411: if (lengthptr != NULL) DPRINTF((">> start branch\n"));
 4412: #endif
 4413: 
 4414: /* Set up the default and non-default settings for greediness */
 4415: 
 4416: greedy_default = ((options & PCRE_UNGREEDY) != 0);
 4417: greedy_non_default = greedy_default ^ 1;
 4418: 
 4419: /* Initialize no first byte, no required byte. REQ_UNSET means "no char
 4420: matching encountered yet". It gets changed to REQ_NONE if we hit something that
 4421: matches a non-fixed char first char; reqchar just remains unset if we never
 4422: find one.
 4423: 
 4424: When we hit a repeat whose minimum is zero, we may have to adjust these values
 4425: to take the zero repeat into account. This is implemented by setting them to
 4426: zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
 4427: item types that can be repeated set these backoff variables appropriately. */
 4428: 
 4429: firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
 4430: firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
 4431: 
 4432: /* The variable req_caseopt contains either the REQ_CASELESS value
 4433: or zero, according to the current setting of the caseless flag. The
 4434: REQ_CASELESS leaves the lower 28 bit empty. It is added into the
 4435: firstchar or reqchar variables to record the case status of the
 4436: value. This is used only for ASCII characters. */
 4437: 
 4438: req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
 4439: 
 4440: /* Switch on next character until the end of the branch */
 4441: 
 4442: for (;; ptr++)
 4443:   {
 4444:   BOOL negate_class;
 4445:   BOOL should_flip_negation;
 4446:   BOOL possessive_quantifier;
 4447:   BOOL is_quantifier;
 4448:   BOOL is_recurse;
 4449:   BOOL reset_bracount;
 4450:   int class_has_8bitchar;
 4451:   int class_one_char;
 4452:   int newoptions;
 4453:   int recno;
 4454:   int refsign;
 4455:   int skipbytes;
 4456:   pcre_uint32 subreqchar, subfirstchar;
 4457:   pcre_int32 subreqcharflags, subfirstcharflags;
 4458:   int terminator;
 4459:   unsigned int mclength;
 4460:   unsigned int tempbracount;
 4461:   pcre_uint32 ec;
 4462:   pcre_uchar mcbuffer[8];
 4463: 
 4464:   /* Get next character in the pattern */
 4465: 
 4466:   c = *ptr;
 4467: 
 4468:   /* If we are at the end of a nested substitution, revert to the outer level
 4469:   string. Nesting only happens one level deep. */
 4470: 
 4471:   if (c == CHAR_NULL && nestptr != NULL)
 4472:     {
 4473:     ptr = nestptr;
 4474:     nestptr = NULL;
 4475:     c = *ptr;
 4476:     }
 4477: 
 4478:   /* If we are in the pre-compile phase, accumulate the length used for the
 4479:   previous cycle of this loop. */
 4480: 
 4481:   if (lengthptr != NULL)
 4482:     {
 4483: #ifdef PCRE_DEBUG
 4484:     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
 4485: #endif
 4486:     if (code > cd->start_workspace + cd->workspace_size -
 4487:         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
 4488:       {
 4489:       *errorcodeptr = ERR52;
 4490:       goto FAILED;
 4491:       }
 4492: 
 4493:     /* There is at least one situation where code goes backwards: this is the
 4494:     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
 4495:     the class is simply eliminated. However, it is created first, so we have to
 4496:     allow memory for it. Therefore, don't ever reduce the length at this point.
 4497:     */
 4498: 
 4499:     if (code < last_code) code = last_code;
 4500: 
 4501:     /* Paranoid check for integer overflow */
 4502: 
 4503:     if (OFLOW_MAX - *lengthptr < code - last_code)
 4504:       {
 4505:       *errorcodeptr = ERR20;
 4506:       goto FAILED;
 4507:       }
 4508: 
 4509:     *lengthptr += (int)(code - last_code);
 4510:     DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
 4511:       (int)(code - last_code), c, c));
 4512: 
 4513:     /* If "previous" is set and it is not at the start of the work space, move
 4514:     it back to there, in order to avoid filling up the work space. Otherwise,
 4515:     if "previous" is NULL, reset the current code pointer to the start. */
 4516: 
 4517:     if (previous != NULL)
 4518:       {
 4519:       if (previous > orig_code)
 4520:         {
 4521:         memmove(orig_code, previous, IN_UCHARS(code - previous));
 4522:         code -= previous - orig_code;
 4523:         previous = orig_code;
 4524:         }
 4525:       }
 4526:     else code = orig_code;
 4527: 
 4528:     /* Remember where this code item starts so we can pick up the length
 4529:     next time round. */
 4530: 
 4531:     last_code = code;
 4532:     }
 4533: 
 4534:   /* In the real compile phase, just check the workspace used by the forward
 4535:   reference list. */
 4536: 
 4537:   else if (cd->hwm > cd->start_workspace + cd->workspace_size -
 4538:            WORK_SIZE_SAFETY_MARGIN)
 4539:     {
 4540:     *errorcodeptr = ERR52;
 4541:     goto FAILED;
 4542:     }
 4543: 
 4544:   /* If in \Q...\E, check for the end; if not, we have a literal */
 4545: 
 4546:   if (inescq && c != CHAR_NULL)
 4547:     {
 4548:     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
 4549:       {
 4550:       inescq = FALSE;
 4551:       ptr++;
 4552:       continue;
 4553:       }
 4554:     else
 4555:       {
 4556:       if (previous_callout != NULL)
 4557:         {
 4558:         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
 4559:           complete_callout(previous_callout, ptr, cd);
 4560:         previous_callout = NULL;
 4561:         }
 4562:       if ((options & PCRE_AUTO_CALLOUT) != 0)
 4563:         {
 4564:         previous_callout = code;
 4565:         code = auto_callout(code, ptr, cd);
 4566:         }
 4567:       goto NORMAL_CHAR;
 4568:       }
 4569:     /* Control does not reach here. */
 4570:     }
 4571: 
 4572:   /* In extended mode, skip white space and comments. We need a loop in order
 4573:   to check for more white space and more comments after a comment. */
 4574: 
 4575:   if ((options & PCRE_EXTENDED) != 0)
 4576:     {
 4577:     for (;;)
 4578:       {
 4579:       while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
 4580:       if (c != CHAR_NUMBER_SIGN) break;
 4581:       ptr++;
 4582:       while (*ptr != CHAR_NULL)
 4583:         {
 4584:         if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
 4585:           {                          /* IS_NEWLINE sets cd->nllen. */
 4586:           ptr += cd->nllen;
 4587:           break;
 4588:           }
 4589:         ptr++;
 4590: #ifdef SUPPORT_UTF
 4591:         if (utf) FORWARDCHAR(ptr);
 4592: #endif
 4593:         }
 4594:       c = *ptr;     /* Either NULL or the char after a newline */
 4595:       }
 4596:     }
 4597: 
 4598:   /* See if the next thing is a quantifier. */
 4599: 
 4600:   is_quantifier =
 4601:     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
 4602:     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
 4603: 
 4604:   /* Fill in length of a previous callout, except when the next thing is a
 4605:   quantifier or when processing a property substitution string in UCP mode. */
 4606: 
 4607:   if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
 4608:        after_manual_callout-- <= 0)
 4609:     {
 4610:     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
 4611:       complete_callout(previous_callout, ptr, cd);
 4612:     previous_callout = NULL;
 4613:     }
 4614: 
 4615:   /* Create auto callout, except for quantifiers, or while processing property
 4616:   strings that are substituted for \w etc in UCP mode. */
 4617: 
 4618:   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
 4619:     {
 4620:     previous_callout = code;
 4621:     code = auto_callout(code, ptr, cd);
 4622:     }
 4623: 
 4624:   /* Process the next pattern item. */
 4625: 
 4626:   switch(c)
 4627:     {
 4628:     /* ===================================================================*/
 4629:     case CHAR_NULL:                /* The branch terminates at string end */
 4630:     case CHAR_VERTICAL_LINE:       /* or | or ) */
 4631:     case CHAR_RIGHT_PARENTHESIS:
 4632:     *firstcharptr = firstchar;
 4633:     *firstcharflagsptr = firstcharflags;
 4634:     *reqcharptr = reqchar;
 4635:     *reqcharflagsptr = reqcharflags;
 4636:     *codeptr = code;
 4637:     *ptrptr = ptr;
 4638:     if (lengthptr != NULL)
 4639:       {
 4640:       if (OFLOW_MAX - *lengthptr < code - last_code)
 4641:         {
 4642:         *errorcodeptr = ERR20;
 4643:         goto FAILED;
 4644:         }
 4645:       *lengthptr += (int)(code - last_code);   /* To include callout length */
 4646:       DPRINTF((">> end branch\n"));
 4647:       }
 4648:     return TRUE;
 4649: 
 4650: 
 4651:     /* ===================================================================*/
 4652:     /* Handle single-character metacharacters. In multiline mode, ^ disables
 4653:     the setting of any following char as a first character. */
 4654: 
 4655:     case CHAR_CIRCUMFLEX_ACCENT:
 4656:     previous = NULL;
 4657:     if ((options & PCRE_MULTILINE) != 0)
 4658:       {
 4659:       if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
 4660:       *code++ = OP_CIRCM;
 4661:       }
 4662:     else *code++ = OP_CIRC;
 4663:     break;
 4664: 
 4665:     case CHAR_DOLLAR_SIGN:
 4666:     previous = NULL;
 4667:     *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
 4668:     break;
 4669: 
 4670:     /* There can never be a first char if '.' is first, whatever happens about
 4671:     repeats. The value of reqchar doesn't change either. */
 4672: 
 4673:     case CHAR_DOT:
 4674:     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
 4675:     zerofirstchar = firstchar;
 4676:     zerofirstcharflags = firstcharflags;
 4677:     zeroreqchar = reqchar;
 4678:     zeroreqcharflags = reqcharflags;
 4679:     previous = code;
 4680:     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
 4681:     break;
 4682: 
 4683: 
 4684:     /* ===================================================================*/
 4685:     /* Character classes. If the included characters are all < 256, we build a
 4686:     32-byte bitmap of the permitted characters, except in the special case
 4687:     where there is only one such character. For negated classes, we build the
 4688:     map as usual, then invert it at the end. However, we use a different opcode
 4689:     so that data characters > 255 can be handled correctly.
 4690: 
 4691:     If the class contains characters outside the 0-255 range, a different
 4692:     opcode is compiled. It may optionally have a bit map for characters < 256,
 4693:     but those above are explicitly listed afterwards. A flag byte tells
 4694:     whether the bitmap is present, and whether this is a negated class or not.
 4695: 
 4696:     In JavaScript compatibility mode, an isolated ']' causes an error. In
 4697:     default (Perl) mode, it is treated as a data character. */
 4698: 
 4699:     case CHAR_RIGHT_SQUARE_BRACKET:
 4700:     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
 4701:       {
 4702:       *errorcodeptr = ERR64;
 4703:       goto FAILED;
 4704:       }
 4705:     goto NORMAL_CHAR;
 4706: 
 4707:     /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
 4708:     used for "start of word" and "end of word". As these are otherwise illegal
 4709:     sequences, we don't break anything by recognizing them. They are replaced
 4710:     by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
 4711:     erroneous and are handled by the normal code below. */
 4712: 
 4713:     case CHAR_LEFT_SQUARE_BRACKET:
 4714:     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
 4715:       {
 4716:       nestptr = ptr + 7;
 4717:       ptr = sub_start_of_word - 1;
 4718:       continue;
 4719:       }
 4720: 
 4721:     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
 4722:       {
 4723:       nestptr = ptr + 7;
 4724:       ptr = sub_end_of_word - 1;
 4725:       continue;
 4726:       }
 4727: 
 4728:     /* Handle a real character class. */
 4729: 
 4730:     previous = code;
 4731: 
 4732:     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
 4733:     they are encountered at the top level, so we'll do that too. */
 4734: 
 4735:     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
 4736:          ptr[1] == CHAR_EQUALS_SIGN) &&
 4737:         check_posix_syntax(ptr, &tempptr))
 4738:       {
 4739:       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
 4740:       goto FAILED;
 4741:       }
 4742: 
 4743:     /* If the first character is '^', set the negation flag and skip it. Also,
 4744:     if the first few characters (either before or after ^) are \Q\E or \E we
 4745:     skip them too. This makes for compatibility with Perl. */
 4746: 
 4747:     negate_class = FALSE;
 4748:     for (;;)
 4749:       {
 4750:       c = *(++ptr);
 4751:       if (c == CHAR_BACKSLASH)
 4752:         {
 4753:         if (ptr[1] == CHAR_E)
 4754:           ptr++;
 4755:         else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
 4756:           ptr += 3;
 4757:         else
 4758:           break;
 4759:         }
 4760:       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
 4761:         negate_class = TRUE;
 4762:       else break;
 4763:       }
 4764: 
 4765:     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
 4766:     an initial ']' is taken as a data character -- the code below handles
 4767:     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
 4768:     [^] must match any character, so generate OP_ALLANY. */
 4769: 
 4770:     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
 4771:         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
 4772:       {
 4773:       *code++ = negate_class? OP_ALLANY : OP_FAIL;
 4774:       if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
 4775:       zerofirstchar = firstchar;
 4776:       zerofirstcharflags = firstcharflags;
 4777:       break;
 4778:       }
 4779: 
 4780:     /* If a class contains a negative special such as \S, we need to flip the
 4781:     negation flag at the end, so that support for characters > 255 works
 4782:     correctly (they are all included in the class). */
 4783: 
 4784:     should_flip_negation = FALSE;
 4785: 
 4786:     /* For optimization purposes, we track some properties of the class:
 4787:     class_has_8bitchar will be non-zero if the class contains at least one <
 4788:     256 character; class_one_char will be 1 if the class contains just one
 4789:     character. */
 4790: 
 4791:     class_has_8bitchar = 0;
 4792:     class_one_char = 0;
 4793: 
 4794:     /* Initialize the 32-char bit map to all zeros. We build the map in a
 4795:     temporary bit of memory, in case the class contains fewer than two
 4796:     8-bit characters because in that case the compiled code doesn't use the bit
 4797:     map. */
 4798: 
 4799:     memset(classbits, 0, 32 * sizeof(pcre_uint8));
 4800: 
 4801: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 4802:     xclass = FALSE;
 4803:     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
 4804:     class_uchardata_base = class_uchardata;   /* Save the start */
 4805: #endif
 4806: 
 4807:     /* Process characters until ] is reached. By writing this as a "do" it
 4808:     means that an initial ] is taken as a data character. At the start of the
 4809:     loop, c contains the first byte of the character. */
 4810: 
 4811:     if (c != CHAR_NULL) do
 4812:       {
 4813:       const pcre_uchar *oldptr;
 4814: 
 4815: #ifdef SUPPORT_UTF
 4816:       if (utf && HAS_EXTRALEN(c))
 4817:         {                           /* Braces are required because the */
 4818:         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
 4819:         }
 4820: #endif
 4821: 
 4822: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 4823:       /* In the pre-compile phase, accumulate the length of any extra
 4824:       data and reset the pointer. This is so that very large classes that
 4825:       contain a zillion > 255 characters no longer overwrite the work space
 4826:       (which is on the stack). We have to remember that there was XCLASS data,
 4827:       however. */
 4828: 
 4829:       if (lengthptr != NULL && class_uchardata > class_uchardata_base)
 4830:         {
 4831:         xclass = TRUE;
 4832:         *lengthptr += class_uchardata - class_uchardata_base;
 4833:         class_uchardata = class_uchardata_base;
 4834:         }
 4835: #endif
 4836: 
 4837:       /* Inside \Q...\E everything is literal except \E */
 4838: 
 4839:       if (inescq)
 4840:         {
 4841:         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
 4842:           {
 4843:           inescq = FALSE;                   /* Reset literal state */
 4844:           ptr++;                            /* Skip the 'E' */
 4845:           continue;                         /* Carry on with next */
 4846:           }
 4847:         goto CHECK_RANGE;                   /* Could be range if \E follows */
 4848:         }
 4849: 
 4850:       /* Handle POSIX class names. Perl allows a negation extension of the
 4851:       form [:^name:]. A square bracket that doesn't match the syntax is
 4852:       treated as a literal. We also recognize the POSIX constructions
 4853:       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
 4854:       5.6 and 5.8 do. */
 4855: 
 4856:       if (c == CHAR_LEFT_SQUARE_BRACKET &&
 4857:           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
 4858:            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
 4859:         {
 4860:         BOOL local_negate = FALSE;
 4861:         int posix_class, taboffset, tabopt;
 4862:         register const pcre_uint8 *cbits = cd->cbits;
 4863:         pcre_uint8 pbits[32];
 4864: 
 4865:         if (ptr[1] != CHAR_COLON)
 4866:           {
 4867:           *errorcodeptr = ERR31;
 4868:           goto FAILED;
 4869:           }
 4870: 
 4871:         ptr += 2;
 4872:         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
 4873:           {
 4874:           local_negate = TRUE;
 4875:           should_flip_negation = TRUE;  /* Note negative special */
 4876:           ptr++;
 4877:           }
 4878: 
 4879:         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
 4880:         if (posix_class < 0)
 4881:           {
 4882:           *errorcodeptr = ERR30;
 4883:           goto FAILED;
 4884:           }
 4885: 
 4886:         /* If matching is caseless, upper and lower are converted to
 4887:         alpha. This relies on the fact that the class table starts with
 4888:         alpha, lower, upper as the first 3 entries. */
 4889: 
 4890:         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
 4891:           posix_class = 0;
 4892: 
 4893:         /* When PCRE_UCP is set, some of the POSIX classes are converted to
 4894:         different escape sequences that use Unicode properties \p or \P. Others
 4895:         that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
 4896:         directly. */
 4897: 
 4898: #ifdef SUPPORT_UCP
 4899:         if ((options & PCRE_UCP) != 0)
 4900:           {
 4901:           unsigned int ptype = 0;
 4902:           int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
 4903: 
 4904:           /* The posix_substitutes table specifies which POSIX classes can be
 4905:           converted to \p or \P items. */
 4906: 
 4907:           if (posix_substitutes[pc] != NULL)
 4908:             {
 4909:             nestptr = tempptr + 1;
 4910:             ptr = posix_substitutes[pc] - 1;
 4911:             continue;
 4912:             }
 4913: 
 4914:           /* There are three other classes that generate special property calls
 4915:           that are recognized only in an XCLASS. */
 4916: 
 4917:           else switch(posix_class)
 4918:             {
 4919:             case PC_GRAPH:
 4920:             ptype = PT_PXGRAPH;
 4921:             /* Fall through */
 4922:             case PC_PRINT:
 4923:             if (ptype == 0) ptype = PT_PXPRINT;
 4924:             /* Fall through */
 4925:             case PC_PUNCT:
 4926:             if (ptype == 0) ptype = PT_PXPUNCT;
 4927:             *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
 4928:             *class_uchardata++ = ptype;
 4929:             *class_uchardata++ = 0;
 4930:             ptr = tempptr + 1;
 4931:             continue;
 4932: 
 4933:             /* For all other POSIX classes, no special action is taken in UCP
 4934:             mode. Fall through to the non_UCP case. */
 4935: 
 4936:             default:
 4937:             break;
 4938:             }
 4939:           }
 4940: #endif
 4941:         /* In the non-UCP case, or when UCP makes no difference, we build the
 4942:         bit map for the POSIX class in a chunk of local store because we may be
 4943:         adding and subtracting from it, and we don't want to subtract bits that
 4944:         may be in the main map already. At the end we or the result into the
 4945:         bit map that is being built. */
 4946: 
 4947:         posix_class *= 3;
 4948: 
 4949:         /* Copy in the first table (always present) */
 4950: 
 4951:         memcpy(pbits, cbits + posix_class_maps[posix_class],
 4952:           32 * sizeof(pcre_uint8));
 4953: 
 4954:         /* If there is a second table, add or remove it as required. */
 4955: 
 4956:         taboffset = posix_class_maps[posix_class + 1];
 4957:         tabopt = posix_class_maps[posix_class + 2];
 4958: 
 4959:         if (taboffset >= 0)
 4960:           {
 4961:           if (tabopt >= 0)
 4962:             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
 4963:           else
 4964:             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
 4965:           }
 4966: 
 4967:         /* Now see if we need to remove any special characters. An option
 4968:         value of 1 removes vertical space and 2 removes underscore. */
 4969: 
 4970:         if (tabopt < 0) tabopt = -tabopt;
 4971:         if (tabopt == 1) pbits[1] &= ~0x3c;
 4972:           else if (tabopt == 2) pbits[11] &= 0x7f;
 4973: 
 4974:         /* Add the POSIX table or its complement into the main table that is
 4975:         being built and we are done. */
 4976: 
 4977:         if (local_negate)
 4978:           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
 4979:         else
 4980:           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
 4981: 
 4982:         ptr = tempptr + 1;
 4983:         /* Every class contains at least one < 256 character. */
 4984:         class_has_8bitchar = 1;
 4985:         /* Every class contains at least two characters. */
 4986:         class_one_char = 2;
 4987:         continue;    /* End of POSIX syntax handling */
 4988:         }
 4989: 
 4990:       /* Backslash may introduce a single character, or it may introduce one
 4991:       of the specials, which just set a flag. The sequence \b is a special
 4992:       case. Inside a class (and only there) it is treated as backspace. We
 4993:       assume that other escapes have more than one character in them, so
 4994:       speculatively set both class_has_8bitchar and class_one_char bigger
 4995:       than one. Unrecognized escapes fall through and are either treated
 4996:       as literal characters (by default), or are faulted if
 4997:       PCRE_EXTRA is set. */
 4998: 
 4999:       if (c == CHAR_BACKSLASH)
 5000:         {
 5001:         escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
 5002:           TRUE);
 5003:         if (*errorcodeptr != 0) goto FAILED;
 5004:         if (escape == 0) c = ec;
 5005:         else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
 5006:         else if (escape == ESC_N)          /* \N is not supported in a class */
 5007:           {
 5008:           *errorcodeptr = ERR71;
 5009:           goto FAILED;
 5010:           }
 5011:         else if (escape == ESC_Q)            /* Handle start of quoted string */
 5012:           {
 5013:           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
 5014:             {
 5015:             ptr += 2; /* avoid empty string */
 5016:             }
 5017:           else inescq = TRUE;
 5018:           continue;
 5019:           }
 5020:         else if (escape == ESC_E) continue;  /* Ignore orphan \E */
 5021: 
 5022:         else
 5023:           {
 5024:           register const pcre_uint8 *cbits = cd->cbits;
 5025:           /* Every class contains at least two < 256 characters. */
 5026:           class_has_8bitchar++;
 5027:           /* Every class contains at least two characters. */
 5028:           class_one_char += 2;
 5029: 
 5030:           switch (escape)
 5031:             {
 5032: #ifdef SUPPORT_UCP
 5033:             case ESC_du:     /* These are the values given for \d etc */
 5034:             case ESC_DU:     /* when PCRE_UCP is set. We replace the */
 5035:             case ESC_wu:     /* escape sequence with an appropriate \p */
 5036:             case ESC_WU:     /* or \P to test Unicode properties instead */
 5037:             case ESC_su:     /* of the default ASCII testing. */
 5038:             case ESC_SU:
 5039:             nestptr = ptr;
 5040:             ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
 5041:             class_has_8bitchar--;                /* Undo! */
 5042:             continue;
 5043: #endif
 5044:             case ESC_d:
 5045:             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
 5046:             continue;
 5047: 
 5048:             case ESC_D:
 5049:             should_flip_negation = TRUE;
 5050:             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
 5051:             continue;
 5052: 
 5053:             case ESC_w:
 5054:             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
 5055:             continue;
 5056: 
 5057:             case ESC_W:
 5058:             should_flip_negation = TRUE;
 5059:             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
 5060:             continue;
 5061: 
 5062:             /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
 5063:             5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
 5064:             previously set by something earlier in the character class.
 5065:             Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
 5066:             we could just adjust the appropriate bit. From PCRE 8.34 we no
 5067:             longer treat \s and \S specially. */
 5068: 
 5069:             case ESC_s:
 5070:             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
 5071:             continue;
 5072: 
 5073:             case ESC_S:
 5074:             should_flip_negation = TRUE;
 5075:             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
 5076:             continue;
 5077: 
 5078:             /* The rest apply in both UCP and non-UCP cases. */
 5079: 
 5080:             case ESC_h:
 5081:             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
 5082:               PRIV(hspace_list), NOTACHAR);
 5083:             continue;
 5084: 
 5085:             case ESC_H:
 5086:             (void)add_not_list_to_class(classbits, &class_uchardata, options,
 5087:               cd, PRIV(hspace_list));
 5088:             continue;
 5089: 
 5090:             case ESC_v:
 5091:             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
 5092:               PRIV(vspace_list), NOTACHAR);
 5093:             continue;
 5094: 
 5095:             case ESC_V:
 5096:             (void)add_not_list_to_class(classbits, &class_uchardata, options,
 5097:               cd, PRIV(vspace_list));
 5098:             continue;
 5099: 
 5100: #ifdef SUPPORT_UCP
 5101:             case ESC_p:
 5102:             case ESC_P:
 5103:               {
 5104:               BOOL negated;
 5105:               unsigned int ptype = 0, pdata = 0;
 5106:               if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
 5107:                 goto FAILED;
 5108:               *class_uchardata++ = ((escape == ESC_p) != negated)?
 5109:                 XCL_PROP : XCL_NOTPROP;
 5110:               *class_uchardata++ = ptype;
 5111:               *class_uchardata++ = pdata;
 5112:               class_has_8bitchar--;                /* Undo! */
 5113:               continue;
 5114:               }
 5115: #endif
 5116:             /* Unrecognized escapes are faulted if PCRE is running in its
 5117:             strict mode. By default, for compatibility with Perl, they are
 5118:             treated as literals. */
 5119: 
 5120:             default:
 5121:             if ((options & PCRE_EXTRA) != 0)
 5122:               {
 5123:               *errorcodeptr = ERR7;
 5124:               goto FAILED;
 5125:               }
 5126:             class_has_8bitchar--;    /* Undo the speculative increase. */
 5127:             class_one_char -= 2;     /* Undo the speculative increase. */
 5128:             c = *ptr;                /* Get the final character and fall through */
 5129:             break;
 5130:             }
 5131:           }
 5132: 
 5133:         /* Fall through if the escape just defined a single character (c >= 0).
 5134:         This may be greater than 256. */
 5135: 
 5136:         escape = 0;
 5137: 
 5138:         }   /* End of backslash handling */
 5139: 
 5140:       /* A character may be followed by '-' to form a range. However, Perl does
 5141:       not permit ']' to be the end of the range. A '-' character at the end is
 5142:       treated as a literal. Perl ignores orphaned \E sequences entirely. The
 5143:       code for handling \Q and \E is messy. */
 5144: 
 5145:       CHECK_RANGE:
 5146:       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
 5147:         {
 5148:         inescq = FALSE;
 5149:         ptr += 2;
 5150:         }
 5151:       oldptr = ptr;
 5152: 
 5153:       /* Remember if \r or \n were explicitly used */
 5154: 
 5155:       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
 5156: 
 5157:       /* Check for range */
 5158: 
 5159:       if (!inescq && ptr[1] == CHAR_MINUS)
 5160:         {
 5161:         pcre_uint32 d;
 5162:         ptr += 2;
 5163:         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
 5164: 
 5165:         /* If we hit \Q (not followed by \E) at this point, go into escaped
 5166:         mode. */
 5167: 
 5168:         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
 5169:           {
 5170:           ptr += 2;
 5171:           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
 5172:             { ptr += 2; continue; }
 5173:           inescq = TRUE;
 5174:           break;
 5175:           }
 5176: 
 5177:         /* Minus (hyphen) at the end of a class is treated as a literal, so put
 5178:         back the pointer and jump to handle the character that preceded it. */
 5179: 
 5180:         if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
 5181:           {
 5182:           ptr = oldptr;
 5183:           goto CLASS_SINGLE_CHARACTER;
 5184:           }
 5185: 
 5186:         /* Otherwise, we have a potential range; pick up the next character */
 5187: 
 5188: #ifdef SUPPORT_UTF
 5189:         if (utf)
 5190:           {                           /* Braces are required because the */
 5191:           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
 5192:           }
 5193:         else
 5194: #endif
 5195:         d = *ptr;  /* Not UTF-8 mode */
 5196: 
 5197:         /* The second part of a range can be a single-character escape
 5198:         sequence, but not any of the other escapes. Perl treats a hyphen as a
 5199:         literal in such circumstances. However, in Perl's warning mode, a
 5200:         warning is given, so PCRE now faults it as it is almost certainly a
 5201:         mistake on the user's part. */
 5202: 
 5203:         if (!inescq)
 5204:           {
 5205:           if (d == CHAR_BACKSLASH)
 5206:             {
 5207:             int descape;
 5208:             descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
 5209:             if (*errorcodeptr != 0) goto FAILED;
 5210: 
 5211:             /* 0 means a character was put into d; \b is backspace; any other
 5212:             special causes an error. */
 5213: 
 5214:             if (descape != 0)
 5215:               {
 5216:               if (descape == ESC_b) d = CHAR_BS; else
 5217:                 {
 5218:                 *errorcodeptr = ERR83;
 5219:                 goto FAILED;
 5220:                 }
 5221:               }
 5222:             }
 5223: 
 5224:           /* A hyphen followed by a POSIX class is treated in the same way. */
 5225: 
 5226:           else if (d == CHAR_LEFT_SQUARE_BRACKET &&
 5227:                    (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
 5228:                     ptr[1] == CHAR_EQUALS_SIGN) &&
 5229:                    check_posix_syntax(ptr, &tempptr))
 5230:             {
 5231:             *errorcodeptr = ERR83;
 5232:             goto FAILED;
 5233:             }
 5234:           }
 5235: 
 5236:         /* Check that the two values are in the correct order. Optimize
 5237:         one-character ranges. */
 5238: 
 5239:         if (d < c)
 5240:           {
 5241:           *errorcodeptr = ERR8;
 5242:           goto FAILED;
 5243:           }
 5244:         if (d == c) goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
 5245: 
 5246:         /* We have found a character range, so single character optimizations
 5247:         cannot be done anymore. Any value greater than 1 indicates that there
 5248:         is more than one character. */
 5249: 
 5250:         class_one_char = 2;
 5251: 
 5252:         /* Remember an explicit \r or \n, and add the range to the class. */
 5253: 
 5254:         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
 5255: 
 5256:         class_has_8bitchar +=
 5257:           add_to_class(classbits, &class_uchardata, options, cd, c, d);
 5258: 
 5259:         continue;   /* Go get the next char in the class */
 5260:         }
 5261: 
 5262:       /* Handle a single character - we can get here for a normal non-escape
 5263:       char, or after \ that introduces a single character or for an apparent
 5264:       range that isn't. Only the value 1 matters for class_one_char, so don't
 5265:       increase it if it is already 2 or more ... just in case there's a class
 5266:       with a zillion characters in it. */
 5267: 
 5268:       CLASS_SINGLE_CHARACTER:
 5269:       if (class_one_char < 2) class_one_char++;
 5270: 
 5271:       /* If class_one_char is 1, we have the first single character in the
 5272:       class, and there have been no prior ranges, or XCLASS items generated by
 5273:       escapes. If this is the final character in the class, we can optimize by
 5274:       turning the item into a 1-character OP_CHAR[I] if it's positive, or
 5275:       OP_NOT[I] if it's negative. In the positive case, it can cause firstchar
 5276:       to be set. Otherwise, there can be no first char if this item is first,
 5277:       whatever repeat count may follow. In the case of reqchar, save the
 5278:       previous value for reinstating. */
 5279: 
 5280:       if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
 5281:         {
 5282:         ptr++;
 5283:         zeroreqchar = reqchar;
 5284:         zeroreqcharflags = reqcharflags;
 5285: 
 5286:         if (negate_class)
 5287:           {
 5288: #ifdef SUPPORT_UCP
 5289:           int d;
 5290: #endif
 5291:           if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
 5292:           zerofirstchar = firstchar;
 5293:           zerofirstcharflags = firstcharflags;
 5294: 
 5295:           /* For caseless UTF-8 mode when UCP support is available, check
 5296:           whether this character has more than one other case. If so, generate
 5297:           a special OP_NOTPROP item instead of OP_NOTI. */
 5298: 
 5299: #ifdef SUPPORT_UCP
 5300:           if (utf && (options & PCRE_CASELESS) != 0 &&
 5301:               (d = UCD_CASESET(c)) != 0)
 5302:             {
 5303:             *code++ = OP_NOTPROP;
 5304:             *code++ = PT_CLIST;
 5305:             *code++ = d;
 5306:             }
 5307:           else
 5308: #endif
 5309:           /* Char has only one other case, or UCP not available */
 5310: 
 5311:             {
 5312:             *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
 5313: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
 5314:             if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
 5315:               code += PRIV(ord2utf)(c, code);
 5316:             else
 5317: #endif
 5318:               *code++ = c;
 5319:             }
 5320: 
 5321:           /* We are finished with this character class */
 5322: 
 5323:           goto END_CLASS;
 5324:           }
 5325: 
 5326:         /* For a single, positive character, get the value into mcbuffer, and
 5327:         then we can handle this with the normal one-character code. */
 5328: 
 5329: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
 5330:         if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
 5331:           mclength = PRIV(ord2utf)(c, mcbuffer);
 5332:         else
 5333: #endif
 5334:           {
 5335:           mcbuffer[0] = c;
 5336:           mclength = 1;
 5337:           }
 5338:         goto ONE_CHAR;
 5339:         }       /* End of 1-char optimization */
 5340: 
 5341:       /* There is more than one character in the class, or an XCLASS item
 5342:       has been generated. Add this character to the class. */
 5343: 
 5344:       class_has_8bitchar +=
 5345:         add_to_class(classbits, &class_uchardata, options, cd, c, c);
 5346:       }
 5347: 
 5348:     /* Loop until ']' reached. This "while" is the end of the "do" far above.
 5349:     If we are at the end of an internal nested string, revert to the outer
 5350:     string. */
 5351: 
 5352:     while (((c = *(++ptr)) != CHAR_NULL ||
 5353:            (nestptr != NULL &&
 5354:              (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
 5355:            (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
 5356: 
 5357:     /* Check for missing terminating ']' */
 5358: 
 5359:     if (c == CHAR_NULL)
 5360:       {
 5361:       *errorcodeptr = ERR6;
 5362:       goto FAILED;
 5363:       }
 5364: 
 5365:     /* We will need an XCLASS if data has been placed in class_uchardata. In
 5366:     the second phase this is a sufficient test. However, in the pre-compile
 5367:     phase, class_uchardata gets emptied to prevent workspace overflow, so
 5368:     only if the very last character in the class needs XCLASS will it contain
 5369:     anything at this point. For this reason, xclass gets set TRUE above when
 5370:     class_uchardata is emptied, and that's why this code is the way it is here
 5371:     instead of just doing a test on class_uchardata below. */
 5372: 
 5373: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 5374:     if (class_uchardata > class_uchardata_base) xclass = TRUE;
 5375: #endif
 5376: 
 5377:     /* If this is the first thing in the branch, there can be no first char
 5378:     setting, whatever the repeat count. Any reqchar setting must remain
 5379:     unchanged after any kind of repeat. */
 5380: 
 5381:     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
 5382:     zerofirstchar = firstchar;
 5383:     zerofirstcharflags = firstcharflags;
 5384:     zeroreqchar = reqchar;
 5385:     zeroreqcharflags = reqcharflags;
 5386: 
 5387:     /* If there are characters with values > 255, we have to compile an
 5388:     extended class, with its own opcode, unless there was a negated special
 5389:     such as \S in the class, and PCRE_UCP is not set, because in that case all
 5390:     characters > 255 are in the class, so any that were explicitly given as
 5391:     well can be ignored. If (when there are explicit characters > 255 that must
 5392:     be listed) there are no characters < 256, we can omit the bitmap in the
 5393:     actual compiled code. */
 5394: 
 5395: #ifdef SUPPORT_UTF
 5396:     if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
 5397: #elif !defined COMPILE_PCRE8
 5398:     if (xclass && !should_flip_negation)
 5399: #endif
 5400: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 5401:       {
 5402:       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
 5403:       *code++ = OP_XCLASS;
 5404:       code += LINK_SIZE;
 5405:       *code = negate_class? XCL_NOT:0;
 5406: 
 5407:       /* If the map is required, move up the extra data to make room for it;
 5408:       otherwise just move the code pointer to the end of the extra data. */
 5409: 
 5410:       if (class_has_8bitchar > 0)
 5411:         {
 5412:         *code++ |= XCL_MAP;
 5413:         memmove(code + (32 / sizeof(pcre_uchar)), code,
 5414:           IN_UCHARS(class_uchardata - code));
 5415:         memcpy(code, classbits, 32);
 5416:         code = class_uchardata + (32 / sizeof(pcre_uchar));
 5417:         }
 5418:       else code = class_uchardata;
 5419: 
 5420:       /* Now fill in the complete length of the item */
 5421: 
 5422:       PUT(previous, 1, (int)(code - previous));
 5423:       break;   /* End of class handling */
 5424:       }
 5425: #endif
 5426: 
 5427:     /* If there are no characters > 255, or they are all to be included or
 5428:     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
 5429:     whole class was negated and whether there were negative specials such as \S
 5430:     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
 5431:     negating it if necessary. */
 5432: 
 5433:     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
 5434:     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
 5435:       {
 5436:       if (negate_class)
 5437:         for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
 5438:       memcpy(code, classbits, 32);
 5439:       }
 5440:     code += 32 / sizeof(pcre_uchar);
 5441: 
 5442:     END_CLASS:
 5443:     break;
 5444: 
 5445: 
 5446:     /* ===================================================================*/
 5447:     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
 5448:     has been tested above. */
 5449: 
 5450:     case CHAR_LEFT_CURLY_BRACKET:
 5451:     if (!is_quantifier) goto NORMAL_CHAR;
 5452:     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
 5453:     if (*errorcodeptr != 0) goto FAILED;
 5454:     goto REPEAT;
 5455: 
 5456:     case CHAR_ASTERISK:
 5457:     repeat_min = 0;
 5458:     repeat_max = -1;
 5459:     goto REPEAT;
 5460: 
 5461:     case CHAR_PLUS:
 5462:     repeat_min = 1;
 5463:     repeat_max = -1;
 5464:     goto REPEAT;
 5465: 
 5466:     case CHAR_QUESTION_MARK:
 5467:     repeat_min = 0;
 5468:     repeat_max = 1;
 5469: 
 5470:     REPEAT:
 5471:     if (previous == NULL)
 5472:       {
 5473:       *errorcodeptr = ERR9;
 5474:       goto FAILED;
 5475:       }
 5476: 
 5477:     if (repeat_min == 0)
 5478:       {
 5479:       firstchar = zerofirstchar;    /* Adjust for zero repeat */
 5480:       firstcharflags = zerofirstcharflags;
 5481:       reqchar = zeroreqchar;        /* Ditto */
 5482:       reqcharflags = zeroreqcharflags;
 5483:       }
 5484: 
 5485:     /* Remember whether this is a variable length repeat */
 5486: 
 5487:     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
 5488: 
 5489:     op_type = 0;                    /* Default single-char op codes */
 5490:     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
 5491: 
 5492:     /* Save start of previous item, in case we have to move it up in order to
 5493:     insert something before it. */
 5494: 
 5495:     tempcode = previous;
 5496: 
 5497:     /* Before checking for a possessive quantifier, we must skip over
 5498:     whitespace and comments in extended mode because Perl allows white space at
 5499:     this point. */
 5500: 
 5501:     if ((options & PCRE_EXTENDED) != 0)
 5502:       {
 5503:       const pcre_uchar *p = ptr + 1;
 5504:       for (;;)
 5505:         {
 5506:         while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++;
 5507:         if (*p != CHAR_NUMBER_SIGN) break;
 5508:         p++;
 5509:         while (*p != CHAR_NULL)
 5510:           {
 5511:           if (IS_NEWLINE(p))         /* For non-fixed-length newline cases, */
 5512:             {                        /* IS_NEWLINE sets cd->nllen. */
 5513:             p += cd->nllen;
 5514:             break;
 5515:             }
 5516:           p++;
 5517: #ifdef SUPPORT_UTF
 5518:           if (utf) FORWARDCHAR(p);
 5519: #endif
 5520:           }           /* Loop for comment characters */
 5521:         }             /* Loop for multiple comments */
 5522:       ptr = p - 1;    /* Character before the next significant one. */
 5523:       }
 5524: 
 5525:     /* If the next character is '+', we have a possessive quantifier. This
 5526:     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
 5527:     If the next character is '?' this is a minimizing repeat, by default,
 5528:     but if PCRE_UNGREEDY is set, it works the other way round. We change the
 5529:     repeat type to the non-default. */
 5530: 
 5531:     if (ptr[1] == CHAR_PLUS)
 5532:       {
 5533:       repeat_type = 0;                  /* Force greedy */
 5534:       possessive_quantifier = TRUE;
 5535:       ptr++;
 5536:       }
 5537:     else if (ptr[1] == CHAR_QUESTION_MARK)
 5538:       {
 5539:       repeat_type = greedy_non_default;
 5540:       ptr++;
 5541:       }
 5542:     else repeat_type = greedy_default;
 5543: 
 5544:     /* If previous was a recursion call, wrap it in atomic brackets so that
 5545:     previous becomes the atomic group. All recursions were so wrapped in the
 5546:     past, but it no longer happens for non-repeated recursions. In fact, the
 5547:     repeated ones could be re-implemented independently so as not to need this,
 5548:     but for the moment we rely on the code for repeating groups. */
 5549: 
 5550:     if (*previous == OP_RECURSE)
 5551:       {
 5552:       memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
 5553:       *previous = OP_ONCE;
 5554:       PUT(previous, 1, 2 + 2*LINK_SIZE);
 5555:       previous[2 + 2*LINK_SIZE] = OP_KET;
 5556:       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
 5557:       code += 2 + 2 * LINK_SIZE;
 5558:       length_prevgroup = 3 + 3*LINK_SIZE;
 5559: 
 5560:       /* When actually compiling, we need to check whether this was a forward
 5561:       reference, and if so, adjust the offset. */
 5562: 
 5563:       if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
 5564:         {
 5565:         int offset = GET(cd->hwm, -LINK_SIZE);
 5566:         if (offset == previous + 1 - cd->start_code)
 5567:           PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
 5568:         }
 5569:       }
 5570: 
 5571:     /* Now handle repetition for the different types of item. */
 5572: 
 5573:     /* If previous was a character or negated character match, abolish the item
 5574:     and generate a repeat item instead. If a char item has a minimum of more
 5575:     than one, ensure that it is set in reqchar - it might not be if a sequence
 5576:     such as x{3} is the first thing in a branch because the x will have gone
 5577:     into firstchar instead.  */
 5578: 
 5579:     if (*previous == OP_CHAR || *previous == OP_CHARI
 5580:         || *previous == OP_NOT || *previous == OP_NOTI)
 5581:       {
 5582:       switch (*previous)
 5583:         {
 5584:         default: /* Make compiler happy. */
 5585:         case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
 5586:         case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
 5587:         case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
 5588:         case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
 5589:         }
 5590: 
 5591:       /* Deal with UTF characters that take up more than one character. It's
 5592:       easier to write this out separately than try to macrify it. Use c to
 5593:       hold the length of the character in bytes, plus UTF_LENGTH to flag that
 5594:       it's a length rather than a small character. */
 5595: 
 5596: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
 5597:       if (utf && NOT_FIRSTCHAR(code[-1]))
 5598:         {
 5599:         pcre_uchar *lastchar = code - 1;
 5600:         BACKCHAR(lastchar);
 5601:         c = (int)(code - lastchar);     /* Length of UTF-8 character */
 5602:         memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
 5603:         c |= UTF_LENGTH;                /* Flag c as a length */
 5604:         }
 5605:       else
 5606: #endif /* SUPPORT_UTF */
 5607: 
 5608:       /* Handle the case of a single character - either with no UTF support, or
 5609:       with UTF disabled, or for a single character UTF character. */
 5610:         {
 5611:         c = code[-1];
 5612:         if (*previous <= OP_CHARI && repeat_min > 1)
 5613:           {
 5614:           reqchar = c;
 5615:           reqcharflags = req_caseopt | cd->req_varyopt;
 5616:           }
 5617:         }
 5618: 
 5619:       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
 5620:       }
 5621: 
 5622:     /* If previous was a character type match (\d or similar), abolish it and
 5623:     create a suitable repeat item. The code is shared with single-character
 5624:     repeats by setting op_type to add a suitable offset into repeat_type. Note
 5625:     that the Unicode property types will be present only when SUPPORT_UCP is
 5626:     defined, but we don't wrap the little bits of code here because it just
 5627:     makes it horribly messy. */
 5628: 
 5629:     else if (*previous < OP_EODN)
 5630:       {
 5631:       pcre_uchar *oldcode;
 5632:       int prop_type, prop_value;
 5633:       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
 5634:       c = *previous;
 5635: 
 5636:       OUTPUT_SINGLE_REPEAT:
 5637:       if (*previous == OP_PROP || *previous == OP_NOTPROP)
 5638:         {
 5639:         prop_type = previous[1];
 5640:         prop_value = previous[2];
 5641:         }
 5642:       else prop_type = prop_value = -1;
 5643: 
 5644:       oldcode = code;
 5645:       code = previous;                  /* Usually overwrite previous item */
 5646: 
 5647:       /* If the maximum is zero then the minimum must also be zero; Perl allows
 5648:       this case, so we do too - by simply omitting the item altogether. */
 5649: 
 5650:       if (repeat_max == 0) goto END_REPEAT;
 5651: 
 5652:       /* Combine the op_type with the repeat_type */
 5653: 
 5654:       repeat_type += op_type;
 5655: 
 5656:       /* A minimum of zero is handled either as the special case * or ?, or as
 5657:       an UPTO, with the maximum given. */
 5658: 
 5659:       if (repeat_min == 0)
 5660:         {
 5661:         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
 5662:           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
 5663:         else
 5664:           {
 5665:           *code++ = OP_UPTO + repeat_type;
 5666:           PUT2INC(code, 0, repeat_max);
 5667:           }
 5668:         }
 5669: 
 5670:       /* A repeat minimum of 1 is optimized into some special cases. If the
 5671:       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
 5672:       left in place and, if the maximum is greater than 1, we use OP_UPTO with
 5673:       one less than the maximum. */
 5674: 
 5675:       else if (repeat_min == 1)
 5676:         {
 5677:         if (repeat_max == -1)
 5678:           *code++ = OP_PLUS + repeat_type;
 5679:         else
 5680:           {
 5681:           code = oldcode;                 /* leave previous item in place */
 5682:           if (repeat_max == 1) goto END_REPEAT;
 5683:           *code++ = OP_UPTO + repeat_type;
 5684:           PUT2INC(code, 0, repeat_max - 1);
 5685:           }
 5686:         }
 5687: 
 5688:       /* The case {n,n} is just an EXACT, while the general case {n,m} is
 5689:       handled as an EXACT followed by an UPTO. */
 5690: 
 5691:       else
 5692:         {
 5693:         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
 5694:         PUT2INC(code, 0, repeat_min);
 5695: 
 5696:         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
 5697:         we have to insert the character for the previous code. For a repeated
 5698:         Unicode property match, there are two extra bytes that define the
 5699:         required property. In UTF-8 mode, long characters have their length in
 5700:         c, with the UTF_LENGTH bit as a flag. */
 5701: 
 5702:         if (repeat_max < 0)
 5703:           {
 5704: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
 5705:           if (utf && (c & UTF_LENGTH) != 0)
 5706:             {
 5707:             memcpy(code, utf_chars, IN_UCHARS(c & 7));
 5708:             code += c & 7;
 5709:             }
 5710:           else
 5711: #endif
 5712:             {
 5713:             *code++ = c;
 5714:             if (prop_type >= 0)
 5715:               {
 5716:               *code++ = prop_type;
 5717:               *code++ = prop_value;
 5718:               }
 5719:             }
 5720:           *code++ = OP_STAR + repeat_type;
 5721:           }
 5722: 
 5723:         /* Else insert an UPTO if the max is greater than the min, again
 5724:         preceded by the character, for the previously inserted code. If the
 5725:         UPTO is just for 1 instance, we can use QUERY instead. */
 5726: 
 5727:         else if (repeat_max != repeat_min)
 5728:           {
 5729: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
 5730:           if (utf && (c & UTF_LENGTH) != 0)
 5731:             {
 5732:             memcpy(code, utf_chars, IN_UCHARS(c & 7));
 5733:             code += c & 7;
 5734:             }
 5735:           else
 5736: #endif
 5737:           *code++ = c;
 5738:           if (prop_type >= 0)
 5739:             {
 5740:             *code++ = prop_type;
 5741:             *code++ = prop_value;
 5742:             }
 5743:           repeat_max -= repeat_min;
 5744: 
 5745:           if (repeat_max == 1)
 5746:             {
 5747:             *code++ = OP_QUERY + repeat_type;
 5748:             }
 5749:           else
 5750:             {
 5751:             *code++ = OP_UPTO + repeat_type;
 5752:             PUT2INC(code, 0, repeat_max);
 5753:             }
 5754:           }
 5755:         }
 5756: 
 5757:       /* The character or character type itself comes last in all cases. */
 5758: 
 5759: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
 5760:       if (utf && (c & UTF_LENGTH) != 0)
 5761:         {
 5762:         memcpy(code, utf_chars, IN_UCHARS(c & 7));
 5763:         code += c & 7;
 5764:         }
 5765:       else
 5766: #endif
 5767:       *code++ = c;
 5768: 
 5769:       /* For a repeated Unicode property match, there are two extra bytes that
 5770:       define the required property. */
 5771: 
 5772: #ifdef SUPPORT_UCP
 5773:       if (prop_type >= 0)
 5774:         {
 5775:         *code++ = prop_type;
 5776:         *code++ = prop_value;
 5777:         }
 5778: #endif
 5779:       }
 5780: 
 5781:     /* If previous was a character class or a back reference, we put the repeat
 5782:     stuff after it, but just skip the item if the repeat was {0,0}. */
 5783: 
 5784:     else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
 5785: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 5786:              *previous == OP_XCLASS ||
 5787: #endif
 5788:              *previous == OP_REF   || *previous == OP_REFI ||
 5789:              *previous == OP_DNREF || *previous == OP_DNREFI)
 5790:       {
 5791:       if (repeat_max == 0)
 5792:         {
 5793:         code = previous;
 5794:         goto END_REPEAT;
 5795:         }
 5796: 
 5797:       if (repeat_min == 0 && repeat_max == -1)
 5798:         *code++ = OP_CRSTAR + repeat_type;
 5799:       else if (repeat_min == 1 && repeat_max == -1)
 5800:         *code++ = OP_CRPLUS + repeat_type;
 5801:       else if (repeat_min == 0 && repeat_max == 1)
 5802:         *code++ = OP_CRQUERY + repeat_type;
 5803:       else
 5804:         {
 5805:         *code++ = OP_CRRANGE + repeat_type;
 5806:         PUT2INC(code, 0, repeat_min);
 5807:         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
 5808:         PUT2INC(code, 0, repeat_max);
 5809:         }
 5810:       }
 5811: 
 5812:     /* If previous was a bracket group, we may have to replicate it in certain
 5813:     cases. Note that at this point we can encounter only the "basic" bracket
 5814:     opcodes such as BRA and CBRA, as this is the place where they get converted
 5815:     into the more special varieties such as BRAPOS and SBRA. A test for >=
 5816:     OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
 5817:     ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
 5818:     Originally, PCRE did not allow repetition of assertions, but now it does,
 5819:     for Perl compatibility. */
 5820: 
 5821:     else if (*previous >= OP_ASSERT && *previous <= OP_COND)
 5822:       {
 5823:       register int i;
 5824:       int len = (int)(code - previous);
 5825:       pcre_uchar *bralink = NULL;
 5826:       pcre_uchar *brazeroptr = NULL;
 5827: 
 5828:       /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
 5829:       we just ignore the repeat. */
 5830: 
 5831:       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
 5832:         goto END_REPEAT;
 5833: 
 5834:       /* There is no sense in actually repeating assertions. The only potential
 5835:       use of repetition is in cases when the assertion is optional. Therefore,
 5836:       if the minimum is greater than zero, just ignore the repeat. If the
 5837:       maximum is not zero or one, set it to 1. */
 5838: 
 5839:       if (*previous < OP_ONCE)    /* Assertion */
 5840:         {
 5841:         if (repeat_min > 0) goto END_REPEAT;
 5842:         if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
 5843:         }
 5844: 
 5845:       /* The case of a zero minimum is special because of the need to stick
 5846:       OP_BRAZERO in front of it, and because the group appears once in the
 5847:       data, whereas in other cases it appears the minimum number of times. For
 5848:       this reason, it is simplest to treat this case separately, as otherwise
 5849:       the code gets far too messy. There are several special subcases when the
 5850:       minimum is zero. */
 5851: 
 5852:       if (repeat_min == 0)
 5853:         {
 5854:         /* If the maximum is also zero, we used to just omit the group from the
 5855:         output altogether, like this:
 5856: 
 5857:         ** if (repeat_max == 0)
 5858:         **   {
 5859:         **   code = previous;
 5860:         **   goto END_REPEAT;
 5861:         **   }
 5862: 
 5863:         However, that fails when a group or a subgroup within it is referenced
 5864:         as a subroutine from elsewhere in the pattern, so now we stick in
 5865:         OP_SKIPZERO in front of it so that it is skipped on execution. As we
 5866:         don't have a list of which groups are referenced, we cannot do this
 5867:         selectively.
 5868: 
 5869:         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
 5870:         and do no more at this point. However, we do need to adjust any
 5871:         OP_RECURSE calls inside the group that refer to the group itself or any
 5872:         internal or forward referenced group, because the offset is from the
 5873:         start of the whole regex. Temporarily terminate the pattern while doing
 5874:         this. */
 5875: 
 5876:         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
 5877:           {
 5878:           *code = OP_END;
 5879:           adjust_recurse(previous, 1, utf, cd, save_hwm);
 5880:           memmove(previous + 1, previous, IN_UCHARS(len));
 5881:           code++;
 5882:           if (repeat_max == 0)
 5883:             {
 5884:             *previous++ = OP_SKIPZERO;
 5885:             goto END_REPEAT;
 5886:             }
 5887:           brazeroptr = previous;    /* Save for possessive optimizing */
 5888:           *previous++ = OP_BRAZERO + repeat_type;
 5889:           }
 5890: 
 5891:         /* If the maximum is greater than 1 and limited, we have to replicate
 5892:         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
 5893:         The first one has to be handled carefully because it's the original
 5894:         copy, which has to be moved up. The remainder can be handled by code
 5895:         that is common with the non-zero minimum case below. We have to
 5896:         adjust the value of repeat_max, since one less copy is required. Once
 5897:         again, we may have to adjust any OP_RECURSE calls inside the group. */
 5898: 
 5899:         else
 5900:           {
 5901:           int offset;
 5902:           *code = OP_END;
 5903:           adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
 5904:           memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
 5905:           code += 2 + LINK_SIZE;
 5906:           *previous++ = OP_BRAZERO + repeat_type;
 5907:           *previous++ = OP_BRA;
 5908: 
 5909:           /* We chain together the bracket offset fields that have to be
 5910:           filled in later when the ends of the brackets are reached. */
 5911: 
 5912:           offset = (bralink == NULL)? 0 : (int)(previous - bralink);
 5913:           bralink = previous;
 5914:           PUTINC(previous, 0, offset);
 5915:           }
 5916: 
 5917:         repeat_max--;
 5918:         }
 5919: 
 5920:       /* If the minimum is greater than zero, replicate the group as many
 5921:       times as necessary, and adjust the maximum to the number of subsequent
 5922:       copies that we need. If we set a first char from the group, and didn't
 5923:       set a required char, copy the latter from the former. If there are any
 5924:       forward reference subroutine calls in the group, there will be entries on
 5925:       the workspace list; replicate these with an appropriate increment. */
 5926: 
 5927:       else
 5928:         {
 5929:         if (repeat_min > 1)
 5930:           {
 5931:           /* In the pre-compile phase, we don't actually do the replication. We
 5932:           just adjust the length as if we had. Do some paranoid checks for
 5933:           potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
 5934:           integer type when available, otherwise double. */
 5935: 
 5936:           if (lengthptr != NULL)
 5937:             {
 5938:             int delta = (repeat_min - 1)*length_prevgroup;
 5939:             if ((INT64_OR_DOUBLE)(repeat_min - 1)*
 5940:                   (INT64_OR_DOUBLE)length_prevgroup >
 5941:                     (INT64_OR_DOUBLE)INT_MAX ||
 5942:                 OFLOW_MAX - *lengthptr < delta)
 5943:               {
 5944:               *errorcodeptr = ERR20;
 5945:               goto FAILED;
 5946:               }
 5947:             *lengthptr += delta;
 5948:             }
 5949: 
 5950:           /* This is compiling for real. If there is a set first byte for
 5951:           the group, and we have not yet set a "required byte", set it. Make
 5952:           sure there is enough workspace for copying forward references before
 5953:           doing the copy. */
 5954: 
 5955:           else
 5956:             {
 5957:             if (groupsetfirstchar && reqcharflags < 0)
 5958:               {
 5959:               reqchar = firstchar;
 5960:               reqcharflags = firstcharflags;
 5961:               }
 5962: 
 5963:             for (i = 1; i < repeat_min; i++)
 5964:               {
 5965:               pcre_uchar *hc;
 5966:               pcre_uchar *this_hwm = cd->hwm;
 5967:               memcpy(code, previous, IN_UCHARS(len));
 5968: 
 5969:               while (cd->hwm > cd->start_workspace + cd->workspace_size -
 5970:                      WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
 5971:                 {
 5972:                 int save_offset = save_hwm - cd->start_workspace;
 5973:                 int this_offset = this_hwm - cd->start_workspace;
 5974:                 *errorcodeptr = expand_workspace(cd);
 5975:                 if (*errorcodeptr != 0) goto FAILED;
 5976:                 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
 5977:                 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
 5978:                 }
 5979: 
 5980:               for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
 5981:                 {
 5982:                 PUT(cd->hwm, 0, GET(hc, 0) + len);
 5983:                 cd->hwm += LINK_SIZE;
 5984:                 }
 5985:               save_hwm = this_hwm;
 5986:               code += len;
 5987:               }
 5988:             }
 5989:           }
 5990: 
 5991:         if (repeat_max > 0) repeat_max -= repeat_min;
 5992:         }
 5993: 
 5994:       /* This code is common to both the zero and non-zero minimum cases. If
 5995:       the maximum is limited, it replicates the group in a nested fashion,
 5996:       remembering the bracket starts on a stack. In the case of a zero minimum,
 5997:       the first one was set up above. In all cases the repeat_max now specifies
 5998:       the number of additional copies needed. Again, we must remember to
 5999:       replicate entries on the forward reference list. */
 6000: 
 6001:       if (repeat_max >= 0)
 6002:         {
 6003:         /* In the pre-compile phase, we don't actually do the replication. We
 6004:         just adjust the length as if we had. For each repetition we must add 1
 6005:         to the length for BRAZERO and for all but the last repetition we must
 6006:         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
 6007:         paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
 6008:         a 64-bit integer type when available, otherwise double. */
 6009: 
 6010:         if (lengthptr != NULL && repeat_max > 0)
 6011:           {
 6012:           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
 6013:                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
 6014:           if ((INT64_OR_DOUBLE)repeat_max *
 6015:                 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
 6016:                   > (INT64_OR_DOUBLE)INT_MAX ||
 6017:               OFLOW_MAX - *lengthptr < delta)
 6018:             {
 6019:             *errorcodeptr = ERR20;
 6020:             goto FAILED;
 6021:             }
 6022:           *lengthptr += delta;
 6023:           }
 6024: 
 6025:         /* This is compiling for real */
 6026: 
 6027:         else for (i = repeat_max - 1; i >= 0; i--)
 6028:           {
 6029:           pcre_uchar *hc;
 6030:           pcre_uchar *this_hwm = cd->hwm;
 6031: 
 6032:           *code++ = OP_BRAZERO + repeat_type;
 6033: 
 6034:           /* All but the final copy start a new nesting, maintaining the
 6035:           chain of brackets outstanding. */
 6036: 
 6037:           if (i != 0)
 6038:             {
 6039:             int offset;
 6040:             *code++ = OP_BRA;
 6041:             offset = (bralink == NULL)? 0 : (int)(code - bralink);
 6042:             bralink = code;
 6043:             PUTINC(code, 0, offset);
 6044:             }
 6045: 
 6046:           memcpy(code, previous, IN_UCHARS(len));
 6047: 
 6048:           /* Ensure there is enough workspace for forward references before
 6049:           copying them. */
 6050: 
 6051:           while (cd->hwm > cd->start_workspace + cd->workspace_size -
 6052:                  WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
 6053:             {
 6054:             int save_offset = save_hwm - cd->start_workspace;
 6055:             int this_offset = this_hwm - cd->start_workspace;
 6056:             *errorcodeptr = expand_workspace(cd);
 6057:             if (*errorcodeptr != 0) goto FAILED;
 6058:             save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
 6059:             this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
 6060:             }
 6061: 
 6062:           for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
 6063:             {
 6064:             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
 6065:             cd->hwm += LINK_SIZE;
 6066:             }
 6067:           save_hwm = this_hwm;
 6068:           code += len;
 6069:           }
 6070: 
 6071:         /* Now chain through the pending brackets, and fill in their length
 6072:         fields (which are holding the chain links pro tem). */
 6073: 
 6074:         while (bralink != NULL)
 6075:           {
 6076:           int oldlinkoffset;
 6077:           int offset = (int)(code - bralink + 1);
 6078:           pcre_uchar *bra = code - offset;
 6079:           oldlinkoffset = GET(bra, 1);
 6080:           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
 6081:           *code++ = OP_KET;
 6082:           PUTINC(code, 0, offset);
 6083:           PUT(bra, 1, offset);
 6084:           }
 6085:         }
 6086: 
 6087:       /* If the maximum is unlimited, set a repeater in the final copy. For
 6088:       ONCE brackets, that's all we need to do. However, possessively repeated
 6089:       ONCE brackets can be converted into non-capturing brackets, as the
 6090:       behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
 6091:       deal with possessive ONCEs specially.
 6092: 
 6093:       Otherwise, when we are doing the actual compile phase, check to see
 6094:       whether this group is one that could match an empty string. If so,
 6095:       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
 6096:       that runtime checking can be done. [This check is also applied to ONCE
 6097:       groups at runtime, but in a different way.]
 6098: 
 6099:       Then, if the quantifier was possessive and the bracket is not a
 6100:       conditional, we convert the BRA code to the POS form, and the KET code to
 6101:       KETRPOS. (It turns out to be convenient at runtime to detect this kind of
 6102:       subpattern at both the start and at the end.) The use of special opcodes
 6103:       makes it possible to reduce greatly the stack usage in pcre_exec(). If
 6104:       the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
 6105: 
 6106:       Then, if the minimum number of matches is 1 or 0, cancel the possessive
 6107:       flag so that the default action below, of wrapping everything inside
 6108:       atomic brackets, does not happen. When the minimum is greater than 1,
 6109:       there will be earlier copies of the group, and so we still have to wrap
 6110:       the whole thing. */
 6111: 
 6112:       else
 6113:         {
 6114:         pcre_uchar *ketcode = code - 1 - LINK_SIZE;
 6115:         pcre_uchar *bracode = ketcode - GET(ketcode, 1);
 6116: 
 6117:         /* Convert possessive ONCE brackets to non-capturing */
 6118: 
 6119:         if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
 6120:             possessive_quantifier) *bracode = OP_BRA;
 6121: 
 6122:         /* For non-possessive ONCE brackets, all we need to do is to
 6123:         set the KET. */
 6124: 
 6125:         if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
 6126:           *ketcode = OP_KETRMAX + repeat_type;
 6127: 
 6128:         /* Handle non-ONCE brackets and possessive ONCEs (which have been
 6129:         converted to non-capturing above). */
 6130: 
 6131:         else
 6132:           {
 6133:           /* In the compile phase, check for empty string matching. */
 6134: 
 6135:           if (lengthptr == NULL)
 6136:             {
 6137:             pcre_uchar *scode = bracode;
 6138:             do
 6139:               {
 6140:               if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
 6141:                 {
 6142:                 *bracode += OP_SBRA - OP_BRA;
 6143:                 break;
 6144:                 }
 6145:               scode += GET(scode, 1);
 6146:               }
 6147:             while (*scode == OP_ALT);
 6148:             }
 6149: 
 6150:           /* Handle possessive quantifiers. */
 6151: 
 6152:           if (possessive_quantifier)
 6153:             {
 6154:             /* For COND brackets, we wrap the whole thing in a possessively
 6155:             repeated non-capturing bracket, because we have not invented POS
 6156:             versions of the COND opcodes. Because we are moving code along, we
 6157:             must ensure that any pending recursive references are updated. */
 6158: 
 6159:             if (*bracode == OP_COND || *bracode == OP_SCOND)
 6160:               {
 6161:               int nlen = (int)(code - bracode);
 6162:               *code = OP_END;
 6163:               adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
 6164:               memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
 6165:               code += 1 + LINK_SIZE;
 6166:               nlen += 1 + LINK_SIZE;
 6167:               *bracode = OP_BRAPOS;
 6168:               *code++ = OP_KETRPOS;
 6169:               PUTINC(code, 0, nlen);
 6170:               PUT(bracode, 1, nlen);
 6171:               }
 6172: 
 6173:             /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
 6174: 
 6175:             else
 6176:               {
 6177:               *bracode += 1;              /* Switch to xxxPOS opcodes */
 6178:               *ketcode = OP_KETRPOS;
 6179:               }
 6180: 
 6181:             /* If the minimum is zero, mark it as possessive, then unset the
 6182:             possessive flag when the minimum is 0 or 1. */
 6183: 
 6184:             if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
 6185:             if (repeat_min < 2) possessive_quantifier = FALSE;
 6186:             }
 6187: 
 6188:           /* Non-possessive quantifier */
 6189: 
 6190:           else *ketcode = OP_KETRMAX + repeat_type;
 6191:           }
 6192:         }
 6193:       }
 6194: 
 6195:     /* If previous is OP_FAIL, it was generated by an empty class [] in
 6196:     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
 6197:     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
 6198:     error above. We can just ignore the repeat in JS case. */
 6199: 
 6200:     else if (*previous == OP_FAIL) goto END_REPEAT;
 6201: 
 6202:     /* Else there's some kind of shambles */
 6203: 
 6204:     else
 6205:       {
 6206:       *errorcodeptr = ERR11;
 6207:       goto FAILED;
 6208:       }
 6209: 
 6210:     /* If the character following a repeat is '+', possessive_quantifier is
 6211:     TRUE. For some opcodes, there are special alternative opcodes for this
 6212:     case. For anything else, we wrap the entire repeated item inside OP_ONCE
 6213:     brackets. Logically, the '+' notation is just syntactic sugar, taken from
 6214:     Sun's Java package, but the special opcodes can optimize it.
 6215: 
 6216:     Some (but not all) possessively repeated subpatterns have already been
 6217:     completely handled in the code just above. For them, possessive_quantifier
 6218:     is always FALSE at this stage. Note that the repeated item starts at
 6219:     tempcode, not at previous, which might be the first part of a string whose
 6220:     (former) last char we repeated. */
 6221: 
 6222:     if (possessive_quantifier)
 6223:       {
 6224:       int len;
 6225: 
 6226:       /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
 6227:       However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
 6228:       {5,}, or {5,10}). We skip over an EXACT item; if the length of what
 6229:       remains is greater than zero, there's a further opcode that can be
 6230:       handled. If not, do nothing, leaving the EXACT alone. */
 6231: 
 6232:       switch(*tempcode)
 6233:         {
 6234:         case OP_TYPEEXACT:
 6235:         tempcode += PRIV(OP_lengths)[*tempcode] +
 6236:           ((tempcode[1 + IMM2_SIZE] == OP_PROP
 6237:           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
 6238:         break;
 6239: 
 6240:         /* CHAR opcodes are used for exacts whose count is 1. */
 6241: 
 6242:         case OP_CHAR:
 6243:         case OP_CHARI:
 6244:         case OP_NOT:
 6245:         case OP_NOTI:
 6246:         case OP_EXACT:
 6247:         case OP_EXACTI:
 6248:         case OP_NOTEXACT:
 6249:         case OP_NOTEXACTI:
 6250:         tempcode += PRIV(OP_lengths)[*tempcode];
 6251: #ifdef SUPPORT_UTF
 6252:         if (utf && HAS_EXTRALEN(tempcode[-1]))
 6253:           tempcode += GET_EXTRALEN(tempcode[-1]);
 6254: #endif
 6255:         break;
 6256: 
 6257:         /* For the class opcodes, the repeat operator appears at the end;
 6258:         adjust tempcode to point to it. */
 6259: 
 6260:         case OP_CLASS:
 6261:         case OP_NCLASS:
 6262:         tempcode += 1 + 32/sizeof(pcre_uchar);
 6263:         break;
 6264: 
 6265: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 6266:         case OP_XCLASS:
 6267:         tempcode += GET(tempcode, 1);
 6268:         break;
 6269: #endif
 6270:         }
 6271: 
 6272:       /* If tempcode is equal to code (which points to the end of the repeated
 6273:       item), it means we have skipped an EXACT item but there is no following
 6274:       QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
 6275:       all other cases, tempcode will be pointing to the repeat opcode, and will
 6276:       be less than code, so the value of len will be greater than 0. */
 6277: 
 6278:       len = (int)(code - tempcode);
 6279:       if (len > 0)
 6280:         {
 6281:         unsigned int repcode = *tempcode;
 6282: 
 6283:         /* There is a table for possessifying opcodes, all of which are less
 6284:         than OP_CALLOUT. A zero entry means there is no possessified version.
 6285:         */
 6286: 
 6287:         if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
 6288:           *tempcode = opcode_possessify[repcode];
 6289: 
 6290:         /* For opcode without a special possessified version, wrap the item in
 6291:         ONCE brackets. Because we are moving code along, we must ensure that any
 6292:         pending recursive references are updated. */
 6293: 
 6294:         else
 6295:           {
 6296:           *code = OP_END;
 6297:           adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
 6298:           memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
 6299:           code += 1 + LINK_SIZE;
 6300:           len += 1 + LINK_SIZE;
 6301:           tempcode[0] = OP_ONCE;
 6302:           *code++ = OP_KET;
 6303:           PUTINC(code, 0, len);
 6304:           PUT(tempcode, 1, len);
 6305:           }
 6306:         }
 6307: 
 6308: #ifdef NEVER
 6309:       if (len > 0) switch (*tempcode)
 6310:         {
 6311:         case OP_STAR:  *tempcode = OP_POSSTAR; break;
 6312:         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
 6313:         case OP_QUERY: *tempcode = OP_POSQUERY; break;
 6314:         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
 6315: 
 6316:         case OP_STARI:  *tempcode = OP_POSSTARI; break;
 6317:         case OP_PLUSI:  *tempcode = OP_POSPLUSI; break;
 6318:         case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
 6319:         case OP_UPTOI:  *tempcode = OP_POSUPTOI; break;
 6320: 
 6321:         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
 6322:         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
 6323:         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
 6324:         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
 6325: 
 6326:         case OP_NOTSTARI:  *tempcode = OP_NOTPOSSTARI; break;
 6327:         case OP_NOTPLUSI:  *tempcode = OP_NOTPOSPLUSI; break;
 6328:         case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
 6329:         case OP_NOTUPTOI:  *tempcode = OP_NOTPOSUPTOI; break;
 6330: 
 6331:         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
 6332:         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
 6333:         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
 6334:         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
 6335: 
 6336:         case OP_CRSTAR:   *tempcode = OP_CRPOSSTAR; break;
 6337:         case OP_CRPLUS:   *tempcode = OP_CRPOSPLUS; break;
 6338:         case OP_CRQUERY:  *tempcode = OP_CRPOSQUERY; break;
 6339:         case OP_CRRANGE:  *tempcode = OP_CRPOSRANGE; break;
 6340: 
 6341:         /* Because we are moving code along, we must ensure that any
 6342:         pending recursive references are updated. */
 6343: 
 6344:         default:
 6345:         *code = OP_END;
 6346:         adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
 6347:         memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
 6348:         code += 1 + LINK_SIZE;
 6349:         len += 1 + LINK_SIZE;
 6350:         tempcode[0] = OP_ONCE;
 6351:         *code++ = OP_KET;
 6352:         PUTINC(code, 0, len);
 6353:         PUT(tempcode, 1, len);
 6354:         break;
 6355:         }
 6356: #endif
 6357:       }
 6358: 
 6359:     /* In all cases we no longer have a previous item. We also set the
 6360:     "follows varying string" flag for subsequently encountered reqchars if
 6361:     it isn't already set and we have just passed a varying length item. */
 6362: 
 6363:     END_REPEAT:
 6364:     previous = NULL;
 6365:     cd->req_varyopt |= reqvary;
 6366:     break;
 6367: 
 6368: 
 6369:     /* ===================================================================*/
 6370:     /* Start of nested parenthesized sub-expression, or comment or lookahead or
 6371:     lookbehind or option setting or condition or all the other extended
 6372:     parenthesis forms.  */
 6373: 
 6374:     case CHAR_LEFT_PARENTHESIS:
 6375:     newoptions = options;
 6376:     skipbytes = 0;
 6377:     bravalue = OP_CBRA;
 6378:     save_hwm = cd->hwm;
 6379:     reset_bracount = FALSE;
 6380: 
 6381:     /* First deal with various "verbs" that can be introduced by '*'. */
 6382: 
 6383:     ptr++;
 6384:     if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
 6385:          || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
 6386:       {
 6387:       int i, namelen;
 6388:       int arglen = 0;
 6389:       const char *vn = verbnames;
 6390:       const pcre_uchar *name = ptr + 1;
 6391:       const pcre_uchar *arg = NULL;
 6392:       previous = NULL;
 6393:       ptr++;
 6394:       while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
 6395:       namelen = (int)(ptr - name);
 6396: 
 6397:       /* It appears that Perl allows any characters whatsoever, other than
 6398:       a closing parenthesis, to appear in arguments, so we no longer insist on
 6399:       letters, digits, and underscores. */
 6400: 
 6401:       if (*ptr == CHAR_COLON)
 6402:         {
 6403:         arg = ++ptr;
 6404:         while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
 6405:         arglen = (int)(ptr - arg);
 6406:         if ((unsigned int)arglen > MAX_MARK)
 6407:           {
 6408:           *errorcodeptr = ERR75;
 6409:           goto FAILED;
 6410:           }
 6411:         }
 6412: 
 6413:       if (*ptr != CHAR_RIGHT_PARENTHESIS)
 6414:         {
 6415:         *errorcodeptr = ERR60;
 6416:         goto FAILED;
 6417:         }
 6418: 
 6419:       /* Scan the table of verb names */
 6420: 
 6421:       for (i = 0; i < verbcount; i++)
 6422:         {
 6423:         if (namelen == verbs[i].len &&
 6424:             STRNCMP_UC_C8(name, vn, namelen) == 0)
 6425:           {
 6426:           int setverb;
 6427: 
 6428:           /* Check for open captures before ACCEPT and convert it to
 6429:           ASSERT_ACCEPT if in an assertion. */
 6430: 
 6431:           if (verbs[i].op == OP_ACCEPT)
 6432:             {
 6433:             open_capitem *oc;
 6434:             if (arglen != 0)
 6435:               {
 6436:               *errorcodeptr = ERR59;
 6437:               goto FAILED;
 6438:               }
 6439:             cd->had_accept = TRUE;
 6440:             for (oc = cd->open_caps; oc != NULL; oc = oc->next)
 6441:               {
 6442:               *code++ = OP_CLOSE;
 6443:               PUT2INC(code, 0, oc->number);
 6444:               }
 6445:             setverb = *code++ =
 6446:               (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
 6447: 
 6448:             /* Do not set firstchar after *ACCEPT */
 6449:             if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
 6450:             }
 6451: 
 6452:           /* Handle other cases with/without an argument */
 6453: 
 6454:           else if (arglen == 0)
 6455:             {
 6456:             if (verbs[i].op < 0)   /* Argument is mandatory */
 6457:               {
 6458:               *errorcodeptr = ERR66;
 6459:               goto FAILED;
 6460:               }
 6461:             setverb = *code++ = verbs[i].op;
 6462:             }
 6463: 
 6464:           else
 6465:             {
 6466:             if (verbs[i].op_arg < 0)   /* Argument is forbidden */
 6467:               {
 6468:               *errorcodeptr = ERR59;
 6469:               goto FAILED;
 6470:               }
 6471:             setverb = *code++ = verbs[i].op_arg;
 6472:             *code++ = arglen;
 6473:             memcpy(code, arg, IN_UCHARS(arglen));
 6474:             code += arglen;
 6475:             *code++ = 0;
 6476:             }
 6477: 
 6478:           switch (setverb)
 6479:             {
 6480:             case OP_THEN:
 6481:             case OP_THEN_ARG:
 6482:             cd->external_flags |= PCRE_HASTHEN;
 6483:             break;
 6484: 
 6485:             case OP_PRUNE:
 6486:             case OP_PRUNE_ARG:
 6487:             case OP_SKIP:
 6488:             case OP_SKIP_ARG:
 6489:             cd->had_pruneorskip = TRUE;
 6490:             break;
 6491:             }
 6492: 
 6493:           break;  /* Found verb, exit loop */
 6494:           }
 6495: 
 6496:         vn += verbs[i].len + 1;
 6497:         }
 6498: 
 6499:       if (i < verbcount) continue;    /* Successfully handled a verb */
 6500:       *errorcodeptr = ERR60;          /* Verb not recognized */
 6501:       goto FAILED;
 6502:       }
 6503: 
 6504:     /* Deal with the extended parentheses; all are introduced by '?', and the
 6505:     appearance of any of them means that this is not a capturing group. */
 6506: 
 6507:     else if (*ptr == CHAR_QUESTION_MARK)
 6508:       {
 6509:       int i, set, unset, namelen;
 6510:       int *optset;
 6511:       const pcre_uchar *name;
 6512:       pcre_uchar *slot;
 6513: 
 6514:       switch (*(++ptr))
 6515:         {
 6516:         case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
 6517:         ptr++;
 6518:         while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
 6519:         if (*ptr == CHAR_NULL)
 6520:           {
 6521:           *errorcodeptr = ERR18;
 6522:           goto FAILED;
 6523:           }
 6524:         continue;
 6525: 
 6526: 
 6527:         /* ------------------------------------------------------------ */
 6528:         case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
 6529:         reset_bracount = TRUE;
 6530:         /* Fall through */
 6531: 
 6532:         /* ------------------------------------------------------------ */
 6533:         case CHAR_COLON:          /* Non-capturing bracket */
 6534:         bravalue = OP_BRA;
 6535:         ptr++;
 6536:         break;
 6537: 
 6538: 
 6539:         /* ------------------------------------------------------------ */
 6540:         case CHAR_LEFT_PARENTHESIS:
 6541:         bravalue = OP_COND;       /* Conditional group */
 6542:         tempptr = ptr;
 6543: 
 6544:         /* A condition can be an assertion, a number (referring to a numbered
 6545:         group's having been set), a name (referring to a named group), or 'R',
 6546:         referring to recursion. R<digits> and R&name are also permitted for
 6547:         recursion tests.
 6548: 
 6549:         There are several ways of testing a named group: (?(name)) is used by Python;
 6550:         Perl 5.10 onwards uses (?(<name>) or (?('name')).
 6551: 
 6552:         There is one unfortunate ambiguity, caused by history. 'R' can be the
 6553:         recursive thing or the name 'R' (and similarly for 'R' followed by
 6554:         digits). We look for a name first; if not found, we try the other case.
 6555: 
 6556:         For compatibility with auto-callouts, we allow a callout to be
 6557:         specified before a condition that is an assertion. First, check for the
 6558:         syntax of a callout; if found, adjust the temporary pointer that is
 6559:         used to check for an assertion condition. That's all that is needed! */
 6560: 
 6561:         if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
 6562:           {
 6563:           for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
 6564:           if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
 6565:             tempptr += i + 1;
 6566:           }
 6567: 
 6568:         /* For conditions that are assertions, check the syntax, and then exit
 6569:         the switch. This will take control down to where bracketed groups,
 6570:         including assertions, are processed. */
 6571: 
 6572:         if (tempptr[1] == CHAR_QUESTION_MARK &&
 6573:               (tempptr[2] == CHAR_EQUALS_SIGN ||
 6574:                tempptr[2] == CHAR_EXCLAMATION_MARK ||
 6575:                tempptr[2] == CHAR_LESS_THAN_SIGN))
 6576:           break;
 6577: 
 6578:         /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
 6579:         need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
 6580: 
 6581:         code[1+LINK_SIZE] = OP_CREF;
 6582:         skipbytes = 1+IMM2_SIZE;
 6583:         refsign = -1;
 6584: 
 6585:         /* Check for a test for recursion in a named group. */
 6586: 
 6587:         ptr++;
 6588:         if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
 6589:           {
 6590:           terminator = -1;
 6591:           ptr += 2;
 6592:           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
 6593:           }
 6594: 
 6595:         /* Check for a test for a named group's having been set, using the Perl
 6596:         syntax (?(<name>) or (?('name'), and also allow for the original PCRE
 6597:         syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
 6598: 
 6599:         else if (*ptr == CHAR_LESS_THAN_SIGN)
 6600:           {
 6601:           terminator = CHAR_GREATER_THAN_SIGN;
 6602:           ptr++;
 6603:           }
 6604:         else if (*ptr == CHAR_APOSTROPHE)
 6605:           {
 6606:           terminator = CHAR_APOSTROPHE;
 6607:           ptr++;
 6608:           }
 6609:         else
 6610:           {
 6611:           terminator = CHAR_NULL;
 6612:           if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
 6613:             else if (IS_DIGIT(*ptr)) refsign = 0;
 6614:           }
 6615: 
 6616:         /* Handle a number */
 6617: 
 6618:         if (refsign >= 0)
 6619:           {
 6620:           recno = 0;
 6621:           while (IS_DIGIT(*ptr))
 6622:             {
 6623:             recno = recno * 10 + (int)(*ptr - CHAR_0);
 6624:             ptr++;
 6625:             }
 6626:           }
 6627: 
 6628:         /* Otherwise we expect to read a name; anything else is an error. When
 6629:         a name is one of a number of duplicates, a different opcode is used and
 6630:         it needs more memory. Unfortunately we cannot tell whether a name is a
 6631:         duplicate in the first pass, so we have to allow for more memory. */
 6632: 
 6633:         else
 6634:           {
 6635:           if (IS_DIGIT(*ptr))
 6636:             {
 6637:             *errorcodeptr = ERR84;
 6638:             goto FAILED;
 6639:             }
 6640:           if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_word) == 0)
 6641:             {
 6642:             *errorcodeptr = ERR28;   /* Assertion expected */
 6643:             goto FAILED;
 6644:             }
 6645:           name = ptr++;
 6646:           while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
 6647:             {
 6648:             ptr++;
 6649:             }
 6650:           namelen = (int)(ptr - name);
 6651:           if (lengthptr != NULL) *lengthptr += IMM2_SIZE;
 6652:           }
 6653: 
 6654:         /* Check the terminator */
 6655: 
 6656:         if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
 6657:             *ptr++ != CHAR_RIGHT_PARENTHESIS)
 6658:           {
 6659:           ptr--;                  /* Error offset */
 6660:           *errorcodeptr = ERR26;  /* Malformed number or name */
 6661:           goto FAILED;
 6662:           }
 6663: 
 6664:         /* Do no further checking in the pre-compile phase. */
 6665: 
 6666:         if (lengthptr != NULL) break;
 6667: 
 6668:         /* In the real compile we do the work of looking for the actual
 6669:         reference. If refsign is not negative, it means we have a number in
 6670:         recno. */
 6671: 
 6672:         if (refsign >= 0)
 6673:           {
 6674:           if (recno <= 0)
 6675:             {
 6676:             *errorcodeptr = ERR35;
 6677:             goto FAILED;
 6678:             }
 6679:           if (refsign != 0) recno = (refsign == CHAR_MINUS)?
 6680:             cd->bracount - recno + 1 : recno + cd->bracount;
 6681:           if (recno <= 0 || recno > cd->final_bracount)
 6682:             {
 6683:             *errorcodeptr = ERR15;
 6684:             goto FAILED;
 6685:             }
 6686:           PUT2(code, 2+LINK_SIZE, recno);
 6687:           break;
 6688:           }
 6689: 
 6690:         /* Otherwise look for the name. */
 6691: 
 6692:         slot = cd->name_table;
 6693:         for (i = 0; i < cd->names_found; i++)
 6694:           {
 6695:           if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
 6696:           slot += cd->name_entry_size;
 6697:           }
 6698: 
 6699:         /* Found the named subpattern. If the name is duplicated, add one to
 6700:         the opcode to change CREF/RREF into DNCREF/DNRREF and insert
 6701:         appropriate data values. Otherwise, just insert the unique subpattern
 6702:         number. */
 6703: 
 6704:         if (i < cd->names_found)
 6705:           {
 6706:           int offset = i++;
 6707:           int count = 1;
 6708:           recno = GET2(slot, 0);   /* Number from first found */
 6709:           for (; i < cd->names_found; i++)
 6710:             {
 6711:             slot += cd->name_entry_size;
 6712:             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break;
 6713:             count++;
 6714:             }
 6715:           if (count > 1)
 6716:             {
 6717:             PUT2(code, 2+LINK_SIZE, offset);
 6718:             PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
 6719:             skipbytes += IMM2_SIZE;
 6720:             code[1+LINK_SIZE]++;
 6721:             }
 6722:           else  /* Not a duplicated name */
 6723:             {
 6724:             PUT2(code, 2+LINK_SIZE, recno);
 6725:             }
 6726:           }
 6727: 
 6728:         /* If terminator == CHAR_NULL it means that the name followed directly
 6729:         after the opening parenthesis [e.g. (?(abc)...] and in this case there
 6730:         are some further alternatives to try. For the cases where terminator !=
 6731:         CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
 6732:         we have now checked all the possibilities, so give an error. */
 6733: 
 6734:         else if (terminator != CHAR_NULL)
 6735:           {
 6736:           *errorcodeptr = ERR15;
 6737:           goto FAILED;
 6738:           }
 6739: 
 6740:         /* Check for (?(R) for recursion. Allow digits after R to specify a
 6741:         specific group number. */
 6742: 
 6743:         else if (*name == CHAR_R)
 6744:           {
 6745:           recno = 0;
 6746:           for (i = 1; i < namelen; i++)
 6747:             {
 6748:             if (!IS_DIGIT(name[i]))
 6749:               {
 6750:               *errorcodeptr = ERR15;
 6751:               goto FAILED;
 6752:               }
 6753:             recno = recno * 10 + name[i] - CHAR_0;
 6754:             }
 6755:           if (recno == 0) recno = RREF_ANY;
 6756:           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
 6757:           PUT2(code, 2+LINK_SIZE, recno);
 6758:           }
 6759: 
 6760:         /* Similarly, check for the (?(DEFINE) "condition", which is always
 6761:         false. */
 6762: 
 6763:         else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
 6764:           {
 6765:           code[1+LINK_SIZE] = OP_DEF;
 6766:           skipbytes = 1;
 6767:           }
 6768: 
 6769:         /* Reference to an unidentified subpattern. */
 6770: 
 6771:         else
 6772:           {
 6773:           *errorcodeptr = ERR15;
 6774:           goto FAILED;
 6775:           }
 6776:         break;
 6777: 
 6778: 
 6779:         /* ------------------------------------------------------------ */
 6780:         case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
 6781:         bravalue = OP_ASSERT;
 6782:         cd->assert_depth += 1;
 6783:         ptr++;
 6784:         break;
 6785: 
 6786:         /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
 6787:         thing to do, but Perl allows all assertions to be quantified, and when
 6788:         they contain capturing parentheses there may be a potential use for
 6789:         this feature. Not that that applies to a quantified (?!) but we allow
 6790:         it for uniformity. */
 6791: 
 6792:         /* ------------------------------------------------------------ */
 6793:         case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
 6794:         ptr++;
 6795:         if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
 6796:              ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
 6797:             (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
 6798:           {
 6799:           *code++ = OP_FAIL;
 6800:           previous = NULL;
 6801:           continue;
 6802:           }
 6803:         bravalue = OP_ASSERT_NOT;
 6804:         cd->assert_depth += 1;
 6805:         break;
 6806: 
 6807: 
 6808:         /* ------------------------------------------------------------ */
 6809:         case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
 6810:         switch (ptr[1])
 6811:           {
 6812:           case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
 6813:           bravalue = OP_ASSERTBACK;
 6814:           cd->assert_depth += 1;
 6815:           ptr += 2;
 6816:           break;
 6817: 
 6818:           case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
 6819:           bravalue = OP_ASSERTBACK_NOT;
 6820:           cd->assert_depth += 1;
 6821:           ptr += 2;
 6822:           break;
 6823: 
 6824:           default:                /* Could be name define, else bad */
 6825:           if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
 6826:             goto DEFINE_NAME;
 6827:           ptr++;                  /* Correct offset for error */
 6828:           *errorcodeptr = ERR24;
 6829:           goto FAILED;
 6830:           }
 6831:         break;
 6832: 
 6833: 
 6834:         /* ------------------------------------------------------------ */
 6835:         case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
 6836:         bravalue = OP_ONCE;
 6837:         ptr++;
 6838:         break;
 6839: 
 6840: 
 6841:         /* ------------------------------------------------------------ */
 6842:         case CHAR_C:                 /* Callout - may be followed by digits; */
 6843:         previous_callout = code;     /* Save for later completion */
 6844:         after_manual_callout = 1;    /* Skip one item before completing */
 6845:         *code++ = OP_CALLOUT;
 6846:           {
 6847:           int n = 0;
 6848:           ptr++;
 6849:           while(IS_DIGIT(*ptr))
 6850:             n = n * 10 + *ptr++ - CHAR_0;
 6851:           if (*ptr != CHAR_RIGHT_PARENTHESIS)
 6852:             {
 6853:             *errorcodeptr = ERR39;
 6854:             goto FAILED;
 6855:             }
 6856:           if (n > 255)
 6857:             {
 6858:             *errorcodeptr = ERR38;
 6859:             goto FAILED;
 6860:             }
 6861:           *code++ = n;
 6862:           PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
 6863:           PUT(code, LINK_SIZE, 0);                          /* Default length */
 6864:           code += 2 * LINK_SIZE;
 6865:           }
 6866:         previous = NULL;
 6867:         continue;
 6868: 
 6869: 
 6870:         /* ------------------------------------------------------------ */
 6871:         case CHAR_P:              /* Python-style named subpattern handling */
 6872:         if (*(++ptr) == CHAR_EQUALS_SIGN ||
 6873:             *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
 6874:           {
 6875:           is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
 6876:           terminator = CHAR_RIGHT_PARENTHESIS;
 6877:           goto NAMED_REF_OR_RECURSE;
 6878:           }
 6879:         else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
 6880:           {
 6881:           *errorcodeptr = ERR41;
 6882:           goto FAILED;
 6883:           }
 6884:         /* Fall through to handle (?P< as (?< is handled */
 6885: 
 6886: 
 6887:         /* ------------------------------------------------------------ */
 6888:         DEFINE_NAME:    /* Come here from (?< handling */
 6889:         case CHAR_APOSTROPHE:
 6890:         terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
 6891:           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
 6892:         name = ++ptr;
 6893:         if (IS_DIGIT(*ptr))
 6894:           {
 6895:           *errorcodeptr = ERR84;   /* Group name must start with non-digit */
 6896:           goto FAILED;
 6897:           }
 6898:         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
 6899:         namelen = (int)(ptr - name);
 6900: 
 6901:         /* In the pre-compile phase, do a syntax check, remember the longest
 6902:         name, and then remember the group in a vector, expanding it if
 6903:         necessary. Duplicates for the same number are skipped; other duplicates
 6904:         are checked for validity. In the actual compile, there is nothing to
 6905:         do. */
 6906: 
 6907:         if (lengthptr != NULL)
 6908:           {
 6909:           named_group *ng;
 6910:           pcre_uint32 number = cd->bracount + 1;
 6911: 
 6912:           if (*ptr != (pcre_uchar)terminator)
 6913:             {
 6914:             *errorcodeptr = ERR42;
 6915:             goto FAILED;
 6916:             }
 6917: 
 6918:           if (cd->names_found >= MAX_NAME_COUNT)
 6919:             {
 6920:             *errorcodeptr = ERR49;
 6921:             goto FAILED;
 6922:             }
 6923: 
 6924:           if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
 6925:             {
 6926:             cd->name_entry_size = namelen + IMM2_SIZE + 1;
 6927:             if (namelen > MAX_NAME_SIZE)
 6928:               {
 6929:               *errorcodeptr = ERR48;
 6930:               goto FAILED;
 6931:               }
 6932:             }
 6933: 
 6934:           /* Scan the list to check for duplicates. For duplicate names, if the
 6935:           number is the same, break the loop, which causes the name to be
 6936:           discarded; otherwise, if DUPNAMES is not set, give an error.
 6937:           If it is set, allow the name with a different number, but continue
 6938:           scanning in case this is a duplicate with the same number. For
 6939:           non-duplicate names, give an error if the number is duplicated. */
 6940: 
 6941:           ng = cd->named_groups;
 6942:           for (i = 0; i < cd->names_found; i++, ng++)
 6943:             {
 6944:             if (namelen == ng->length &&
 6945:                 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
 6946:               {
 6947:               if (ng->number == number) break;
 6948:               if ((options & PCRE_DUPNAMES) == 0)
 6949:                 {
 6950:                 *errorcodeptr = ERR43;
 6951:                 goto FAILED;
 6952:                 }
 6953:               cd->dupnames = TRUE;  /* Duplicate names exist */
 6954:               }
 6955:             else if (ng->number == number)
 6956:               {
 6957:               *errorcodeptr = ERR65;
 6958:               goto FAILED;
 6959:               }
 6960:             }
 6961: 
 6962:           if (i >= cd->names_found)     /* Not a duplicate with same number */
 6963:             {
 6964:             /* Increase the list size if necessary */
 6965: 
 6966:             if (cd->names_found >= cd->named_group_list_size)
 6967:               {
 6968:               int newsize = cd->named_group_list_size * 2;
 6969:               named_group *newspace = (PUBL(malloc))
 6970:                 (newsize * sizeof(named_group));
 6971: 
 6972:               if (newspace == NULL)
 6973:                 {
 6974:                 *errorcodeptr = ERR21;
 6975:                 goto FAILED;
 6976:                 }
 6977: 
 6978:               memcpy(newspace, cd->named_groups,
 6979:                 cd->named_group_list_size * sizeof(named_group));
 6980:               if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
 6981:                 (PUBL(free))((void *)cd->named_groups);
 6982:               cd->named_groups = newspace;
 6983:               cd->named_group_list_size = newsize;
 6984:               }
 6985: 
 6986:             cd->named_groups[cd->names_found].name = name;
 6987:             cd->named_groups[cd->names_found].length = namelen;
 6988:             cd->named_groups[cd->names_found].number = number;
 6989:             cd->names_found++;
 6990:             }
 6991:           }
 6992: 
 6993:         ptr++;                    /* Move past > or ' in both passes. */
 6994:         goto NUMBERED_GROUP;
 6995: 
 6996: 
 6997:         /* ------------------------------------------------------------ */
 6998:         case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
 6999:         terminator = CHAR_RIGHT_PARENTHESIS;
 7000:         is_recurse = TRUE;
 7001:         /* Fall through */
 7002: 
 7003:         /* We come here from the Python syntax above that handles both
 7004:         references (?P=name) and recursion (?P>name), as well as falling
 7005:         through from the Perl recursion syntax (?&name). We also come here from
 7006:         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
 7007:         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
 7008: 
 7009:         NAMED_REF_OR_RECURSE:
 7010:         name = ++ptr;
 7011:         if (IS_DIGIT(*ptr))
 7012:           {
 7013:           *errorcodeptr = ERR84;   /* Group name must start with non-digit */
 7014:           goto FAILED;
 7015:           }
 7016:         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
 7017:         namelen = (int)(ptr - name);
 7018: 
 7019:         /* In the pre-compile phase, do a syntax check. We used to just set
 7020:         a dummy reference number, because it was not used in the first pass.
 7021:         However, with the change of recursive back references to be atomic,
 7022:         we have to look for the number so that this state can be identified, as
 7023:         otherwise the incorrect length is computed. If it's not a backwards
 7024:         reference, the dummy number will do. */
 7025: 
 7026:         if (lengthptr != NULL)
 7027:           {
 7028:           named_group *ng;
 7029: 
 7030:           if (namelen == 0)
 7031:             {
 7032:             *errorcodeptr = ERR62;
 7033:             goto FAILED;
 7034:             }
 7035:           if (*ptr != (pcre_uchar)terminator)
 7036:             {
 7037:             *errorcodeptr = ERR42;
 7038:             goto FAILED;
 7039:             }
 7040:           if (namelen > MAX_NAME_SIZE)
 7041:             {
 7042:             *errorcodeptr = ERR48;
 7043:             goto FAILED;
 7044:             }
 7045: 
 7046:           /* The name table does not exist in the first pass; instead we must
 7047:           scan the list of names encountered so far in order to get the
 7048:           number. If the name is not found, set the value to 0 for a forward
 7049:           reference. */
 7050: 
 7051:           ng = cd->named_groups;
 7052:           for (i = 0; i < cd->names_found; i++, ng++)
 7053:             {
 7054:             if (namelen == ng->length &&
 7055:                 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
 7056:               break;
 7057:             }
 7058:           recno = (i < cd->names_found)? ng->number : 0;
 7059: 
 7060:           /* Count named back references. */
 7061: 
 7062:           if (!is_recurse) cd->namedrefcount++;
 7063:           }
 7064: 
 7065:         /* In the real compile, search the name table. We check the name
 7066:         first, and then check that we have reached the end of the name in the
 7067:         table. That way, if the name is longer than any in the table, the
 7068:         comparison will fail without reading beyond the table entry. */
 7069: 
 7070:         else
 7071:           {
 7072:           slot = cd->name_table;
 7073:           for (i = 0; i < cd->names_found; i++)
 7074:             {
 7075:             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
 7076:                 slot[IMM2_SIZE+namelen] == 0)
 7077:               break;
 7078:             slot += cd->name_entry_size;
 7079:             }
 7080: 
 7081:           if (i < cd->names_found)
 7082:             {
 7083:             recno = GET2(slot, 0);
 7084:             }
 7085:           else
 7086:             {
 7087:             *errorcodeptr = ERR15;
 7088:             goto FAILED;
 7089:             }
 7090:           }
 7091: 
 7092:         /* In both phases, for recursions, we can now go to the code that
 7093:         handles numerical recursion. */
 7094: 
 7095:         if (is_recurse) goto HANDLE_RECURSION;
 7096: 
 7097:         /* In the second pass we must see if the name is duplicated. If so, we
 7098:         generate a different opcode. */
 7099: 
 7100:         if (lengthptr == NULL && cd->dupnames)
 7101:           {
 7102:           int count = 1;
 7103:           unsigned int index = i;
 7104:           pcre_uchar *cslot = slot + cd->name_entry_size;
 7105: 
 7106:           for (i++; i < cd->names_found; i++)
 7107:             {
 7108:             if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
 7109:             count++;
 7110:             cslot += cd->name_entry_size;
 7111:             }
 7112: 
 7113:           if (count > 1)
 7114:             {
 7115:             if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
 7116:             previous = code;
 7117:             *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
 7118:             PUT2INC(code, 0, index);
 7119:             PUT2INC(code, 0, count);
 7120: 
 7121:             /* Process each potentially referenced group. */
 7122: 
 7123:             for (; slot < cslot; slot += cd->name_entry_size)
 7124:               {
 7125:               open_capitem *oc;
 7126:               recno = GET2(slot, 0);
 7127:               cd->backref_map |= (recno < 32)? (1 << recno) : 1;
 7128:               if (recno > cd->top_backref) cd->top_backref = recno;
 7129: 
 7130:               /* Check to see if this back reference is recursive, that is, it
 7131:               is inside the group that it references. A flag is set so that the
 7132:               group can be made atomic. */
 7133: 
 7134:               for (oc = cd->open_caps; oc != NULL; oc = oc->next)
 7135:                 {
 7136:                 if (oc->number == recno)
 7137:                   {
 7138:                   oc->flag = TRUE;
 7139:                   break;
 7140:                   }
 7141:                 }
 7142:               }
 7143: 
 7144:             continue;  /* End of back ref handling */
 7145:             }
 7146:           }
 7147: 
 7148:         /* First pass, or a non-duplicated name. */
 7149: 
 7150:         goto HANDLE_REFERENCE;
 7151: 
 7152: 
 7153:         /* ------------------------------------------------------------ */
 7154:         case CHAR_R:              /* Recursion */
 7155:         ptr++;                    /* Same as (?0)      */
 7156:         /* Fall through */
 7157: 
 7158: 
 7159:         /* ------------------------------------------------------------ */
 7160:         case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
 7161:         case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
 7162:         case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
 7163:           {
 7164:           const pcre_uchar *called;
 7165:           terminator = CHAR_RIGHT_PARENTHESIS;
 7166: 
 7167:           /* Come here from the \g<...> and \g'...' code (Oniguruma
 7168:           compatibility). However, the syntax has been checked to ensure that
 7169:           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
 7170:           be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
 7171:           ever be taken. */
 7172: 
 7173:           HANDLE_NUMERICAL_RECURSION:
 7174: 
 7175:           if ((refsign = *ptr) == CHAR_PLUS)
 7176:             {
 7177:             ptr++;
 7178:             if (!IS_DIGIT(*ptr))
 7179:               {
 7180:               *errorcodeptr = ERR63;
 7181:               goto FAILED;
 7182:               }
 7183:             }
 7184:           else if (refsign == CHAR_MINUS)
 7185:             {
 7186:             if (!IS_DIGIT(ptr[1]))
 7187:               goto OTHER_CHAR_AFTER_QUERY;
 7188:             ptr++;
 7189:             }
 7190: 
 7191:           recno = 0;
 7192:           while(IS_DIGIT(*ptr))
 7193:             recno = recno * 10 + *ptr++ - CHAR_0;
 7194: 
 7195:           if (*ptr != (pcre_uchar)terminator)
 7196:             {
 7197:             *errorcodeptr = ERR29;
 7198:             goto FAILED;
 7199:             }
 7200: 
 7201:           if (refsign == CHAR_MINUS)
 7202:             {
 7203:             if (recno == 0)
 7204:               {
 7205:               *errorcodeptr = ERR58;
 7206:               goto FAILED;
 7207:               }
 7208:             recno = cd->bracount - recno + 1;
 7209:             if (recno <= 0)
 7210:               {
 7211:               *errorcodeptr = ERR15;
 7212:               goto FAILED;
 7213:               }
 7214:             }
 7215:           else if (refsign == CHAR_PLUS)
 7216:             {
 7217:             if (recno == 0)
 7218:               {
 7219:               *errorcodeptr = ERR58;
 7220:               goto FAILED;
 7221:               }
 7222:             recno += cd->bracount;
 7223:             }
 7224: 
 7225:           /* Come here from code above that handles a named recursion */
 7226: 
 7227:           HANDLE_RECURSION:
 7228: 
 7229:           previous = code;
 7230:           called = cd->start_code;
 7231: 
 7232:           /* When we are actually compiling, find the bracket that is being
 7233:           referenced. Temporarily end the regex in case it doesn't exist before
 7234:           this point. If we end up with a forward reference, first check that
 7235:           the bracket does occur later so we can give the error (and position)
 7236:           now. Then remember this forward reference in the workspace so it can
 7237:           be filled in at the end. */
 7238: 
 7239:           if (lengthptr == NULL)
 7240:             {
 7241:             *code = OP_END;
 7242:             if (recno != 0)
 7243:               called = PRIV(find_bracket)(cd->start_code, utf, recno);
 7244: 
 7245:             /* Forward reference */
 7246: 
 7247:             if (called == NULL)
 7248:               {
 7249:               if (recno > cd->final_bracount)
 7250:                 {
 7251:                 *errorcodeptr = ERR15;
 7252:                 goto FAILED;
 7253:                 }
 7254: 
 7255:               /* Fudge the value of "called" so that when it is inserted as an
 7256:               offset below, what is actually inserted is the reference number
 7257:               of the group. Then remember the forward reference. */
 7258: 
 7259:               called = cd->start_code + recno;
 7260:               if (cd->hwm >= cd->start_workspace + cd->workspace_size -
 7261:                   WORK_SIZE_SAFETY_MARGIN)
 7262:                 {
 7263:                 *errorcodeptr = expand_workspace(cd);
 7264:                 if (*errorcodeptr != 0) goto FAILED;
 7265:                 }
 7266:               PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
 7267:               }
 7268: 
 7269:             /* If not a forward reference, and the subpattern is still open,
 7270:             this is a recursive call. We check to see if this is a left
 7271:             recursion that could loop for ever, and diagnose that case. We
 7272:             must not, however, do this check if we are in a conditional
 7273:             subpattern because the condition might be testing for recursion in
 7274:             a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
 7275:             Forever loops are also detected at runtime, so those that occur in
 7276:             conditional subpatterns will be picked up then. */
 7277: 
 7278:             else if (GET(called, 1) == 0 && cond_depth <= 0 &&
 7279:                      could_be_empty(called, code, bcptr, utf, cd))
 7280:               {
 7281:               *errorcodeptr = ERR40;
 7282:               goto FAILED;
 7283:               }
 7284:             }
 7285: 
 7286:           /* Insert the recursion/subroutine item. It does not have a set first
 7287:           character (relevant if it is repeated, because it will then be
 7288:           wrapped with ONCE brackets). */
 7289: 
 7290:           *code = OP_RECURSE;
 7291:           PUT(code, 1, (int)(called - cd->start_code));
 7292:           code += 1 + LINK_SIZE;
 7293:           groupsetfirstchar = FALSE;
 7294:           }
 7295: 
 7296:         /* Can't determine a first byte now */
 7297: 
 7298:         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
 7299:         continue;
 7300: 
 7301: 
 7302:         /* ------------------------------------------------------------ */
 7303:         default:              /* Other characters: check option setting */
 7304:         OTHER_CHAR_AFTER_QUERY:
 7305:         set = unset = 0;
 7306:         optset = &set;
 7307: 
 7308:         while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
 7309:           {
 7310:           switch (*ptr++)
 7311:             {
 7312:             case CHAR_MINUS: optset = &unset; break;
 7313: 
 7314:             case CHAR_J:    /* Record that it changed in the external options */
 7315:             *optset |= PCRE_DUPNAMES;
 7316:             cd->external_flags |= PCRE_JCHANGED;
 7317:             break;
 7318: 
 7319:             case CHAR_i: *optset |= PCRE_CASELESS; break;
 7320:             case CHAR_m: *optset |= PCRE_MULTILINE; break;
 7321:             case CHAR_s: *optset |= PCRE_DOTALL; break;
 7322:             case CHAR_x: *optset |= PCRE_EXTENDED; break;
 7323:             case CHAR_U: *optset |= PCRE_UNGREEDY; break;
 7324:             case CHAR_X: *optset |= PCRE_EXTRA; break;
 7325: 
 7326:             default:  *errorcodeptr = ERR12;
 7327:                       ptr--;    /* Correct the offset */
 7328:                       goto FAILED;
 7329:             }
 7330:           }
 7331: 
 7332:         /* Set up the changed option bits, but don't change anything yet. */
 7333: 
 7334:         newoptions = (options | set) & (~unset);
 7335: 
 7336:         /* If the options ended with ')' this is not the start of a nested
 7337:         group with option changes, so the options change at this level. If this
 7338:         item is right at the start of the pattern, the options can be
 7339:         abstracted and made external in the pre-compile phase, and ignored in
 7340:         the compile phase. This can be helpful when matching -- for instance in
 7341:         caseless checking of required bytes.
 7342: 
 7343:         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
 7344:         definitely *not* at the start of the pattern because something has been
 7345:         compiled. In the pre-compile phase, however, the code pointer can have
 7346:         that value after the start, because it gets reset as code is discarded
 7347:         during the pre-compile. However, this can happen only at top level - if
 7348:         we are within parentheses, the starting BRA will still be present. At
 7349:         any parenthesis level, the length value can be used to test if anything
 7350:         has been compiled at that level. Thus, a test for both these conditions
 7351:         is necessary to ensure we correctly detect the start of the pattern in
 7352:         both phases.
 7353: 
 7354:         If we are not at the pattern start, reset the greedy defaults and the
 7355:         case value for firstchar and reqchar. */
 7356: 
 7357:         if (*ptr == CHAR_RIGHT_PARENTHESIS)
 7358:           {
 7359:           if (code == cd->start_code + 1 + LINK_SIZE &&
 7360:                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
 7361:             {
 7362:             cd->external_options = newoptions;
 7363:             }
 7364:           else
 7365:             {
 7366:             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
 7367:             greedy_non_default = greedy_default ^ 1;
 7368:             req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
 7369:             }
 7370: 
 7371:           /* Change options at this level, and pass them back for use
 7372:           in subsequent branches. */
 7373: 
 7374:           *optionsptr = options = newoptions;
 7375:           previous = NULL;       /* This item can't be repeated */
 7376:           continue;              /* It is complete */
 7377:           }
 7378: 
 7379:         /* If the options ended with ':' we are heading into a nested group
 7380:         with possible change of options. Such groups are non-capturing and are
 7381:         not assertions of any kind. All we need to do is skip over the ':';
 7382:         the newoptions value is handled below. */
 7383: 
 7384:         bravalue = OP_BRA;
 7385:         ptr++;
 7386:         }     /* End of switch for character following (? */
 7387:       }       /* End of (? handling */
 7388: 
 7389:     /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
 7390:     is set, all unadorned brackets become non-capturing and behave like (?:...)
 7391:     brackets. */
 7392: 
 7393:     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
 7394:       {
 7395:       bravalue = OP_BRA;
 7396:       }
 7397: 
 7398:     /* Else we have a capturing group. */
 7399: 
 7400:     else
 7401:       {
 7402:       NUMBERED_GROUP:
 7403:       cd->bracount += 1;
 7404:       PUT2(code, 1+LINK_SIZE, cd->bracount);
 7405:       skipbytes = IMM2_SIZE;
 7406:       }
 7407: 
 7408:     /* Process nested bracketed regex. First check for parentheses nested too
 7409:     deeply. */
 7410: 
 7411:     if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT)
 7412:       {
 7413:       *errorcodeptr = ERR82;
 7414:       goto FAILED;
 7415:       }
 7416: 
 7417:     /* Assertions used not to be repeatable, but this was changed for Perl
 7418:     compatibility, so all kinds can now be repeated. We copy code into a
 7419:     non-register variable (tempcode) in order to be able to pass its address
 7420:     because some compilers complain otherwise. */
 7421: 
 7422:     previous = code;                      /* For handling repetition */
 7423:     *code = bravalue;
 7424:     tempcode = code;
 7425:     tempreqvary = cd->req_varyopt;        /* Save value before bracket */
 7426:     tempbracount = cd->bracount;          /* Save value before bracket */
 7427:     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
 7428: 
 7429:     if (!compile_regex(
 7430:          newoptions,                      /* The complete new option state */
 7431:          &tempcode,                       /* Where to put code (updated) */
 7432:          &ptr,                            /* Input pointer (updated) */
 7433:          errorcodeptr,                    /* Where to put an error message */
 7434:          (bravalue == OP_ASSERTBACK ||
 7435:           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
 7436:          reset_bracount,                  /* True if (?| group */
 7437:          skipbytes,                       /* Skip over bracket number */
 7438:          cond_depth +
 7439:            ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
 7440:          &subfirstchar,                   /* For possible first char */
 7441:          &subfirstcharflags,
 7442:          &subreqchar,                     /* For possible last char */
 7443:          &subreqcharflags,
 7444:          bcptr,                           /* Current branch chain */
 7445:          cd,                              /* Tables block */
 7446:          (lengthptr == NULL)? NULL :      /* Actual compile phase */
 7447:            &length_prevgroup              /* Pre-compile phase */
 7448:          ))
 7449:       goto FAILED;
 7450: 
 7451:     cd->parens_depth -= 1;
 7452: 
 7453:     /* If this was an atomic group and there are no capturing groups within it,
 7454:     generate OP_ONCE_NC instead of OP_ONCE. */
 7455: 
 7456:     if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
 7457:       *code = OP_ONCE_NC;
 7458: 
 7459:     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
 7460:       cd->assert_depth -= 1;
 7461: 
 7462:     /* At the end of compiling, code is still pointing to the start of the
 7463:     group, while tempcode has been updated to point past the end of the group.
 7464:     The pattern pointer (ptr) is on the bracket.
 7465: 
 7466:     If this is a conditional bracket, check that there are no more than
 7467:     two branches in the group, or just one if it's a DEFINE group. We do this
 7468:     in the real compile phase, not in the pre-pass, where the whole group may
 7469:     not be available. */
 7470: 
 7471:     if (bravalue == OP_COND && lengthptr == NULL)
 7472:       {
 7473:       pcre_uchar *tc = code;
 7474:       int condcount = 0;
 7475: 
 7476:       do {
 7477:          condcount++;
 7478:          tc += GET(tc,1);
 7479:          }
 7480:       while (*tc != OP_KET);
 7481: 
 7482:       /* A DEFINE group is never obeyed inline (the "condition" is always
 7483:       false). It must have only one branch. */
 7484: 
 7485:       if (code[LINK_SIZE+1] == OP_DEF)
 7486:         {
 7487:         if (condcount > 1)
 7488:           {
 7489:           *errorcodeptr = ERR54;
 7490:           goto FAILED;
 7491:           }
 7492:         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
 7493:         }
 7494: 
 7495:       /* A "normal" conditional group. If there is just one branch, we must not
 7496:       make use of its firstchar or reqchar, because this is equivalent to an
 7497:       empty second branch. */
 7498: 
 7499:       else
 7500:         {
 7501:         if (condcount > 2)
 7502:           {
 7503:           *errorcodeptr = ERR27;
 7504:           goto FAILED;
 7505:           }
 7506:         if (condcount == 1) subfirstcharflags = subreqcharflags = REQ_NONE;
 7507:         }
 7508:       }
 7509: 
 7510:     /* Error if hit end of pattern */
 7511: 
 7512:     if (*ptr != CHAR_RIGHT_PARENTHESIS)
 7513:       {
 7514:       *errorcodeptr = ERR14;
 7515:       goto FAILED;
 7516:       }
 7517: 
 7518:     /* In the pre-compile phase, update the length by the length of the group,
 7519:     less the brackets at either end. Then reduce the compiled code to just a
 7520:     set of non-capturing brackets so that it doesn't use much memory if it is
 7521:     duplicated by a quantifier.*/
 7522: 
 7523:     if (lengthptr != NULL)
 7524:       {
 7525:       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
 7526:         {
 7527:         *errorcodeptr = ERR20;
 7528:         goto FAILED;
 7529:         }
 7530:       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
 7531:       code++;   /* This already contains bravalue */
 7532:       PUTINC(code, 0, 1 + LINK_SIZE);
 7533:       *code++ = OP_KET;
 7534:       PUTINC(code, 0, 1 + LINK_SIZE);
 7535:       break;    /* No need to waste time with special character handling */
 7536:       }
 7537: 
 7538:     /* Otherwise update the main code pointer to the end of the group. */
 7539: 
 7540:     code = tempcode;
 7541: 
 7542:     /* For a DEFINE group, required and first character settings are not
 7543:     relevant. */
 7544: 
 7545:     if (bravalue == OP_DEF) break;
 7546: 
 7547:     /* Handle updating of the required and first characters for other types of
 7548:     group. Update for normal brackets of all kinds, and conditions with two
 7549:     branches (see code above). If the bracket is followed by a quantifier with
 7550:     zero repeat, we have to back off. Hence the definition of zeroreqchar and
 7551:     zerofirstchar outside the main loop so that they can be accessed for the
 7552:     back off. */
 7553: 
 7554:     zeroreqchar = reqchar;
 7555:     zeroreqcharflags = reqcharflags;
 7556:     zerofirstchar = firstchar;
 7557:     zerofirstcharflags = firstcharflags;
 7558:     groupsetfirstchar = FALSE;
 7559: 
 7560:     if (bravalue >= OP_ONCE)
 7561:       {
 7562:       /* If we have not yet set a firstchar in this branch, take it from the
 7563:       subpattern, remembering that it was set here so that a repeat of more
 7564:       than one can replicate it as reqchar if necessary. If the subpattern has
 7565:       no firstchar, set "none" for the whole branch. In both cases, a zero
 7566:       repeat forces firstchar to "none". */
 7567: 
 7568:       if (firstcharflags == REQ_UNSET)
 7569:         {
 7570:         if (subfirstcharflags >= 0)
 7571:           {
 7572:           firstchar = subfirstchar;
 7573:           firstcharflags = subfirstcharflags;
 7574:           groupsetfirstchar = TRUE;
 7575:           }
 7576:         else firstcharflags = REQ_NONE;
 7577:         zerofirstcharflags = REQ_NONE;
 7578:         }
 7579: 
 7580:       /* If firstchar was previously set, convert the subpattern's firstchar
 7581:       into reqchar if there wasn't one, using the vary flag that was in
 7582:       existence beforehand. */
 7583: 
 7584:       else if (subfirstcharflags >= 0 && subreqcharflags < 0)
 7585:         {
 7586:         subreqchar = subfirstchar;
 7587:         subreqcharflags = subfirstcharflags | tempreqvary;
 7588:         }
 7589: 
 7590:       /* If the subpattern set a required byte (or set a first byte that isn't
 7591:       really the first byte - see above), set it. */
 7592: 
 7593:       if (subreqcharflags >= 0)
 7594:         {
 7595:         reqchar = subreqchar;
 7596:         reqcharflags = subreqcharflags;
 7597:         }
 7598:       }
 7599: 
 7600:     /* For a forward assertion, we take the reqchar, if set. This can be
 7601:     helpful if the pattern that follows the assertion doesn't set a different
 7602:     char. For example, it's useful for /(?=abcde).+/. We can't set firstchar
 7603:     for an assertion, however, because it leads to an incorrect effect for patterns
 7604:     such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead
 7605:     of a firstchar. This is overcome by a scan at the end if there's no
 7606:     firstchar, looking for an asserted first char. */
 7607: 
 7608:     else if (bravalue == OP_ASSERT && subreqcharflags >= 0)
 7609:       {
 7610:       reqchar = subreqchar;
 7611:       reqcharflags = subreqcharflags;
 7612:       }
 7613:     break;     /* End of processing '(' */
 7614: 
 7615: 
 7616:     /* ===================================================================*/
 7617:     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
 7618:     are arranged to be the negation of the corresponding OP_values in the
 7619:     default case when PCRE_UCP is not set. For the back references, the values
 7620:     are the negative of the reference number. Only back references and those types
 7621:     that consume a character may be repeated. We can test for values between
 7622:     ESC_b and ESC_Z for the latter; this may have to change if any new ones are
 7623:     ever created. */
 7624: 
 7625:     case CHAR_BACKSLASH:
 7626:     tempptr = ptr;
 7627:     escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, FALSE);
 7628:     if (*errorcodeptr != 0) goto FAILED;
 7629: 
 7630:     if (escape == 0)                  /* The escape coded a single character */
 7631:       c = ec;
 7632:     else
 7633:       {
 7634:       if (escape == ESC_Q)            /* Handle start of quoted string */
 7635:         {
 7636:         if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
 7637:           ptr += 2;               /* avoid empty string */
 7638:             else inescq = TRUE;
 7639:         continue;
 7640:         }
 7641: 
 7642:       if (escape == ESC_E) continue;  /* Perl ignores an orphan \E */
 7643: 
 7644:       /* For metasequences that actually match a character, we disable the
 7645:       setting of a first character if it hasn't already been set. */
 7646: 
 7647:       if (firstcharflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
 7648:         firstcharflags = REQ_NONE;
 7649: 
 7650:       /* Set values to reset to if this is followed by a zero repeat. */
 7651: 
 7652:       zerofirstchar = firstchar;
 7653:       zerofirstcharflags = firstcharflags;
 7654:       zeroreqchar = reqchar;
 7655:       zeroreqcharflags = reqcharflags;
 7656: 
 7657:       /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
 7658:       is a subroutine call by number (Oniguruma syntax). In fact, the value
 7659:       ESC_g is returned only for these cases. So we don't need to check for <
 7660:       or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
 7661:       -n, and for the Perl syntax \g{name} the result is ESC_k (as
 7662:       that is a synonym for a named back reference). */
 7663: 
 7664:       if (escape == ESC_g)
 7665:         {
 7666:         const pcre_uchar *p;
 7667:         pcre_uint32 cf;
 7668: 
 7669:         save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
 7670:         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
 7671:           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
 7672: 
 7673:         /* These two statements stop the compiler from warning about possibly
 7674:         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
 7675:         fact, because we do the check for a number below, the paths that
 7676:         would actually be in error are never taken. */
 7677: 
 7678:         skipbytes = 0;
 7679:         reset_bracount = FALSE;
 7680: 
 7681:         /* If it's not a signed or unsigned number, treat it as a name. */
 7682: 
 7683:         cf = ptr[1];
 7684:         if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
 7685:           {
 7686:           is_recurse = TRUE;
 7687:           goto NAMED_REF_OR_RECURSE;
 7688:           }
 7689: 
 7690:         /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
 7691:         or a digit. */
 7692: 
 7693:         p = ptr + 2;
 7694:         while (IS_DIGIT(*p)) p++;
 7695:         if (*p != (pcre_uchar)terminator)
 7696:           {
 7697:           *errorcodeptr = ERR57;
 7698:           break;
 7699:           }
 7700:         ptr++;
 7701:         goto HANDLE_NUMERICAL_RECURSION;
 7702:         }
 7703: 
 7704:       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
 7705:       We also support \k{name} (.NET syntax).  */
 7706: 
 7707:       if (escape == ESC_k)
 7708:         {
 7709:         if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
 7710:           ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
 7711:           {
 7712:           *errorcodeptr = ERR69;
 7713:           break;
 7714:           }
 7715:         is_recurse = FALSE;
 7716:         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
 7717:           CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
 7718:           CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
 7719:         goto NAMED_REF_OR_RECURSE;
 7720:         }
 7721: 
 7722:       /* Back references are handled specially; must disable firstchar if
 7723:       not set to cope with cases like (?=(\w+))\1: which would otherwise set
 7724:       ':' later. */
 7725: 
 7726:       if (escape < 0)
 7727:         {
 7728:         open_capitem *oc;
 7729:         recno = -escape;
 7730: 
 7731:         /* Come here from named backref handling when the reference is to a
 7732:         single group (i.e. not to a duplicated name). */
 7733: 
 7734:         HANDLE_REFERENCE:
 7735:         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
 7736:         previous = code;
 7737:         *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
 7738:         PUT2INC(code, 0, recno);
 7739:         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
 7740:         if (recno > cd->top_backref) cd->top_backref = recno;
 7741: 
 7742:         /* Check to see if this back reference is recursive, that is, it
 7743:         is inside the group that it references. A flag is set so that the
 7744:         group can be made atomic. */
 7745: 
 7746:         for (oc = cd->open_caps; oc != NULL; oc = oc->next)
 7747:           {
 7748:           if (oc->number == recno)
 7749:             {
 7750:             oc->flag = TRUE;
 7751:             break;
 7752:             }
 7753:           }
 7754:         }
 7755: 
 7756:       /* So are Unicode property matches, if supported. */
 7757: 
 7758: #ifdef SUPPORT_UCP
 7759:       else if (escape == ESC_P || escape == ESC_p)
 7760:         {
 7761:         BOOL negated;
 7762:         unsigned int ptype = 0, pdata = 0;
 7763:         if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
 7764:           goto FAILED;
 7765:         previous = code;
 7766:         *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
 7767:         *code++ = ptype;
 7768:         *code++ = pdata;
 7769:         }
 7770: #else
 7771: 
 7772:       /* If Unicode properties are not supported, \X, \P, and \p are not
 7773:       allowed. */
 7774: 
 7775:       else if (escape == ESC_X || escape == ESC_P || escape == ESC_p)
 7776:         {
 7777:         *errorcodeptr = ERR45;
 7778:         goto FAILED;
 7779:         }
 7780: #endif
 7781: 
 7782:       /* For the rest (including \X when Unicode properties are supported), we
 7783:       can obtain the OP value by negating the escape value in the default
 7784:       situation when PCRE_UCP is not set. When it *is* set, we substitute
 7785:       Unicode property tests. Note that \b and \B do a one-character
 7786:       lookbehind, and \A also behaves as if it does. */
 7787: 
 7788:       else
 7789:         {
 7790:         if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) &&
 7791:              cd->max_lookbehind == 0)
 7792:           cd->max_lookbehind = 1;
 7793: #ifdef SUPPORT_UCP
 7794:         if (escape >= ESC_DU && escape <= ESC_wu)
 7795:           {
 7796:           nestptr = ptr + 1;                   /* Where to resume */
 7797:           ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
 7798:           }
 7799:         else
 7800: #endif
 7801:         /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
 7802:         so that it works in DFA mode and in lookbehinds. */
 7803: 
 7804:           {
 7805:           previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
 7806:           *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
 7807:           }
 7808:         }
 7809:       continue;
 7810:       }
 7811: 
 7812:     /* We have a data character whose value is in c. In UTF-8 mode it may have
 7813:     a value > 127. We set its representation in the length/buffer, and then
 7814:     handle it as a data character. */
 7815: 
 7816: #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
 7817:     if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
 7818:       mclength = PRIV(ord2utf)(c, mcbuffer);
 7819:     else
 7820: #endif
 7821: 
 7822:      {
 7823:      mcbuffer[0] = c;
 7824:      mclength = 1;
 7825:      }
 7826:     goto ONE_CHAR;
 7827: 
 7828: 
 7829:     /* ===================================================================*/
 7830:     /* Handle a literal character. It is guaranteed not to be whitespace or #
 7831:     when the extended flag is set. If we are in a UTF mode, it may be a
 7832:     multi-unit literal character. */
 7833: 
 7834:     default:
 7835:     NORMAL_CHAR:
 7836:     mclength = 1;
 7837:     mcbuffer[0] = c;
 7838: 
 7839: #ifdef SUPPORT_UTF
 7840:     if (utf && HAS_EXTRALEN(c))
 7841:       ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
 7842: #endif
 7843: 
 7844:     /* At this point we have the character's bytes in mcbuffer, and the length
 7845:     in mclength. When not in UTF-8 mode, the length is always 1. */
 7846: 
 7847:     ONE_CHAR:
 7848:     previous = code;
 7849: 
 7850:     /* For caseless UTF-8 mode when UCP support is available, check whether
 7851:     this character has more than one other case. If so, generate a special
 7852:     OP_PROP item instead of OP_CHARI. */
 7853: 
 7854: #ifdef SUPPORT_UCP
 7855:     if (utf && (options & PCRE_CASELESS) != 0)
 7856:       {
 7857:       GETCHAR(c, mcbuffer);
 7858:       if ((c = UCD_CASESET(c)) != 0)
 7859:         {
 7860:         *code++ = OP_PROP;
 7861:         *code++ = PT_CLIST;
 7862:         *code++ = c;
 7863:         if (firstcharflags == REQ_UNSET)
 7864:           firstcharflags = zerofirstcharflags = REQ_NONE;
 7865:         break;
 7866:         }
 7867:       }
 7868: #endif
 7869: 
 7870:     /* Caseful matches, or not one of the multicase characters. */
 7871: 
 7872:     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
 7873:     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
 7874: 
 7875:     /* Remember if \r or \n were seen */
 7876: 
 7877:     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
 7878:       cd->external_flags |= PCRE_HASCRORLF;
 7879: 
 7880:     /* Set the first and required bytes appropriately. If no previous first
 7881:     byte, set it from this character, but revert to none on a zero repeat.
 7882:     Otherwise, leave the firstchar value alone, and don't change it on a zero
 7883:     repeat. */
 7884: 
 7885:     if (firstcharflags == REQ_UNSET)
 7886:       {
 7887:       zerofirstcharflags = REQ_NONE;
 7888:       zeroreqchar = reqchar;
 7889:       zeroreqcharflags = reqcharflags;
 7890: 
 7891:       /* If the character is more than one byte long, we can set firstchar
 7892:       only if it is not to be matched caselessly. */
 7893: 
 7894:       if (mclength == 1 || req_caseopt == 0)
 7895:         {
 7896:         firstchar = mcbuffer[0] | req_caseopt;
 7897:         firstchar = mcbuffer[0];
 7898:         firstcharflags = req_caseopt;
 7899: 
 7900:         if (mclength != 1)
 7901:           {
 7902:           reqchar = code[-1];
 7903:           reqcharflags = cd->req_varyopt;
 7904:           }
 7905:         }
 7906:       else firstcharflags = reqcharflags = REQ_NONE;
 7907:       }
 7908: 
 7909:     /* firstchar was previously set; we can set reqchar only if the length is
 7910:     1 or the matching is caseful. */
 7911: 
 7912:     else
 7913:       {
 7914:       zerofirstchar = firstchar;
 7915:       zerofirstcharflags = firstcharflags;
 7916:       zeroreqchar = reqchar;
 7917:       zeroreqcharflags = reqcharflags;
 7918:       if (mclength == 1 || req_caseopt == 0)
 7919:         {
 7920:         reqchar = code[-1];
 7921:         reqcharflags = req_caseopt | cd->req_varyopt;
 7922:         }
 7923:       }
 7924: 
 7925:     break;            /* End of literal character handling */
 7926:     }
 7927:   }                   /* end of big loop */
 7928: 
 7929: 
 7930: /* Control never reaches here by falling through, only by a goto for all the
 7931: error states. Pass back the position in the pattern so that it can be displayed
 7932: to the user for diagnosing the error. */
 7933: 
 7934: FAILED:
 7935: *ptrptr = ptr;
 7936: return FALSE;
 7937: }
 7938: 
 7939: 
 7940: 
 7941: /*************************************************
 7942: *     Compile sequence of alternatives           *
 7943: *************************************************/
 7944: 
 7945: /* On entry, ptr is pointing past the bracket character, but on return it
 7946: points to the closing bracket, or vertical bar, or end of string. The code
 7947: variable is pointing at the byte into which the BRA operator has been stored.
 7948: This function is used during the pre-compile phase when we are trying to find
 7949: out the amount of memory needed, as well as during the real compile phase. The
 7950: value of lengthptr distinguishes the two phases.
 7951: 
 7952: Arguments:
 7953:   options           option bits, including any changes for this subpattern
 7954:   codeptr           -> the address of the current code pointer
 7955:   ptrptr            -> the address of the current pattern pointer
 7956:   errorcodeptr      -> pointer to error code variable
 7957:   lookbehind        TRUE if this is a lookbehind assertion
 7958:   reset_bracount    TRUE to reset the count for each branch
 7959:   skipbytes         skip this many bytes at start (for brackets and OP_COND)
 7960:   cond_depth        depth of nesting for conditional subpatterns
 7961:   firstcharptr      place to put the first required character
 7962:   firstcharflagsptr place to put the first character flags, or a negative number
 7963:   reqcharptr        place to put the last required character
 7964:   reqcharflagsptr   place to put the last required character flags, or a negative number
 7965:   bcptr             pointer to the chain of currently open branches
 7966:   cd                points to the data block with tables pointers etc.
 7967:   lengthptr         NULL during the real compile phase
 7968:                     points to length accumulator during pre-compile phase
 7969: 
 7970: Returns:            TRUE on success
 7971: */
 7972: 
 7973: static BOOL
 7974: compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
 7975:   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
 7976:   int cond_depth,
 7977:   pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
 7978:   pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
 7979:   branch_chain *bcptr, compile_data *cd, int *lengthptr)
 7980: {
 7981: const pcre_uchar *ptr = *ptrptr;
 7982: pcre_uchar *code = *codeptr;
 7983: pcre_uchar *last_branch = code;
 7984: pcre_uchar *start_bracket = code;
 7985: pcre_uchar *reverse_count = NULL;
 7986: open_capitem capitem;
 7987: int capnumber = 0;
 7988: pcre_uint32 firstchar, reqchar;
 7989: pcre_int32 firstcharflags, reqcharflags;
 7990: pcre_uint32 branchfirstchar, branchreqchar;
 7991: pcre_int32 branchfirstcharflags, branchreqcharflags;
 7992: int length;
 7993: unsigned int orig_bracount;
 7994: unsigned int max_bracount;
 7995: branch_chain bc;
 7996: 
 7997: bc.outer = bcptr;
 7998: bc.current_branch = code;
 7999: 
 8000: firstchar = reqchar = 0;
 8001: firstcharflags = reqcharflags = REQ_UNSET;
 8002: 
 8003: /* Accumulate the length for use in the pre-compile phase. Start with the
 8004: length of the BRA and KET and any extra bytes that are required at the
 8005: beginning. We accumulate in a local variable to save frequent testing of
 8006: lengthptr for NULL. We cannot do this by looking at the value of code at the
 8007: start and end of each alternative, because compiled items are discarded during
 8008: the pre-compile phase so that the work space is not exceeded. */
 8009: 
 8010: length = 2 + 2*LINK_SIZE + skipbytes;
 8011: 
 8012: /* WARNING: If the above line is changed for any reason, you must also change
 8013: the code that abstracts option settings at the start of the pattern and makes
 8014: them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
 8015: pre-compile phase to find out whether anything has yet been compiled or not. */
 8016: 
 8017: /* If this is a capturing subpattern, add to the chain of open capturing items
 8018: so that we can detect them if (*ACCEPT) is encountered. This is also used to
 8019: detect groups that contain recursive back references to themselves. Note that
 8020: only OP_CBRA need be tested here; changing this opcode to one of its variants,
 8021: e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
 8022: 
 8023: if (*code == OP_CBRA)
 8024:   {
 8025:   capnumber = GET2(code, 1 + LINK_SIZE);
 8026:   capitem.number = capnumber;
 8027:   capitem.next = cd->open_caps;
 8028:   capitem.flag = FALSE;
 8029:   cd->open_caps = &capitem;
 8030:   }
 8031: 
 8032: /* Offset is set zero to mark that this bracket is still open */
 8033: 
 8034: PUT(code, 1, 0);
 8035: code += 1 + LINK_SIZE + skipbytes;
 8036: 
 8037: /* Loop for each alternative branch */
 8038: 
 8039: orig_bracount = max_bracount = cd->bracount;
 8040: for (;;)
 8041:   {
 8042:   /* For a (?| group, reset the capturing bracket count so that each branch
 8043:   uses the same numbers. */
 8044: 
 8045:   if (reset_bracount) cd->bracount = orig_bracount;
 8046: 
 8047:   /* Set up dummy OP_REVERSE if lookbehind assertion */
 8048: 
 8049:   if (lookbehind)
 8050:     {
 8051:     *code++ = OP_REVERSE;
 8052:     reverse_count = code;
 8053:     PUTINC(code, 0, 0);
 8054:     length += 1 + LINK_SIZE;
 8055:     }
 8056: 
 8057:   /* Now compile the branch; in the pre-compile phase its length gets added
 8058:   into the length. */
 8059: 
 8060:   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
 8061:         &branchfirstcharflags, &branchreqchar, &branchreqcharflags, &bc,
 8062:         cond_depth, cd, (lengthptr == NULL)? NULL : &length))
 8063:     {
 8064:     *ptrptr = ptr;
 8065:     return FALSE;
 8066:     }
 8067: 
 8068:   /* Keep the highest bracket count in case (?| was used and some branch
 8069:   has fewer than the rest. */
 8070: 
 8071:   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
 8072: 
 8073:   /* In the real compile phase, there is some post-processing to be done. */
 8074: 
 8075:   if (lengthptr == NULL)
 8076:     {
 8077:     /* If this is the first branch, the firstchar and reqchar values for the
 8078:     branch become the values for the regex. */
 8079: 
 8080:     if (*last_branch != OP_ALT)
 8081:       {
 8082:       firstchar = branchfirstchar;
 8083:       firstcharflags = branchfirstcharflags;
 8084:       reqchar = branchreqchar;
 8085:       reqcharflags = branchreqcharflags;
 8086:       }
 8087: 
 8088:     /* If this is not the first branch, the first char and reqchar have to
 8089:     match the values from all the previous branches, except that if the
 8090:     previous value for reqchar didn't have REQ_VARY set, it can still match,
 8091:     and we set REQ_VARY for the regex. */
 8092: 
 8093:     else
 8094:       {
 8095:       /* If we previously had a firstchar, but it doesn't match the new branch,
 8096:       we have to abandon the firstchar for the regex, but if there was
 8097:       previously no reqchar, it takes on the value of the old firstchar. */
 8098: 
 8099:       if (firstcharflags >= 0 &&
 8100:           (firstcharflags != branchfirstcharflags || firstchar != branchfirstchar))
 8101:         {
 8102:         if (reqcharflags < 0)
 8103:           {
 8104:           reqchar = firstchar;
 8105:           reqcharflags = firstcharflags;
 8106:           }
 8107:         firstcharflags = REQ_NONE;
 8108:         }
 8109: 
 8110:       /* If we (now or from before) have no firstchar, a firstchar from the
 8111:       branch becomes a reqchar if there isn't a branch reqchar. */
 8112: 
 8113:       if (firstcharflags < 0 && branchfirstcharflags >= 0 && branchreqcharflags < 0)
 8114:         {
 8115:         branchreqchar = branchfirstchar;
 8116:         branchreqcharflags = branchfirstcharflags;
 8117:         }
 8118: 
 8119:       /* Now ensure that the reqchars match */
 8120: 
 8121:       if (((reqcharflags & ~REQ_VARY) != (branchreqcharflags & ~REQ_VARY)) ||
 8122:           reqchar != branchreqchar)
 8123:         reqcharflags = REQ_NONE;
 8124:       else
 8125:         {
 8126:         reqchar = branchreqchar;
 8127:         reqcharflags |= branchreqcharflags; /* To "or" REQ_VARY */
 8128:         }
 8129:       }
 8130: 
 8131:     /* If lookbehind, check that this branch matches a fixed-length string, and
 8132:     put the length into the OP_REVERSE item. Temporarily mark the end of the
 8133:     branch with OP_END. If the branch contains OP_RECURSE, the result is -3
 8134:     because there may be forward references that we can't check here. Set a
 8135:     flag to cause another lookbehind check at the end. Why not do it all at the
 8136:     end? Because common, erroneous checks are picked up here and the offset of
 8137:     the problem can be shown. */
 8138: 
 8139:     if (lookbehind)
 8140:       {
 8141:       int fixed_length;
 8142:       *code = OP_END;
 8143:       fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
 8144:         FALSE, cd);
 8145:       DPRINTF(("fixed length = %d\n", fixed_length));
 8146:       if (fixed_length == -3)
 8147:         {
 8148:         cd->check_lookbehind = TRUE;
 8149:         }
 8150:       else if (fixed_length < 0)
 8151:         {
 8152:         *errorcodeptr = (fixed_length == -2)? ERR36 :
 8153:                         (fixed_length == -4)? ERR70: ERR25;
 8154:         *ptrptr = ptr;
 8155:         return FALSE;
 8156:         }
 8157:       else
 8158:         {
 8159:         if (fixed_length > cd->max_lookbehind)
 8160:           cd->max_lookbehind = fixed_length;
 8161:         PUT(reverse_count, 0, fixed_length);
 8162:         }
 8163:       }
 8164:     }
 8165: 
 8166:   /* Reached end of expression, either ')' or end of pattern. In the real
 8167:   compile phase, go back through the alternative branches and reverse the chain
 8168:   of offsets, with the field in the BRA item now becoming an offset to the
 8169:   first alternative. If there are no alternatives, it points to the end of the
 8170:   group. The length in the terminating ket is always the length of the whole
 8171:   bracketed item. Return leaving the pointer at the terminating char. */
 8172: 
 8173:   if (*ptr != CHAR_VERTICAL_LINE)
 8174:     {
 8175:     if (lengthptr == NULL)
 8176:       {
 8177:       int branch_length = (int)(code - last_branch);
 8178:       do
 8179:         {
 8180:         int prev_length = GET(last_branch, 1);
 8181:         PUT(last_branch, 1, branch_length);
 8182:         branch_length = prev_length;
 8183:         last_branch -= branch_length;
 8184:         }
 8185:       while (branch_length > 0);
 8186:       }
 8187: 
 8188:     /* Fill in the ket */
 8189: 
 8190:     *code = OP_KET;
 8191:     PUT(code, 1, (int)(code - start_bracket));
 8192:     code += 1 + LINK_SIZE;
 8193: 
 8194:     /* If it was a capturing subpattern, check to see if it contained any
 8195:     recursive back references. If so, we must wrap it in atomic brackets.
 8196:     In any event, remove the block from the chain. */
 8197: 
 8198:     if (capnumber > 0)
 8199:       {
 8200:       if (cd->open_caps->flag)
 8201:         {
 8202:         memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
 8203:           IN_UCHARS(code - start_bracket));
 8204:         *start_bracket = OP_ONCE;
 8205:         code += 1 + LINK_SIZE;
 8206:         PUT(start_bracket, 1, (int)(code - start_bracket));
 8207:         *code = OP_KET;
 8208:         PUT(code, 1, (int)(code - start_bracket));
 8209:         code += 1 + LINK_SIZE;
 8210:         length += 2 + 2*LINK_SIZE;
 8211:         }
 8212:       cd->open_caps = cd->open_caps->next;
 8213:       }
 8214: 
 8215:     /* Retain the highest bracket number, in case resetting was used. */
 8216: 
 8217:     cd->bracount = max_bracount;
 8218: 
 8219:     /* Set values to pass back */
 8220: 
 8221:     *codeptr = code;
 8222:     *ptrptr = ptr;
 8223:     *firstcharptr = firstchar;
 8224:     *firstcharflagsptr = firstcharflags;
 8225:     *reqcharptr = reqchar;
 8226:     *reqcharflagsptr = reqcharflags;
 8227:     if (lengthptr != NULL)
 8228:       {
 8229:       if (OFLOW_MAX - *lengthptr < length)
 8230:         {
 8231:         *errorcodeptr = ERR20;
 8232:         return FALSE;
 8233:         }
 8234:       *lengthptr += length;
 8235:       }
 8236:     return TRUE;
 8237:     }
 8238: 
 8239:   /* Another branch follows. In the pre-compile phase, we can move the code
 8240:   pointer back to where it was for the start of the first branch. (That is,
 8241:   pretend that each branch is the only one.)
 8242: 
 8243:   In the real compile phase, insert an ALT node. Its length field points back
 8244:   to the previous branch while the bracket remains open. At the end the chain
 8245:   is reversed. It's done like this so that the start of the bracket has a
 8246:   zero offset until it is closed, making it possible to detect recursion. */
 8247: 
 8248:   if (lengthptr != NULL)
 8249:     {
 8250:     code = *codeptr + 1 + LINK_SIZE + skipbytes;
 8251:     length += 1 + LINK_SIZE;
 8252:     }
 8253:   else
 8254:     {
 8255:     *code = OP_ALT;
 8256:     PUT(code, 1, (int)(code - last_branch));
 8257:     bc.current_branch = last_branch = code;
 8258:     code += 1 + LINK_SIZE;
 8259:     }
 8260: 
 8261:   ptr++;
 8262:   }
 8263: /* Control never reaches here */
 8264: }
 8265: 
 8266: 
 8267: 
 8268: 
 8269: /*************************************************
 8270: *          Check for anchored expression         *
 8271: *************************************************/
 8272: 
 8273: /* Try to find out if this is an anchored regular expression. Consider each
 8274: alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
 8275: all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
 8276: it's anchored. However, if this is a multiline pattern, then only OP_SOD will
 8277: be found, because ^ generates OP_CIRCM in that mode.
 8278: 
 8279: We can also consider a regex to be anchored if OP_SOM starts all its branches.
 8280: This is the code for \G, which means "match at start of match position, taking
 8281: into account the match offset".
 8282: 
 8283: A branch is also implicitly anchored if it starts with .* and DOTALL is set,
 8284: because that will try the rest of the pattern at all possible matching points,
 8285: so there is no point trying again.... er ....
 8286: 
 8287: .... except when the .* appears inside capturing parentheses, and there is a
 8288: subsequent back reference to those parentheses. We haven't enough information
 8289: to catch that case precisely.
 8290: 
 8291: At first, the best we could do was to detect when .* was in capturing brackets
 8292: and the highest back reference was greater than or equal to that level.
 8293: However, by keeping a bitmap of the first 31 back references, we can catch some
 8294: of the more common cases more precisely.
 8295: 
 8296: ... A second exception is when the .* appears inside an atomic group, because
 8297: this prevents the number of characters it matches from being adjusted.
 8298: 
 8299: Arguments:
 8300:   code           points to start of expression (the bracket)
 8301:   bracket_map    a bitmap of which brackets we are inside while testing; this
 8302:                   handles up to substring 31; after that we just have to take
 8303:                   the less precise approach
 8304:   cd             points to the compile data block
 8305:   atomcount      atomic group level
 8306: 
 8307: Returns:     TRUE or FALSE
 8308: */
 8309: 
 8310: static BOOL
 8311: is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
 8312:   compile_data *cd, int atomcount)
 8313: {
 8314: do {
 8315:    const pcre_uchar *scode = first_significant_code(
 8316:      code + PRIV(OP_lengths)[*code], FALSE);
 8317:    register int op = *scode;        /* first significant item of this branch */
 8318: 
 8319:    /* Non-capturing brackets */
 8320: 
 8321:    if (op == OP_BRA  || op == OP_BRAPOS ||
 8322:        op == OP_SBRA || op == OP_SBRAPOS)
 8323:      {
 8324:      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
 8325:      }
 8326: 
 8327:    /* Capturing brackets: a group numbered below 32 gets its own bit in the
 8328:    map and all higher-numbered groups share bit 0. The shift is done on an
 8329:    unsigned value because 1 << 31 would overflow a signed int. */
 8330:    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
 8331:             op == OP_SCBRA || op == OP_SCBRAPOS)
 8332:      {
 8333:      int n = GET2(scode, 1+LINK_SIZE);
 8334:      unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1u);
 8335:      if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE;
 8336:      }
 8337: 
 8338:    /* Positive forward assertions and conditions */
 8339: 
 8340:    else if (op == OP_ASSERT || op == OP_COND)
 8341:      {
 8342:      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
 8343:      }
 8344: 
 8345:    /* Atomic groups: scanned with the atomic level raised by one */
 8346: 
 8347:    else if (op == OP_ONCE || op == OP_ONCE_NC)
 8348:      {
 8349:      if (!is_anchored(scode, bracket_map, cd, atomcount + 1))
 8350:        return FALSE;
 8351:      }
 8352: 
 8353:    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
 8354:    it isn't in brackets that are or may be referenced or inside an atomic
 8355:    group. It is also rejected when cd->had_pruneorskip is set. */
 8356: 
 8357:    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
 8358:              op == OP_TYPEPOSSTAR))
 8359:      {
 8360:      if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 ||
 8361:          atomcount > 0 || cd->had_pruneorskip)
 8362:        return FALSE;
 8363:      }
 8364: 
 8365:    /* Check for explicit anchoring */
 8366: 
 8367:    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
 8368: 
 8369:    code += GET(code, 1);            /* follow the link to the next branch */
 8370:    }
 8371: while (*code == OP_ALT);   /* Loop for each alternative */
 8372: return TRUE;
 8373: }
 8373: 
 8374: 
 8375: 
 8376: /*************************************************
 8377: *         Check for starting with ^ or .*        *
 8378: *************************************************/
 8379: 
 8380: /* This is called to find out if every branch starts with ^ or .* so that
 8381: "first char" processing can be done to speed things up in multiline
 8382: matching and for non-DOTALL patterns that start with .* (which must start at
 8383: the beginning or after \n). As in the case of is_anchored() (see above), we
 8384: have to take account of back references to capturing brackets that contain .*
 8385: because in that case we can't make the assumption. Also, the appearance of .*
 8386: inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
 8387: count, because once again the assumption no longer holds.
 8388: 
 8389: Arguments:
 8390:   code           points to start of expression (the bracket)
 8391:   bracket_map    a bitmap of which brackets we are inside while testing; this
 8392:                   handles up to substring 31; after that we just have to take
 8393:                   the less precise approach
 8394:   cd             points to the compile data
 8395:   atomcount      atomic group level
 8396: 
 8397: Returns:         TRUE or FALSE
 8398: */
 8399: 
 8400: static BOOL
 8401: is_startline(const pcre_uchar *code, unsigned int bracket_map,
 8402:   compile_data *cd, int atomcount)
 8403: {
 8404: do {
 8405:    const pcre_uchar *scode = first_significant_code(
 8406:      code + PRIV(OP_lengths)[*code], FALSE);
 8407:    register int op = *scode;        /* first significant item of this branch */
 8408: 
 8409:    /* If we are at the start of a conditional assertion group, *both* the
 8410:    conditional assertion *and* what follows the condition must satisfy the test
 8411:    for start of line. Other kinds of condition fail. Note that there may be an
 8412:    auto-callout at the start of a condition. */
 8413: 
 8414:    if (op == OP_COND)
 8415:      {
 8416:      scode += 1 + LINK_SIZE;
 8417:      if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
 8418:      switch (*scode)
 8419:        {
 8420:        case OP_CREF:
 8421:        case OP_DNCREF:
 8422:        case OP_RREF:
 8423:        case OP_DNRREF:
 8424:        case OP_DEF:
 8425:        case OP_FAIL: return FALSE;  /* not a bracket: has no links to follow */
 8426: 
 8427:        default:     /* Assertion */
 8428:        if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
 8429:        do scode += GET(scode, 1); while (*scode == OP_ALT);
 8430:        scode += 1 + LINK_SIZE;      /* step past the item that ends the group */
 8431:        break;
 8432:        }
 8433:      scode = first_significant_code(scode, FALSE);
 8434:      op = *scode;
 8435:      }
 8436: 
 8437:    /* Non-capturing brackets */
 8438: 
 8439:    if (op == OP_BRA  || op == OP_BRAPOS ||
 8440:        op == OP_SBRA || op == OP_SBRAPOS)
 8441:      {
 8442:      if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
 8443:      }
 8444: 
 8445:    /* Capturing brackets: a group numbered below 32 gets its own bit in the
 8446:    map and all higher-numbered groups share bit 0. The shift is done on an
 8447:    unsigned value because 1 << 31 would overflow a signed int. */
 8448:    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
 8449:             op == OP_SCBRA || op == OP_SCBRAPOS)
 8450:      {
 8451:      int n = GET2(scode, 1+LINK_SIZE);
 8452:      unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1u);
 8453:      if (!is_startline(scode, new_map, cd, atomcount)) return FALSE;
 8454:      }
 8455: 
 8456:    /* Positive forward assertions */
 8457: 
 8458:    else if (op == OP_ASSERT)
 8459:      {
 8460:      if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
 8461:      }
 8462: 
 8463:    /* Atomic brackets: scanned with the atomic level raised by one */
 8464: 
 8465:    else if (op == OP_ONCE || op == OP_ONCE_NC)
 8466:      {
 8467:      if (!is_startline(scode, bracket_map, cd, atomcount + 1)) return FALSE;
 8468:      }
 8469: 
 8470:    /* .* means "start at start or after \n" if it isn't in atomic brackets or
 8471:    brackets that may be referenced, as long as the pattern does not contain
 8472:    *PRUNE or *SKIP, because these break the feature. Consider, for example,
 8473:    /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
 8474:    start of a line. */
 8475: 
 8476:    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
 8477:      {
 8478:      if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
 8479:          atomcount > 0 || cd->had_pruneorskip)
 8480:        return FALSE;
 8481:      }
 8482: 
 8483:    /* Check for explicit circumflex; anything else gives a FALSE result.
 8484:    (Atomic brackets OP_ONCE and OP_ONCE_NC are scanned above with atomcount
 8485:    raised, so that a .* inside them is rejected: the number of characters it
 8486:    matches cannot be adjusted there.) */
 8487: 
 8488:    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
 8489: 
 8490:    /* Move on to the next alternative */
 8491: 
 8492:    code += GET(code, 1);
 8493:    }
 8494: while (*code == OP_ALT);  /* Loop for each alternative */
 8495: return TRUE;
 8496: }
 8496: 
 8497: 
 8498: 
 8499: /*************************************************
 8500: *       Check for asserted fixed first char      *
 8501: *************************************************/
 8502: 
 8503: /* During compilation, the "first char" settings from forward assertions are
 8504: discarded, because they can cause conflicts with actual literals that follow.
 8505: However, if we end up without a first char setting for an unanchored pattern,
 8506: it is worth scanning the regex to see if there is an initial asserted first
 8507: char. If all branches start with the same asserted char, or with a
 8508: non-conditional bracket all of whose alternatives start with the same asserted
 8509: char (recurse ad lib), then we return that char, with the flags set to zero or
 8510: REQ_CASELESS; otherwise return zero with REQ_NONE in the flags.
 8511: 
 8512: Arguments:
 8513:   code       points to start of expression (the bracket)
 8514:   flags      where the first char flags are returned (REQ_NONE if none)
 8515:   inassert   TRUE if in an assertion
 8516: 
 8517: Returns:     the fixed first char, or 0 with REQ_NONE in flags
 8518: */
 8519: 
 8520: static pcre_uint32
 8521: find_firstassertedchar(const pcre_uchar *code, pcre_int32 *flags,
 8522:   BOOL inassert)
 8523: {
 8524: register pcre_uint32 c = 0;
 8525: int cflags = REQ_NONE;             /* negative until a branch has set c */
 8526: 
 8527: *flags = REQ_NONE;                 /* the result for every early return */
 8528: do {
 8529:    pcre_uint32 d;
 8530:    int dflags;
 8531:    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||   /* capturing brackets */
 8532:              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
 8533:    const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl,
 8534:      TRUE);
 8535:    register pcre_uchar op = *scode;
 8536: 
 8537:    switch(op)
 8538:      {
 8539:      default:
 8540:      return 0;
 8541: 
 8542:      case OP_BRA:
 8543:      case OP_BRAPOS:
 8544:      case OP_CBRA:
 8545:      case OP_SCBRA:
 8546:      case OP_CBRAPOS:
 8547:      case OP_SCBRAPOS:
 8548:      case OP_ASSERT:
 8549:      case OP_ONCE:
 8550:      case OP_ONCE_NC:
 8551:      d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);
 8552:      if (dflags < 0)
 8553:        return 0;
 8554:      if (cflags < 0) { c = d; cflags = dflags; } else if (c != d || cflags != dflags) return 0;
 8555:      break;
 8556: 
 8557:      case OP_EXACT:
 8558:      scode += IMM2_SIZE;            /* so that scode[1] below is the character */
 8559:      /* Fall through */
 8560: 
 8561:      case OP_CHAR:
 8562:      case OP_PLUS:
 8563:      case OP_MINPLUS:
 8564:      case OP_POSPLUS:
 8565:      if (!inassert) return 0;       /* only asserted characters are of interest */
 8566:      if (cflags < 0) { c = scode[1]; cflags = 0; }
 8567:        else if (c != scode[1] || cflags != 0) return 0;   /* char or case differs */
 8568:      break;
 8569: 
 8570:      case OP_EXACTI:
 8571:      scode += IMM2_SIZE;            /* so that scode[1] below is the character */
 8572:      /* Fall through */
 8573: 
 8574:      case OP_CHARI:
 8575:      case OP_PLUSI:
 8576:      case OP_MINPLUSI:
 8577:      case OP_POSPLUSI:
 8578:      if (!inassert) return 0;
 8579:      if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
 8580:        else if (c != scode[1] || cflags != REQ_CASELESS) return 0;   /* ditto */
 8581:      break;
 8582:      }
 8583: 
 8584:    code += GET(code, 1);            /* follow the link to the next branch */
 8585:    }
 8586: while (*code == OP_ALT);
 8587: 
 8588: *flags = cflags;
 8589: return c;
 8590: }
 8591: 
 8592: 
 8593: 
 8594: /*************************************************
 8595: *     Add an entry to the name/number table      *
 8596: *************************************************/
 8597: 
 8598: /* This function is called between compiling passes to add an entry to the
 8599: name/number table, maintaining alphabetical order. Checking for permitted
 8600: and forbidden duplicates has already been done.
 8601: 
 8602: Arguments:
 8603:   cd           the compile data block
 8604:   name         the name to add
 8605:   length       the length of the name
 8606:   groupno      the group number
 8607: 
 8608: Returns:       nothing
 8609: */
 8610: 
 8611: static void
 8612: add_name(compile_data *cd, const pcre_uchar *name, int length,
 8613:   unsigned int groupno)
 8614: {
 8615: int pos = 0;
 8616: pcre_uchar *entry = cd->name_table;
 8617: 
 8618: /* Find the first entry that sorts after the new name. Duplicates and later
 8619: names are stepped over, so duplicates stay in pattern order (which is also
 8620: the order of their numbers in the simple case, when (?| is not used). */
 8621: 
 8622: while (pos < cd->names_found)
 8623:   {
 8624:   int order = memcmp(name, entry + IMM2_SIZE, IN_UCHARS(length));
 8625: 
 8626:   /* Equal so far but the entry's name is longer: the new name is a
 8627:   substring of it, so treat the new name as the earlier one. */
 8628: 
 8629:   if (order == 0 && entry[IMM2_SIZE + length] != 0) order = -1;
 8630: 
 8631:   if (order < 0)   /* Open a one-entry gap here and stop searching */
 8632:     {
 8633:     memmove(entry + cd->name_entry_size, entry,
 8634:       IN_UCHARS((cd->names_found - pos) * cd->name_entry_size));
 8635:     break;
 8636:     }
 8637:   entry += cd->name_entry_size;
 8638:   pos++;
 8639:   }
 8640: 
 8641: PUT2(entry, 0, groupno);                             /* group number first */
 8642: memcpy(entry + IMM2_SIZE, name, IN_UCHARS(length));  /* then the name */
 8643: entry[IMM2_SIZE + length] = 0;                       /* and its terminator */
 8644: cd->names_found++;
 8645: }
 8646: 
 8647: 
 8648: 
 8649: /*************************************************
 8650: *        Compile a Regular Expression            *
 8651: *************************************************/
 8652: 
 8653: /* This function takes a string and returns a pointer to a block of store
 8654: holding a compiled version of the expression. The original API for this
 8655: function had no error code return variable; it is retained for backwards
 8656: compatibility. The new function is given a new name.
 8657: 
 8658: Arguments:
 8659:   pattern       the regular expression
 8660:   options       various option bits
 8661:   errorcodeptr  pointer to error code variable (pcre_compile2() only)
 8662:                   can be NULL if you don't want a code value
 8663:   errorptr      pointer to pointer to error text
 8664:   erroroffset   ptr offset in pattern where error was detected
 8665:   tables        pointer to character tables or NULL
 8666: 
 8667: Returns:        pointer to compiled data block, or NULL on error,
 8668:                 with errorptr and erroroffset set
 8669: */
 8670: 
 8671: #if defined COMPILE_PCRE8      /* each variant passes NULL for errorcodeptr */
 8672: PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
 8673: pcre_compile(const char *pattern, int options, const char **errorptr,
 8674:   int *erroroffset, const unsigned char *tables)
 8675: {
 8676: return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
 8677: }
 8678: #elif defined COMPILE_PCRE16
 8679: PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
 8680: pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
 8681:   int *erroroffset, const unsigned char *tables)
 8682: {
 8683: return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
 8684: }
 8685: #elif defined COMPILE_PCRE32
 8686: PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
 8687: pcre32_compile(PCRE_SPTR32 pattern, int options, const char **errorptr,
 8688:   int *erroroffset, const unsigned char *tables)
 8689: {
 8690: return pcre32_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
 8691: }
 8692: #endif                         /* COMPILE_PCRE8 / COMPILE_PCRE16 / COMPILE_PCRE32 */
 8693: 
 8694: 
 8695: #if defined COMPILE_PCRE8
 8696: PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
 8697: pcre_compile2(const char *pattern, int options, int *errorcodeptr,
 8698:   const char **errorptr, int *erroroffset, const unsigned char *tables)
 8699: #elif defined COMPILE_PCRE16
 8700: PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
 8701: pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
 8702:   const char **errorptr, int *erroroffset, const unsigned char *tables)
 8703: #elif defined COMPILE_PCRE32
 8704: PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
 8705: pcre32_compile2(PCRE_SPTR32 pattern, int options, int *errorcodeptr,
 8706:   const char **errorptr, int *erroroffset, const unsigned char *tables)
 8707: #endif
 8708: {
 8709: REAL_PCRE *re;
 8710: int length = 1;  /* For final END opcode */
 8711: pcre_int32 firstcharflags, reqcharflags;
 8712: pcre_uint32 firstchar, reqchar;
 8713: pcre_uint32 limit_match = PCRE_UINT32_MAX;
 8714: pcre_uint32 limit_recursion = PCRE_UINT32_MAX;
 8715: int newline;
 8716: int errorcode = 0;
 8717: int skipatstart = 0;
 8718: BOOL utf;
 8719: BOOL never_utf = FALSE;
 8720: size_t size;
 8721: pcre_uchar *code;
 8722: const pcre_uchar *codestart;
 8723: const pcre_uchar *ptr;
 8724: compile_data compile_block;
 8725: compile_data *cd = &compile_block;
 8726: 
 8727: /* This space is used for "compiling" into during the first phase, when we are
 8728: computing the amount of memory that is needed. Compiled items are thrown away
 8729: as soon as possible, so that a fairly large buffer should be sufficient for
 8730: this purpose. The same space is used in the second phase for remembering where
 8731: to fill in forward references to subpatterns. That may overflow, in which case
 8732: new memory is obtained from malloc(). */
 8733: 
 8734: pcre_uchar cworkspace[COMPILE_WORK_SIZE];
 8735: 
 8736: /* This vector is used for remembering name groups during the pre-compile. In a
 8737: similar way to cworkspace, it can be expanded using malloc() if necessary. */
 8738: 
 8739: named_group named_groups[NAMED_GROUP_LIST_SIZE];
 8740: 
 8741: /* Set this early so that early errors get offset 0. */
 8742: 
 8743: ptr = (const pcre_uchar *)pattern;
 8744: 
 8745: /* We can't pass back an error message if errorptr is NULL; I guess the best we
 8746: can do is just return NULL, but we can set a code value if there is a code
 8747: pointer. */
 8748: 
 8749: if (errorptr == NULL)
 8750:   {
 8751:   if (errorcodeptr != NULL) *errorcodeptr = 99;
 8752:   return NULL;
 8753:   }
 8754: 
 8755: *errorptr = NULL;
 8756: if (errorcodeptr != NULL) *errorcodeptr = ERR0;
 8757: 
 8758: /* However, we can give a message for this error */
 8759: 
 8760: if (erroroffset == NULL)
 8761:   {
 8762:   errorcode = ERR16;
 8763:   goto PCRE_EARLY_ERROR_RETURN2;
 8764:   }
 8765: 
 8766: *erroroffset = 0;
 8767: 
 8768: /* Set up pointers to the individual character tables */
 8769: 
 8770: if (tables == NULL) tables = PRIV(default_tables);
 8771: cd->lcc = tables + lcc_offset;
 8772: cd->fcc = tables + fcc_offset;
 8773: cd->cbits = tables + cbits_offset;
 8774: cd->ctypes = tables + ctypes_offset;
 8775: 
 8776: /* Check that all undefined public option bits are zero */
 8777: 
 8778: if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
 8779:   {
 8780:   errorcode = ERR17;
 8781:   goto PCRE_EARLY_ERROR_RETURN;
 8782:   }
 8783: 
 8784: /* If PCRE_NEVER_UTF is set, remember it. */
 8785: 
 8786: if ((options & PCRE_NEVER_UTF) != 0) never_utf = TRUE;
 8787: 
 8788: /* Check for global one-time settings at the start of the pattern, and remember
 8789: the offset for later. */
 8790: 
 8791: cd->external_flags = 0;   /* Initialize here for LIMIT_MATCH/RECURSION */
 8792: 
 8793: while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
 8794:        ptr[skipatstart+1] == CHAR_ASTERISK)
 8795:   {
 8796:   int newnl = 0;
 8797:   int newbsr = 0;
 8798: 
 8799: /* For completeness and backward compatibility, (*UTFn) is supported in the
 8800: relevant libraries, but (*UTF) is generic and always supported. Note that
 8801: PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. */
 8802: 
 8803: #ifdef COMPILE_PCRE8
 8804:   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF8_RIGHTPAR, 5) == 0)
 8805:     { skipatstart += 7; options |= PCRE_UTF8; continue; }
 8806: #endif
 8807: #ifdef COMPILE_PCRE16
 8808:   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF16_RIGHTPAR, 6) == 0)
 8809:     { skipatstart += 8; options |= PCRE_UTF16; continue; }
 8810: #endif
 8811: #ifdef COMPILE_PCRE32
 8812:   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF32_RIGHTPAR, 6) == 0)
 8813:     { skipatstart += 8; options |= PCRE_UTF32; continue; }
 8814: #endif
 8815: 
 8816:   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 4) == 0)
 8817:     { skipatstart += 6; options |= PCRE_UTF8; continue; }
 8818:   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
 8819:     { skipatstart += 6; options |= PCRE_UCP; continue; }
 8820:   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_AUTO_POSSESS_RIGHTPAR, 16) == 0)
 8821:     { skipatstart += 18; options |= PCRE_NO_AUTO_POSSESS; continue; }
 8822:   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
 8823:     { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
 8824: 
 8825:   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_MATCH_EQ, 12) == 0)
 8826:     {
 8827:     pcre_uint32 c = 0;
 8828:     int p = skipatstart + 14;
 8829:     while (isdigit(ptr[p]))
 8830:       {
 8831:       if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow */
 8832:       c = c*10 + ptr[p++] - CHAR_0;
 8833:       }
 8834:     if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
 8835:     if (c < limit_match)
 8836:       {
 8837:       limit_match = c;
 8838:       cd->external_flags |= PCRE_MLSET;
 8839:       }
 8840:     skipatstart = p;
 8841:     continue;
 8842:     }
 8843: 
 8844:   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_RECURSION_EQ, 16) == 0)
 8845:     {
 8846:     pcre_uint32 c = 0;
 8847:     int p = skipatstart + 18;
 8848:     while (isdigit(ptr[p]))
 8849:       {
 8850:       if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow check */
 8851:       c = c*10 + ptr[p++] - CHAR_0;
 8852:       }
 8853:     if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
 8854:     if (c < limit_recursion)
 8855:       {
 8856:       limit_recursion = c;
 8857:       cd->external_flags |= PCRE_RLSET;
 8858:       }
 8859:     skipatstart = p;
 8860:     continue;
 8861:     }
 8862: 
 8863:   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0)
 8864:     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
 8865:   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3)  == 0)
 8866:     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
 8867:   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5)  == 0)
 8868:     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
 8869:   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0)
 8870:     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
 8871:   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0)
 8872:     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
 8873: 
 8874:   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
 8875:     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
 8876:   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
 8877:     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
 8878: 
 8879:   if (newnl != 0)
 8880:     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
 8881:   else if (newbsr != 0)
 8882:     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
 8883:   else break;
 8884:   }
 8885: 
 8886: /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
 8887: utf = (options & PCRE_UTF8) != 0;
 8888: if (utf && never_utf)
 8889:   {
 8890:   errorcode = ERR78;
 8891:   goto PCRE_EARLY_ERROR_RETURN2;
 8892:   }
 8893: 
 8894: /* Can't support UTF unless PCRE has been compiled to include the code. The
 8895: return of an error code from PRIV(valid_utf)() is a new feature, introduced in
 8896: release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
 8897: not used here. */
 8898: 
 8899: #ifdef SUPPORT_UTF
 8900: if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
 8901:      (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
 8902:   {
 8903: #if defined COMPILE_PCRE8
 8904:   errorcode = ERR44;
 8905: #elif defined COMPILE_PCRE16
 8906:   errorcode = ERR74;
 8907: #elif defined COMPILE_PCRE32
 8908:   errorcode = ERR77;
 8909: #endif
 8910:   goto PCRE_EARLY_ERROR_RETURN2;
 8911:   }
 8912: #else
 8913: if (utf)
 8914:   {
 8915:   errorcode = ERR32;
 8916:   goto PCRE_EARLY_ERROR_RETURN;
 8917:   }
 8918: #endif
 8919: 
 8920: /* Can't support UCP unless PCRE has been compiled to include the code. */
 8921: 
 8922: #ifndef SUPPORT_UCP
 8923: if ((options & PCRE_UCP) != 0)
 8924:   {
 8925:   errorcode = ERR67;
 8926:   goto PCRE_EARLY_ERROR_RETURN;
 8927:   }
 8928: #endif
 8929: 
 8930: /* Check validity of \R options. */
 8931: 
 8932: if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
 8933:      (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
 8934:   {
 8935:   errorcode = ERR56;
 8936:   goto PCRE_EARLY_ERROR_RETURN;
 8937:   }
 8938: 
 8939: /* Handle different types of newline. The three bits give seven cases. The
 8940: current code allows for fixed one- or two-byte sequences, plus "any" and
 8941: "anycrlf". */
 8942: 
 8943: switch (options & PCRE_NEWLINE_BITS)
 8944:   {
 8945:   case 0: newline = NEWLINE; break;   /* Build-time default */
 8946:   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
 8947:   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
 8948:   case PCRE_NEWLINE_CR+
 8949:        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
 8950:   case PCRE_NEWLINE_ANY: newline = -1; break;
 8951:   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
 8952:   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
 8953:   }
 8954: 
 8955: if (newline == -2)
 8956:   {
 8957:   cd->nltype = NLTYPE_ANYCRLF;
 8958:   }
 8959: else if (newline < 0)
 8960:   {
 8961:   cd->nltype = NLTYPE_ANY;
 8962:   }
 8963: else
 8964:   {
 8965:   cd->nltype = NLTYPE_FIXED;
 8966:   if (newline > 255)
 8967:     {
 8968:     cd->nllen = 2;
 8969:     cd->nl[0] = (newline >> 8) & 255;
 8970:     cd->nl[1] = newline & 255;
 8971:     }
 8972:   else
 8973:     {
 8974:     cd->nllen = 1;
 8975:     cd->nl[0] = newline;
 8976:     }
 8977:   }
 8978: 
 8979: /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
 8980: references to help in deciding whether (.*) can be treated as anchored or not.
 8981: */
 8982: 
 8983: cd->top_backref = 0;
 8984: cd->backref_map = 0;
 8985: 
 8986: /* Reflect pattern for debugging output */
 8987: 
 8988: DPRINTF(("------------------------------------------------------------------\n"));
 8989: #ifdef PCRE_DEBUG
 8990: print_puchar(stdout, (PCRE_PUCHAR)pattern);
 8991: #endif
 8992: DPRINTF(("\n"));
 8993: 
 8994: /* Pretend to compile the pattern while actually just accumulating the length
 8995: of memory required. This behaviour is triggered by passing a non-NULL final
 8996: argument to compile_regex(). We pass a block of workspace (cworkspace) for it
 8997: to compile parts of the pattern into; the compiled code is discarded when it is
 8998: no longer needed, so hopefully this workspace will never overflow, though there
 8999: is a test for its doing so. */
 9000: 
 9001: cd->bracount = cd->final_bracount = 0;
 9002: cd->names_found = 0;
 9003: cd->name_entry_size = 0;
 9004: cd->name_table = NULL;
 9005: cd->dupnames = FALSE;
 9006: cd->namedrefcount = 0;
 9007: cd->start_code = cworkspace;
 9008: cd->hwm = cworkspace;
 9009: cd->start_workspace = cworkspace;
 9010: cd->workspace_size = COMPILE_WORK_SIZE;
 9011: cd->named_groups = named_groups;
 9012: cd->named_group_list_size = NAMED_GROUP_LIST_SIZE;
 9013: cd->start_pattern = (const pcre_uchar *)pattern;
 9014: cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
 9015: cd->req_varyopt = 0;
 9016: cd->parens_depth = 0;
 9017: cd->assert_depth = 0;
 9018: cd->max_lookbehind = 0;
 9019: cd->external_options = options;
 9020: cd->open_caps = NULL;
 9021: 
 9022: /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
 9023: don't need to look at the result of the function here. The initial options have
 9024: been put into the cd block so that they can be changed if an option setting is
 9025: found within the regex right at the beginning. Bringing initial option settings
 9026: outside can help speed up starting point checks. */
 9027: 
 9028: ptr += skipatstart;
 9029: code = cworkspace;
 9030: *code = OP_BRA;
 9031: 
 9032: (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
 9033:   FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,
 9034:   cd, &length);
 9035: if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
 9036: 
 9037: DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
 9038:   (int)(cd->hwm - cworkspace)));
 9039: 
 9040: if (length > MAX_PATTERN_SIZE)
 9041:   {
 9042:   errorcode = ERR20;
 9043:   goto PCRE_EARLY_ERROR_RETURN;
 9044:   }
 9045: 
 9046: /* If there are groups with duplicate names and there are also references by
 9047: name, we must allow for the possibility of named references to duplicated
 9048: groups. These require an extra data item each. */
 9049: 
 9050: if (cd->dupnames && cd->namedrefcount > 0)
 9051:   length += cd->namedrefcount * IMM2_SIZE * sizeof(pcre_uchar);
 9052: 
 9053: /* Compute the size of the data block for storing the compiled pattern. Integer
 9054: overflow should no longer be possible because nowadays we limit the maximum
 9055: value of cd->names_found and cd->name_entry_size. */
 9056: 
 9057: size = sizeof(REAL_PCRE) +
 9058:   (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
 9059: 
 9060: /* Get the memory. */
 9061: 
 9062: re = (REAL_PCRE *)(PUBL(malloc))(size);
 9063: if (re == NULL)
 9064:   {
 9065:   errorcode = ERR21;
 9066:   goto PCRE_EARLY_ERROR_RETURN;
 9067:   }
 9068: 
 9069: /* Put in the magic number, and save the sizes, initial options, internal
 9070: flags, and character table pointer. NULL is used for the default character
 9071: tables. The nullpad field is at the end; it's there to help in the case when a
 9072: regex compiled on a system with 4-byte pointers is run on another with 8-byte
 9073: pointers. */
 9074: 
 9075: re->magic_number = MAGIC_NUMBER;
 9076: re->size = (int)size;
 9077: re->options = cd->external_options;
 9078: re->flags = cd->external_flags;
 9079: re->limit_match = limit_match;
 9080: re->limit_recursion = limit_recursion;
 9081: re->first_char = 0;
 9082: re->req_char = 0;
 9083: re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
 9084: re->name_entry_size = cd->name_entry_size;
 9085: re->name_count = cd->names_found;
 9086: re->ref_count = 0;
 9087: re->tables = (tables == PRIV(default_tables))? NULL : tables;
 9088: re->nullpad = NULL;
 9089: #ifdef COMPILE_PCRE32
 9090: re->dummy = 0;
 9091: #else
 9092: re->dummy1 = re->dummy2 = re->dummy3 = 0;
 9093: #endif
 9094: 
 9095: /* The starting points of the name/number translation table and of the code are
 9096: passed around in the compile data block. The start/end pattern and initial
 9097: options are already set from the pre-compile phase, as is the name_entry_size
 9098: field. Reset the bracket count and the names_found field. Also reset the hwm
 9099: field; this time it's used for remembering forward references to subpatterns.
 9100: */
 9101: 
 9102: cd->final_bracount = cd->bracount;  /* Save for checking forward references */
 9103: cd->parens_depth = 0;
 9104: cd->assert_depth = 0;
 9105: cd->bracount = 0;
 9106: cd->max_lookbehind = 0;
 9107: cd->name_table = (pcre_uchar *)re + re->name_table_offset;
 9108: codestart = cd->name_table + re->name_entry_size * re->name_count;
 9109: cd->start_code = codestart;
 9110: cd->hwm = (pcre_uchar *)(cd->start_workspace);
 9111: cd->req_varyopt = 0;
 9112: cd->had_accept = FALSE;
 9113: cd->had_pruneorskip = FALSE;
 9114: cd->check_lookbehind = FALSE;
 9115: cd->open_caps = NULL;
 9116: 
 9117: /* If any named groups were found, create the name/number table from the list
 9118: created in the first pass. */
 9119: 
 9120: if (cd->names_found > 0)
 9121:   {
 9122:   int i = cd->names_found;
 9123:   named_group *ng = cd->named_groups;
 9124:   cd->names_found = 0;
 9125:   for (; i > 0; i--, ng++)
 9126:     add_name(cd, ng->name, ng->length, ng->number);
 9127:   if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
 9128:     (PUBL(free))((void *)cd->named_groups);
 9129:   }
 9130: 
 9131: /* Set up a starting, non-extracting bracket, then compile the expression. On
 9132: error, errorcode will be set non-zero, so we don't need to look at the result
 9133: of the function here. */
 9134: 
 9135: ptr = (const pcre_uchar *)pattern + skipatstart;
 9136: code = (pcre_uchar *)codestart;
 9137: *code = OP_BRA;
 9138: (void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
 9139:   &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL, cd, NULL);
 9140: re->top_bracket = cd->bracount;
 9141: re->top_backref = cd->top_backref;
 9142: re->max_lookbehind = cd->max_lookbehind;
 9143: re->flags = cd->external_flags | PCRE_MODE;
 9144: 
 9145: if (cd->had_accept)
 9146:   {
 9147:   reqchar = 0;              /* Must disable after (*ACCEPT) */
 9148:   reqcharflags = REQ_NONE;
 9149:   }
 9150: 
 9151: /* If not reached end of pattern on success, there's an excess bracket. */
 9152: 
 9153: if (errorcode == 0 && *ptr != CHAR_NULL) errorcode = ERR22;
 9154: 
 9155: /* Fill in the terminating state and check for disastrous overflow, but
 9156: if debugging, leave the test till after things are printed out. */
 9157: 
 9158: *code++ = OP_END;
 9159: 
 9160: #ifndef PCRE_DEBUG
 9161: if (code - codestart > length) errorcode = ERR23;
 9162: #endif
 9163: 
 9164: #ifdef SUPPORT_VALGRIND
 9165: /* If the estimated length exceeds the really used length, mark the extra
 9166: allocated memory as unaddressable, so that any out-of-bound reads can be
 9167: detected. */
 9168: VALGRIND_MAKE_MEM_NOACCESS(code, (length - (code - codestart)) * sizeof(pcre_uchar));
 9169: #endif
 9170: 
 9171: /* Fill in any forward references that are required. There may be repeated
 9172: references; optimize for them, as searching a large regex takes time. */
 9173: 
 9174: if (cd->hwm > cd->start_workspace)
 9175:   {
 9176:   int prev_recno = -1;
 9177:   const pcre_uchar *groupptr = NULL;
 9178:   while (errorcode == 0 && cd->hwm > cd->start_workspace)
 9179:     {
 9180:     int offset, recno;
 9181:     cd->hwm -= LINK_SIZE;
 9182:     offset = GET(cd->hwm, 0);
 9183:     recno = GET(codestart, offset);
 9184:     if (recno != prev_recno)
 9185:       {
 9186:       groupptr = PRIV(find_bracket)(codestart, utf, recno);
 9187:       prev_recno = recno;
 9188:       }
 9189:     if (groupptr == NULL) errorcode = ERR53;
 9190:       else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
 9191:     }
 9192:   }
 9193: 
 9194: /* If the workspace had to be expanded, free the new memory. Set the pointer to
 9195: NULL to indicate that forward references have been filled in. */
 9196: 
 9197: if (cd->workspace_size > COMPILE_WORK_SIZE)
 9198:   (PUBL(free))((void *)cd->start_workspace);
 9199: cd->start_workspace = NULL;
 9200: 
 9201: /* Give an error if there's a back reference to a non-existent capturing
 9202: subpattern. */
 9203: 
 9204: if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
 9205: 
 9206: /* Unless disabled, check whether single character iterators can be
 9207: auto-possessified. The function overwrites the appropriate opcode values. */
 9208: 
 9209: if ((options & PCRE_NO_AUTO_POSSESS) == 0)
 9210:   auto_possessify((pcre_uchar *)codestart, utf, cd);
 9211: 
 9212: /* If there were any lookbehind assertions that contained OP_RECURSE
 9213: (recursions or subroutine calls), a flag is set for them to be checked here,
 9214: because they may contain forward references. Actual recursions cannot be fixed
 9215: length, but subroutine calls can. It is done like this so that those without
 9216: OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
 9217: exceptional ones forgo this. We scan the pattern to check that they are fixed
 9218: length, and set their lengths. */
 9219: 
 9220: if (cd->check_lookbehind)
 9221:   {
 9222:   pcre_uchar *cc = (pcre_uchar *)codestart;
 9223: 
 9224:   /* Loop, searching for OP_REVERSE items, and process those that do not have
 9225:   their length set. (Actually, it will also re-process any that have a length
 9226:   of zero, but that is a pathological case, and it does no harm.) When we find
 9227:   one, we temporarily terminate the branch it is in while we scan it. */
 9228: 
 9229:   for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
 9230:        cc != NULL;
 9231:        cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
 9232:     {
 9233:     if (GET(cc, 1) == 0)
 9234:       {
 9235:       int fixed_length;
 9236:       pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
 9237:       int end_op = *be;
 9238:       *be = OP_END;
 9239:       fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
 9240:         cd);
 9241:       *be = end_op;
 9242:       DPRINTF(("fixed length = %d\n", fixed_length));
 9243:       if (fixed_length < 0)
 9244:         {
 9245:         errorcode = (fixed_length == -2)? ERR36 :
 9246:                     (fixed_length == -4)? ERR70 : ERR25;
 9247:         break;
 9248:         }
 9249:       if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
 9250:       PUT(cc, 1, fixed_length);
 9251:       }
 9252:     cc += 1 + LINK_SIZE;
 9253:     }
 9254:   }
 9255: 
 9256: /* Failed to compile, or error while post-processing */
 9257: 
 9258: if (errorcode != 0)
 9259:   {
 9260:   (PUBL(free))(re);
 9261:   PCRE_EARLY_ERROR_RETURN:
 9262:   *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
 9263:   PCRE_EARLY_ERROR_RETURN2:
 9264:   *errorptr = find_error_text(errorcode);
 9265:   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
 9266:   return NULL;
 9267:   }
 9268: 
 9269: /* If the anchored option was not passed, set the flag if we can determine that
 9270: the pattern is anchored by virtue of ^ characters or \A or anything else, such
 9271: as starting with non-atomic .* when DOTALL is set and there are no occurrences
 9272: of *PRUNE or *SKIP.
 9273: 
 9274: Otherwise, if we know what the first byte has to be, save it, because that
 9275: speeds up unanchored matches no end. If not, see if we can set the
 9276: PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
 9277: start with ^, and also when all branches start with non-atomic .* for
 9278: non-DOTALL matches when *PRUNE and SKIP are not present. */
 9279: 
 9280: if ((re->options & PCRE_ANCHORED) == 0)
 9281:   {
 9282:   if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED;
 9283:   else
 9284:     {
 9285:     if (firstcharflags < 0)
 9286:       firstchar = find_firstassertedchar(codestart, &firstcharflags, FALSE);
 9287:     if (firstcharflags >= 0)   /* Remove caseless flag for non-caseable chars */
 9288:       {
 9289: #if defined COMPILE_PCRE8
 9290:       re->first_char = firstchar & 0xff;
 9291: #elif defined COMPILE_PCRE16
 9292:       re->first_char = firstchar & 0xffff;
 9293: #elif defined COMPILE_PCRE32
 9294:       re->first_char = firstchar;
 9295: #endif
 9296:       if ((firstcharflags & REQ_CASELESS) != 0)
 9297:         {
 9298: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
 9299:         /* We ignore non-ASCII first chars in 8 bit mode. */
 9300:         if (utf)
 9301:           {
 9302:           if (re->first_char < 128)
 9303:             {
 9304:             if (cd->fcc[re->first_char] != re->first_char)
 9305:               re->flags |= PCRE_FCH_CASELESS;
 9306:             }
 9307:           else if (UCD_OTHERCASE(re->first_char) != re->first_char)
 9308:             re->flags |= PCRE_FCH_CASELESS;
 9309:           }
 9310:         else
 9311: #endif
 9312:         if (MAX_255(re->first_char)
 9313:             && cd->fcc[re->first_char] != re->first_char)
 9314:           re->flags |= PCRE_FCH_CASELESS;
 9315:         }
 9316: 
 9317:       re->flags |= PCRE_FIRSTSET;
 9318:       }
 9319: 
 9320:     else if (is_startline(codestart, 0, cd, 0)) re->flags |= PCRE_STARTLINE;
 9321:     }
 9322:   }
 9323: 
 9324: /* For an anchored pattern, we use the "required byte" only if it follows a
 9325: variable length item in the regex. Remove the caseless flag for non-caseable
 9326: bytes. */
 9327: 
 9328: if (reqcharflags >= 0 &&
 9329:      ((re->options & PCRE_ANCHORED) == 0 || (reqcharflags & REQ_VARY) != 0))
 9330:   {
 9331: #if defined COMPILE_PCRE8
 9332:   re->req_char = reqchar & 0xff;
 9333: #elif defined COMPILE_PCRE16
 9334:   re->req_char = reqchar & 0xffff;
 9335: #elif defined COMPILE_PCRE32
 9336:   re->req_char = reqchar;
 9337: #endif
 9338:   if ((reqcharflags & REQ_CASELESS) != 0)
 9339:     {
 9340: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
 9341:     /* We ignore non-ASCII first chars in 8 bit mode. */
 9342:     if (utf)
 9343:       {
 9344:       if (re->req_char < 128)
 9345:         {
 9346:         if (cd->fcc[re->req_char] != re->req_char)
 9347:           re->flags |= PCRE_RCH_CASELESS;
 9348:         }
 9349:       else if (UCD_OTHERCASE(re->req_char) != re->req_char)
 9350:         re->flags |= PCRE_RCH_CASELESS;
 9351:       }
 9352:     else
 9353: #endif
 9354:     if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
 9355:       re->flags |= PCRE_RCH_CASELESS;
 9356:     }
 9357: 
 9358:   re->flags |= PCRE_REQCHSET;
 9359:   }
 9360: 
 9361: /* Print out the compiled data if debugging is enabled. This is never the
 9362: case when building a production library. */
 9363: 
 9364: #ifdef PCRE_DEBUG
 9365: printf("Length = %d top_bracket = %d top_backref = %d\n",
 9366:   length, re->top_bracket, re->top_backref);
 9367: 
 9368: printf("Options=%08x\n", re->options);
 9369: 
 9370: if ((re->flags & PCRE_FIRSTSET) != 0)
 9371:   {
 9372:   pcre_uchar ch = re->first_char;
 9373:   const char *caseless =
 9374:     ((re->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)";
 9375:   if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless);
 9376:     else printf("First char = \\x%02x%s\n", ch, caseless);
 9377:   }
 9378: 
 9379: if ((re->flags & PCRE_REQCHSET) != 0)
 9380:   {
 9381:   pcre_uchar ch = re->req_char;
 9382:   const char *caseless =
 9383:     ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)";
 9384:   if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless);
 9385:     else printf("Req char = \\x%02x%s\n", ch, caseless);
 9386:   }
 9387: 
 9388: #if defined COMPILE_PCRE8
 9389: pcre_printint((pcre *)re, stdout, TRUE);
 9390: #elif defined COMPILE_PCRE16
 9391: pcre16_printint((pcre *)re, stdout, TRUE);
 9392: #elif defined COMPILE_PCRE32
 9393: pcre32_printint((pcre *)re, stdout, TRUE);
 9394: #endif
 9395: 
 9396: /* This check is done here in the debugging case so that the code that
 9397: was compiled can be seen. */
 9398: 
 9399: if (code - codestart > length)
 9400:   {
 9401:   (PUBL(free))(re);
 9402:   *errorptr = find_error_text(ERR23);
 9403:   *erroroffset = ptr - (pcre_uchar *)pattern;
 9404:   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
 9405:   return NULL;
 9406:   }
 9407: #endif   /* PCRE_DEBUG */
 9408: 
 9409: /* Check for a pattern that can match an empty string, so that this information
 9410: can be provided to applications. */
 9411: 
 9412: do
 9413:   {
 9414:   if (could_be_empty_branch(codestart, code, utf, cd, NULL))
 9415:     {
 9416:     re->flags |= PCRE_MATCH_EMPTY;
 9417:     break;
 9418:     }
 9419:   codestart += GET(codestart, 1);
 9420:   }
 9421: while (*codestart == OP_ALT);
 9422: 
 9423: #if defined COMPILE_PCRE8
 9424: return (pcre *)re;
 9425: #elif defined COMPILE_PCRE16
 9426: return (pcre16 *)re;
 9427: #elif defined COMPILE_PCRE32
 9428: return (pcre32 *)re;
 9429: #endif
 9430: }
 9431: 
 9432: /* End of pcre_compile.c */
 9433: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>