File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / pcre / pcre_compile.c
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Tue Feb 21 23:05:51 2012 UTC (12 years, 4 months ago) by misho
Branches: pcre, MAIN
CVS tags: v8_21, HEAD
pcre

    1: /*************************************************
    2: *      Perl-Compatible Regular Expressions       *
    3: *************************************************/
    4: 
    5: /* PCRE is a library of functions to support regular expressions whose syntax
    6: and semantics are as close as possible to those of the Perl 5 language.
    7: 
    8:                        Written by Philip Hazel
    9:            Copyright (c) 1997-2011 University of Cambridge
   10: 
   11: -----------------------------------------------------------------------------
   12: Redistribution and use in source and binary forms, with or without
   13: modification, are permitted provided that the following conditions are met:
   14: 
   15:     * Redistributions of source code must retain the above copyright notice,
   16:       this list of conditions and the following disclaimer.
   17: 
   18:     * Redistributions in binary form must reproduce the above copyright
   19:       notice, this list of conditions and the following disclaimer in the
   20:       documentation and/or other materials provided with the distribution.
   21: 
   22:     * Neither the name of the University of Cambridge nor the names of its
   23:       contributors may be used to endorse or promote products derived from
   24:       this software without specific prior written permission.
   25: 
   26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
   30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   36: POSSIBILITY OF SUCH DAMAGE.
   37: -----------------------------------------------------------------------------
   38: */
   39: 
   40: 
   41: /* This module contains the external function pcre_compile(), along with
   42: supporting internal functions that are not used by other modules. */
   43: 
   44: 
   45: #ifdef HAVE_CONFIG_H
   46: #include "config.h"
   47: #endif
   48: 
   49: #define NLBLOCK cd             /* Block containing newline information */
   50: #define PSSTART start_pattern  /* Field containing processed string start */
   51: #define PSEND   end_pattern    /* Field containing processed string end */
   52: 
   53: #include "pcre_internal.h"
   54: 
   55: 
   56: /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
   57: also used by pcretest. PCRE_DEBUG is not defined when building a production
   58: library. */
   59: 
   60: #ifdef PCRE_DEBUG
   61: #include "pcre_printint.src"
   62: #endif
   63: 
   64: 
   65: /* Macro for setting individual bits in class bitmaps. */
   66: 
   67: #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
   68: 
/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. We make it a bit less than
INT_MAX to allow for adding in group terminating bytes, so that we don't have
to check them every time. The headroom left below INT_MAX is 20. */

#define OFLOW_MAX (INT_MAX - 20)
   75: 
   76: 
   77: /*************************************************
   78: *      Code parameters and static tables         *
   79: *************************************************/
   80: 
   81: /* This value specifies the size of stack workspace that is used during the
   82: first pre-compile phase that determines how much memory is required. The regex
   83: is partly compiled into this space, but the compiled parts are discarded as
   84: soon as they can be, so that hopefully there will never be an overrun. The code
   85: does, however, check for an overrun. The largest amount I've seen used is 218,
   86: so this number is very generous.
   87: 
   88: The same workspace is used during the second, actual compile phase for
   89: remembering forward references to groups so that they can be filled in at the
   90: end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
   91: is 4 there is plenty of room for most patterns. However, the memory can get
   92: filled up by repetitions of forward references, for example patterns like
   93: /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
   94: that the workspace is expanded using malloc() in this situation. The value
   95: below is therefore a minimum, and we put a maximum on it for safety. The
   96: minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
   97: kicks in at the same number of forward references in all cases. */
   98: 
/* Minimum workspace size, expressed in terms of LINK_SIZE. */
#define COMPILE_WORK_SIZE (2048*LINK_SIZE)
/* Upper limit on the workspace size: 100 times the minimum. */
#define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
  101: 
/* The overrun tests check for a slightly smaller size so that they detect the
overrun before it actually does run off the end of the data block. The same
value is used by expand_workspace() as the smallest increase that is considered
worth making. */

#define WORK_SIZE_SAFETY_MARGIN (100)
  106: 
  107: 
  108: /* Table for handling escaped characters in the range '0'-'z'. Positive returns
  109: are simple data values; negative values are for special things like \d and so
  110: on. Zero means further processing is needed (for things like \x), or the escape
  111: is invalid. */
  112: 
  113: #ifndef EBCDIC
  114: 
  115: /* This is the "normal" table for ASCII systems or for EBCDIC systems running
  116: in UTF-8 mode. */
  117: 
/* Escape table indexed by (character - '0'), covering '0' to 'z' in code
order. The trailing comment on each row names the two characters that the row's
entries belong to. */
static const short int escapes[] = {
     0,                       0,                          /* 0 1 */
     0,                       0,                          /* 2 3 */
     0,                       0,                          /* 4 5 */
     0,                       0,                          /* 6 7 */
     0,                       0,                          /* 8 9 */
     CHAR_COLON,              CHAR_SEMICOLON,             /* : ; */
     CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,           /* < = */
     CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,         /* > ? */
     CHAR_COMMERCIAL_AT,      -ESC_A,                     /* @ A */
     -ESC_B,                  -ESC_C,                     /* B C */
     -ESC_D,                  -ESC_E,                     /* D E */
     0,                       -ESC_G,                     /* F G */
     -ESC_H,                  0,                          /* H I */
     0,                       -ESC_K,                     /* J K */
     0,                       0,                          /* L M */
     -ESC_N,                  0,                          /* N O */
     -ESC_P,                  -ESC_Q,                     /* P Q */
     -ESC_R,                  -ESC_S,                     /* R S */
     0,                       0,                          /* T U */
     -ESC_V,                  -ESC_W,                     /* V W */
     -ESC_X,                  0,                          /* X Y */
     -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,   /* Z [ */
     CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,  /* backslash ] */
     CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,            /* ^ _ */
     CHAR_GRAVE_ACCENT,       7,                          /* ` a */
     -ESC_b,                  0,                          /* b c */
     -ESC_d,                  ESC_e,                      /* d e */
     ESC_f,                   0,                          /* f g */
     -ESC_h,                  0,                          /* h i */
     0,                       -ESC_k,                     /* j k */
     0,                       0,                          /* l m */
     ESC_n,                   0,                          /* n o */
     -ESC_p,                  0,                          /* p q */
     ESC_r,                   -ESC_s,                     /* r s */
     ESC_tee,                 0,                          /* t u */
     -ESC_v,                  -ESC_w,                     /* v w */
     0,                       0,                          /* x y */
     -ESC_z                                               /* z */
};
  158: 
  159: #else
  160: 
  161: /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
  162: 
/* EBCDIC variant of the escape table, eight entries per row. The comment at
the start of each row appears to be the hexadecimal character code of the
row's first entry (NOTE(review): confirm against the code that indexes the
table). */
static const short int escapes[] = {
/*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
/*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
/*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
/*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
/*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
/*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
/*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
/*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
/*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
/*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
/*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
/*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
/*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
/*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
/*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
/*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
/*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
/*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
/*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
/*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
};
  188: #endif
  189: 
  190: 
  191: /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
  192: searched linearly. Put all the names into a single string, in order to reduce
  193: the number of relocations when a shared library is dynamically linked. The
  194: string is built from string macros so that it works in UTF-8 mode on EBCDIC
  195: platforms. */
  196: 
  197: typedef struct verbitem {
  198:   int   len;                 /* Length of verb name */
  199:   int   op;                  /* Op when no arg, or -1 if arg mandatory */
  200:   int   op_arg;              /* Op when arg present, or -1 if not allowed */
  201: } verbitem;
  202: 
/* All verb names concatenated into one string, in the same order as the rows
of verbs[]. The STRING_xxx0 macros presumably append a terminating zero to each
name (defined outside this file section - confirm in pcre_internal.h); the last
name relies on the literal's own terminator. */
static const char verbnames[] =
  "\0"                       /* Empty name is a shorthand for MARK */
  STRING_MARK0
  STRING_ACCEPT0
  STRING_COMMIT0
  STRING_F0
  STRING_FAIL0
  STRING_PRUNE0
  STRING_SKIP0
  STRING_THEN;
  213: 
/* One row per name in verbnames, in the same order. Each row gives the name
length, the opcode without an argument (-1 if an argument is mandatory) and the
opcode with an argument (-1 if an argument is not allowed). */
static const verbitem verbs[] = {
  { 0, -1,        OP_MARK },        /* (empty name) */
  { 4, -1,        OP_MARK },        /* MARK */
  { 6, OP_ACCEPT, -1 },             /* ACCEPT */
  { 6, OP_COMMIT, -1 },             /* COMMIT */
  { 1, OP_FAIL,   -1 },             /* F */
  { 4, OP_FAIL,   -1 },             /* FAIL */
  { 5, OP_PRUNE,  OP_PRUNE_ARG },   /* PRUNE */
  { 4, OP_SKIP,   OP_SKIP_ARG  },   /* SKIP */
  { 4, OP_THEN,   OP_THEN_ARG  }    /* THEN */
};
  225: 
  226: static const int verbcount = sizeof(verbs)/sizeof(verbitem);
  227: 
  228: 
  229: /* Tables of names of POSIX character classes and their lengths. The names are
  230: now all in a single string, to reduce the number of relocations when a shared
  231: library is dynamically loaded. The list of lengths is terminated by a zero
  232: length entry. The first three must be alpha, lower, upper, as this is assumed
  233: for handling case independence. */
  234: 
/* The POSIX class names, concatenated into one string. The order matters: it
must match posix_name_lengths, and alpha, lower, upper must come first. */
static const char posix_names[] =
  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
  STRING_word0  STRING_xdigit;
  240: 
/* Lengths of the names in posix_names, in the same order (alpha through
xdigit: twelve names of length 5, then word = 4 and xdigit = 6). The final 0
terminates the list. */
static const uschar posix_name_lengths[] = {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
  243: 
  244: /* Table of class bit maps for each POSIX class. Each class is formed from a
  245: base map, with an optional addition or removal of another map. Then, for some
  246: classes, there is some additional tweaking: for [:blank:] the vertical space
  247: characters are removed, and for [:alpha:] and [:alnum:] the underscore
  248: character is removed. The triples in the table consist of the base map offset,
  249: second map offset or -1 if no second map, and a non-negative value for map
  250: addition or a negative value for map subtraction (if there are two maps). The
  251: absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
  252: remove vertical space characters, 2 => remove underscore. */
  253: 
/* Three values per POSIX class, one row per class in the order of posix_names:
base map offset, second map offset (-1 if none), and the add/subtract and
tweak indicator. */
static const int posix_class_maps[] = {
  cbit_word,  cbit_digit, -2,             /* alpha */
  cbit_lower, -1,          0,             /* lower */
  cbit_upper, -1,          0,             /* upper */
  cbit_word,  -1,          2,             /* alnum - word without underscore */
  cbit_print, cbit_cntrl,  0,             /* ascii */
  cbit_space, -1,          1,             /* blank - a GNU extension */
  cbit_cntrl, -1,          0,             /* cntrl */
  cbit_digit, -1,          0,             /* digit */
  cbit_graph, -1,          0,             /* graph */
  cbit_print, -1,          0,             /* print */
  cbit_punct, -1,          0,             /* punct */
  cbit_space, -1,          0,             /* space */
  cbit_word,  -1,          0,             /* word - a Perl extension */
  cbit_xdigit,-1,          0              /* xdigit */
};
  270: 
  271: /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
  272: substitutes must be in the order of the names, defined above, and there are
  273: both positive and negative cases. NULL means no substitute. */
  274: 
  275: #ifdef SUPPORT_UCP
/* Replacement pattern text for the escape named in each trailing comment. Each
entry is a Unicode property escape written as pattern source, so the
backslashes are doubled in the C literals. */
static const uschar *substitutes[] = {
  (uschar *)"\\P{Nd}",    /* \D */
  (uschar *)"\\p{Nd}",    /* \d */
  (uschar *)"\\P{Xsp}",   /* \S */       /* NOTE: Xsp is Perl space */
  (uschar *)"\\p{Xsp}",   /* \s */
  (uschar *)"\\P{Xwd}",   /* \W */
  (uschar *)"\\p{Xwd}"    /* \w */
};
  284: 
/* Replacement pattern text for each POSIX class: fourteen positive entries in
the order of the class names, followed by the fourteen negated entries in the
same order. A NULL entry means the class has no substitute. */
static const uschar *posix_substitutes[] = {
  (uschar *)"\\p{L}",     /* alpha */
  (uschar *)"\\p{Ll}",    /* lower */
  (uschar *)"\\p{Lu}",    /* upper */
  (uschar *)"\\p{Xan}",   /* alnum */
  NULL,                   /* ascii */
  (uschar *)"\\h",        /* blank */
  NULL,                   /* cntrl */
  (uschar *)"\\p{Nd}",    /* digit */
  NULL,                   /* graph */
  NULL,                   /* print */
  NULL,                   /* punct */
  (uschar *)"\\p{Xps}",   /* space */    /* NOTE: Xps is POSIX space */
  (uschar *)"\\p{Xwd}",   /* word */
  NULL,                   /* xdigit */
  /* Negated cases */
  (uschar *)"\\P{L}",     /* ^alpha */
  (uschar *)"\\P{Ll}",    /* ^lower */
  (uschar *)"\\P{Lu}",    /* ^upper */
  (uschar *)"\\P{Xan}",   /* ^alnum */
  NULL,                   /* ^ascii */
  (uschar *)"\\H",        /* ^blank */
  NULL,                   /* ^cntrl */
  (uschar *)"\\P{Nd}",    /* ^digit */
  NULL,                   /* ^graph */
  NULL,                   /* ^print */
  NULL,                   /* ^punct */
  (uschar *)"\\P{Xps}",   /* ^space */   /* NOTE: Xps is POSIX space */
  (uschar *)"\\P{Xwd}",   /* ^word */
  NULL                    /* ^xdigit */
};
  316: #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))
  317: #endif
  318: 
  319: #define STRING(a)  # a
  320: #define XSTRING(s) STRING(s)
  321: 
  322: /* The texts of compile-time error messages. These are "char *" because they
  323: are passed to the outside world. Do not ever re-use any error number, because
  324: they are documented. Always add a new error instead. Messages marked DEAD below
  325: are no longer used. This used to be a table of strings, but in order to reduce
  326: the number of relocations needed when a shared library is loaded dynamically,
  327: it is now one long string. We cannot use a table of offsets, because the
  328: lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
  329: simply count through to the one we want - this isn't a performance issue
  330: because these strings are used only when there is a compilation error.
  331: 
  332: Each substring ends with \0 to insert a null character. This includes the final
  333: substring, so that the whole string ends with \0\0, which can be detected when
  334: counting through. */
  335: 
/* All compile-time error messages in one string, each terminated by \0. The
position of a message in this list is its error number; the numeric comments
mark every fifth position. The terminator that the compiler adds after the last
literal makes the whole string end with two zeros. */
static const char error_texts[] =
  "no error\0"
  "\\ at end of pattern\0"
  "\\c at end of pattern\0"
  "unrecognized character follows \\\0"
  "numbers out of order in {} quantifier\0"
  /* 5 */
  "number too big in {} quantifier\0"
  "missing terminating ] for character class\0"
  "invalid escape sequence in character class\0"
  "range out of order in character class\0"
  "nothing to repeat\0"
  /* 10 */
  "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
  "internal error: unexpected repeat\0"
  "unrecognized character after (? or (?-\0"
  "POSIX named classes are supported only within a class\0"
  "missing )\0"
  /* 15 */
  "reference to non-existent subpattern\0"
  "erroffset passed as NULL\0"
  "unknown option bit(s) set\0"
  "missing ) after comment\0"
  "parentheses nested too deeply\0"  /** DEAD **/
  /* 20 */
  "regular expression is too large\0"
  "failed to get memory\0"
  "unmatched parentheses\0"
  "internal error: code overflow\0"
  "unrecognized character after (?<\0"
  /* 25 */
  "lookbehind assertion is not fixed length\0"
  "malformed number or name after (?(\0"
  "conditional group contains more than two branches\0"
  "assertion expected after (?(\0"
  "(?R or (?[+-]digits must be followed by )\0"
  /* 30 */
  "unknown POSIX class name\0"
  "POSIX collating elements are not supported\0"
  "this version of PCRE is not compiled with PCRE_UTF8 support\0"
  "spare error\0"  /** DEAD **/
  "character value in \\x{...} sequence is too large\0"
  /* 35 */
  "invalid condition (?(0)\0"
  "\\C not allowed in lookbehind assertion\0"
  "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
  "number after (?C is > 255\0"
  "closing ) for (?C expected\0"
  /* 40 */
  "recursive call could loop indefinitely\0"
  "unrecognized character after (?P\0"
  "syntax error in subpattern name (missing terminator)\0"
  "two named subpatterns have the same name\0"
  "invalid UTF-8 string\0"
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled\0"
  "malformed \\P or \\p sequence\0"
  "unknown property name after \\P or \\p\0"
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  /* 50 */
  "repeated subpattern is too long\0"    /** DEAD **/
  "octal value is greater than \\377 (not in UTF-8 mode)\0"
  "internal error: overran compiling workspace\0"
  "internal error: previously-checked referenced subpattern not found\0"
  "DEFINE group contains more than one branch\0"
  /* 55 */
  "repeating a DEFINE group is not allowed\0"  /** DEAD **/
  "inconsistent NEWLINE options\0"
  "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
  "a numbered reference must not be zero\0"
  "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
  /* 60 */
  "(*VERB) not recognized\0"
  "number is too big\0"
  "subpattern name expected\0"
  "digit expected after (?+\0"
  "] is an invalid data character in JavaScript compatibility mode\0"
  /* 65 */
  "different names for subpatterns of the same number are not allowed\0"
  "(*MARK) must have an argument\0"
  "this version of PCRE is not compiled with PCRE_UCP support\0"
  "\\c must be followed by an ASCII character\0"
  "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
  /* 70 */
  "internal error: unknown opcode in find_fixedlength()\0"
  "\\N is not supported in a class\0"
  "too many forward references\0"
  ;
  425: 
  426: /* Table to identify digits and hex digits. This is used when compiling
  427: patterns. Note that the tables in chartables are dependent on the locale, and
  428: may mark arbitrary characters as digits - but the PCRE compiling code expects
  429: to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
  430: a private table here. It costs 256 bytes, but it is a lot faster than doing
  431: character value tests (at least in some simple cases I timed), and in some
  432: applications one wants PCRE to compile efficiently as well as match
  433: efficiently.
  434: 
  435: For convenience, we use the same bit definitions as in chartables:
  436: 
  437:   0x04   decimal digit
  438:   0x08   hexadecimal digit
  439: 
  440: Then we can use ctype_digit and ctype_xdigit in the code. */
  441: 
  442: #ifndef EBCDIC
  443: 
  444: /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
  445: UTF-8 mode. */
  446: 
/* 256-entry table indexed by character code. 0x08 marks a hexadecimal digit
and 0x04 a decimal digit, so the entries for 0-9 hold 0x0c and those for A-F
and a-f hold 0x08; every other entry is zero. */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
  481: 
  482: #else
  483: 
  484: /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
  485: 
/* 256-entry table indexed by EBCDIC character code, using the same bits as the
ASCII version: 0x08 for a hexadecimal digit and 0x04 for a decimal digit. The
last column of each comment gives the row's starting code in hexadecimal for
every second row. */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
  520: 
/* 256-entry table of character-type flags indexed by EBCDIC character code.
NOTE(review): the bit meanings are not defined in this part of the file; they
presumably follow the ctype_xxx bits used by the chartables - confirm against
pcre_internal.h before relying on them. */
static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
  554: #endif
  555: 
  556: 
  557: /* Definition to allow mutual recursion */
  558: 
/* Forward declaration only; the parameters are unnamed here. The definition is
presumably further down this file, outside the part shown here. */
static BOOL
  compile_regex(int, uschar **, const uschar **, int *, BOOL, BOOL, int, int,
    int *, int *, branch_chain *, compile_data *, int *);
  562: 
  563: 
  564: 
  565: /*************************************************
  566: *            Find an error text                  *
  567: *************************************************/
  568: 
  569: /* The error texts are now all in one long string, to save on relocations. As
  570: some of the text is of unknown length, we can't use a table of offsets.
  571: Instead, just count through the strings. This is not a performance issue
  572: because it happens only when there has been a compilation error.
  573: 
  574: Argument:   the error number
  575: Returns:    pointer to the error string
  576: */
  577: 
  578: static const char *
  579: find_error_text(int n)
  580: {
  581: const char *s = error_texts;
  582: for (; n > 0; n--)
  583:   {
  584:   while (*s++ != 0) {};
  585:   if (*s == 0) return "Error text not found (please report)";
  586:   }
  587: return s;
  588: }
  589: 
  590: 
  591: /*************************************************
  592: *           Expand the workspace                 *
  593: *************************************************/
  594: 
  595: /* This function is called during the second compiling phase, if the number of
  596: forward references fills the existing workspace, which is originally a block on
  597: the stack. A larger block is obtained from malloc() unless the ultimate limit
  598: has been reached or the increase will be rather small.
  599: 
  600: Argument: pointer to the compile data block
  601: Returns:  0 if all went well, else an error number
  602: */
  603: 
  604: static int
  605: expand_workspace(compile_data *cd)
  606: {
  607: uschar *newspace;
  608: int newsize = cd->workspace_size * 2;
  609: 
  610: if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
  611: if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
  612:     newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
  613:  return ERR72;
  614: 
  615: newspace = (pcre_malloc)(newsize);
  616: if (newspace == NULL) return ERR21;
  617: 
  618: memcpy(newspace, cd->start_workspace, cd->workspace_size);
  619: cd->hwm = (uschar *)newspace + (cd->hwm - cd->start_workspace);
  620: if (cd->workspace_size > COMPILE_WORK_SIZE)
  621:   (pcre_free)((void *)cd->start_workspace);
  622: cd->start_workspace = newspace;
  623: cd->workspace_size = newsize;
  624: return 0;
  625: }
  626: 
  627: 
  628: 
  629: /*************************************************
  630: *            Check for counted repeat            *
  631: *************************************************/
  632: 
  633: /* This function is called when a '{' is encountered in a place where it might
  634: start a quantifier. It looks ahead to see if it really is a quantifier or not.
  635: It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
  636: where the ddds are digits.
  637: 
  638: Arguments:
  639:   p         pointer to the first char after '{'
  640: 
  641: Returns:    TRUE or FALSE
  642: */
  643: 
  644: static BOOL
  645: is_counted_repeat(const uschar *p)
  646: {
  647: if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
  648: while ((digitab[*p] & ctype_digit) != 0) p++;
  649: if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
  650: 
  651: if (*p++ != CHAR_COMMA) return FALSE;
  652: if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
  653: 
  654: if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
  655: while ((digitab[*p] & ctype_digit) != 0) p++;
  656: 
  657: return (*p == CHAR_RIGHT_CURLY_BRACKET);
  658: }
  659: 
  660: 
  661: 
  662: /*************************************************
  663: *            Handle escapes                      *
  664: *************************************************/
  665: 
  666: /* This function is called when a \ has been encountered. It either returns a
  667: positive value for a simple escape such as \n, or a negative value which
  668: encodes one of the more complicated things such as \d. A backreference to group
  669: n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
  670: UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
  671: ptr is pointing at the \. On exit, it is on the final character of the escape
  672: sequence.
  673: 
  674: Arguments:
  675:   ptrptr         points to the pattern position pointer
  676:   errorcodeptr   points to the errorcode variable
  677:   bracount       number of previous extracting brackets
  678:   options        the options bits
  679:   isclass        TRUE if inside a character class
  680: 
  681: Returns:         zero or positive => a data character
  682:                  negative => a special escape sequence
  683:                  on error, errorcodeptr is set
  684: */
  685: 
  686: static int
  687: check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
  688:   int options, BOOL isclass)
  689: {
  690: BOOL utf8 = (options & PCRE_UTF8) != 0;
  691: const uschar *ptr = *ptrptr + 1;
  692: int c, i;
  693: 
  694: GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
  695: ptr--;                            /* Set pointer back to the last byte */
  696: 
  697: /* If backslash is at the end of the pattern, it's an error. */
  698: 
  699: if (c == 0) *errorcodeptr = ERR1;
  700: 
  701: /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
  702: in a table. A non-zero result is something that can be returned immediately.
  703: Otherwise further processing may be required. */
  704: 
  705: #ifndef EBCDIC  /* ASCII/UTF-8 coding */
  706: else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */
  707: else if ((i = escapes[c - CHAR_0]) != 0) c = i;
  708: 
  709: #else           /* EBCDIC coding */
  710: else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
  711: else if ((i = escapes[c - 0x48]) != 0)  c = i;
  712: #endif
  713: 
  714: /* Escapes that need further processing, or are illegal. */
  715: 
  716: else
  717:   {
  718:   const uschar *oldptr;
  719:   BOOL braced, negated;
  720: 
  721:   switch (c)
  722:     {
  723:     /* A number of Perl escapes are not handled by PCRE. We give an explicit
  724:     error. */
  725: 
  726:     case CHAR_l:
  727:     case CHAR_L:
  728:     *errorcodeptr = ERR37;
  729:     break;
  730: 
  731:     case CHAR_u:
  732:     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
  733:       {
  734:       /* In JavaScript, \u must be followed by four hexadecimal numbers.
  735:       Otherwise it is a lowercase u letter. */
  736:       if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0
  737:            && (digitab[ptr[3]] & ctype_xdigit) != 0 && (digitab[ptr[4]] & ctype_xdigit) != 0)
  738:         {
  739:         c = 0;
  740:         for (i = 0; i < 4; ++i)
  741:           {
  742:           register int cc = *(++ptr);
  743: #ifndef EBCDIC  /* ASCII/UTF-8 coding */
  744:           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
  745:           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
  746: #else           /* EBCDIC coding */
  747:           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
  748:           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
  749: #endif
  750:           }
  751:         }
  752:       }
  753:     else
  754:       *errorcodeptr = ERR37;
  755:     break;
  756: 
  757:     case CHAR_U:
  758:     /* In JavaScript, \U is an uppercase U letter. */
  759:     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
  760:     break;
  761: 
  762:     /* In a character class, \g is just a literal "g". Outside a character
  763:     class, \g must be followed by one of a number of specific things:
  764: 
  765:     (1) A number, either plain or braced. If positive, it is an absolute
  766:     backreference. If negative, it is a relative backreference. This is a Perl
  767:     5.10 feature.
  768: 
  769:     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
  770:     is part of Perl's movement towards a unified syntax for back references. As
  771:     this is synonymous with \k{name}, we fudge it up by pretending it really
  772:     was \k.
  773: 
  774:     (3) For Oniguruma compatibility we also support \g followed by a name or a
  775:     number either in angle brackets or in single quotes. However, these are
  776:     (possibly recursive) subroutine calls, _not_ backreferences. Just return
  777:     the -ESC_g code (cf \k). */
  778: 
  779:     case CHAR_g:
  780:     if (isclass) break;
  781:     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
  782:       {
  783:       c = -ESC_g;
  784:       break;
  785:       }
  786: 
  787:     /* Handle the Perl-compatible cases */
  788: 
  789:     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
  790:       {
  791:       const uschar *p;
  792:       for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
  793:         if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
  794:       if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
  795:         {
  796:         c = -ESC_k;
  797:         break;
  798:         }
  799:       braced = TRUE;
  800:       ptr++;
  801:       }
  802:     else braced = FALSE;
  803: 
  804:     if (ptr[1] == CHAR_MINUS)
  805:       {
  806:       negated = TRUE;
  807:       ptr++;
  808:       }
  809:     else negated = FALSE;
  810: 
  811:     c = 0;
  812:     while ((digitab[ptr[1]] & ctype_digit) != 0)
  813:       c = c * 10 + *(++ptr) - CHAR_0;
  814: 
  815:     if (c < 0)   /* Integer overflow */
  816:       {
  817:       *errorcodeptr = ERR61;
  818:       break;
  819:       }
  820: 
  821:     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
  822:       {
  823:       *errorcodeptr = ERR57;
  824:       break;
  825:       }
  826: 
  827:     if (c == 0)
  828:       {
  829:       *errorcodeptr = ERR58;
  830:       break;
  831:       }
  832: 
  833:     if (negated)
  834:       {
  835:       if (c > bracount)
  836:         {
  837:         *errorcodeptr = ERR15;
  838:         break;
  839:         }
  840:       c = bracount - (c - 1);
  841:       }
  842: 
  843:     c = -(ESC_REF + c);
  844:     break;
  845: 
  846:     /* The handling of escape sequences consisting of a string of digits
  847:     starting with one that is not zero is not straightforward. By experiment,
  848:     the way Perl works seems to be as follows:
  849: 
  850:     Outside a character class, the digits are read as a decimal number. If the
  851:     number is less than 10, or if there are that many previous extracting
  852:     left brackets, then it is a back reference. Otherwise, up to three octal
  853:     digits are read to form an escaped byte. Thus \123 is likely to be octal
  854:     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
  855:     value is greater than 377, the least significant 8 bits are taken. Inside a
  856:     character class, \ followed by a digit is always an octal number. */
  857: 
  858:     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
  859:     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
  860: 
  861:     if (!isclass)
  862:       {
  863:       oldptr = ptr;
  864:       c -= CHAR_0;
  865:       while ((digitab[ptr[1]] & ctype_digit) != 0)
  866:         c = c * 10 + *(++ptr) - CHAR_0;
  867:       if (c < 0)    /* Integer overflow */
  868:         {
  869:         *errorcodeptr = ERR61;
  870:         break;
  871:         }
  872:       if (c < 10 || c <= bracount)
  873:         {
  874:         c = -(ESC_REF + c);
  875:         break;
  876:         }
  877:       ptr = oldptr;      /* Put the pointer back and fall through */
  878:       }
  879: 
  880:     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
  881:     generates a binary zero byte and treats the digit as a following literal.
  882:     Thus we have to pull back the pointer by one. */
  883: 
  884:     if ((c = *ptr) >= CHAR_8)
  885:       {
  886:       ptr--;
  887:       c = 0;
  888:       break;
  889:       }
  890: 
  891:     /* \0 always starts an octal number, but we may drop through to here with a
  892:     larger first octal digit. The original code used just to take the least
  893:     significant 8 bits of octal numbers (I think this is what early Perls used
  894:     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
  895:     than 3 octal digits. */
  896: 
  897:     case CHAR_0:
  898:     c -= CHAR_0;
  899:     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
  900:         c = c * 8 + *(++ptr) - CHAR_0;
  901:     if (!utf8 && c > 255) *errorcodeptr = ERR51;
  902:     break;
  903: 
  904:     /* \x is complicated. \x{ddd} is a character number which can be greater
  905:     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
  906:     treated as a data character. */
  907: 
  908:     case CHAR_x:
  909:     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
  910:       {
  911:       /* In JavaScript, \x must be followed by two hexadecimal numbers.
  912:       Otherwise it is a lowercase x letter. */
  913:       if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0)
  914:         {
  915:         c = 0;
  916:         for (i = 0; i < 2; ++i)
  917:           {
  918:           register int cc = *(++ptr);
  919: #ifndef EBCDIC  /* ASCII/UTF-8 coding */
  920:           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
  921:           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
  922: #else           /* EBCDIC coding */
  923:           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
  924:           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
  925: #endif
  926:           }
  927:         }
  928:       break;
  929:       }
  930: 
  931:     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
  932:       {
  933:       const uschar *pt = ptr + 2;
  934:       int count = 0;
  935: 
  936:       c = 0;
  937:       while ((digitab[*pt] & ctype_xdigit) != 0)
  938:         {
  939:         register int cc = *pt++;
  940:         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
  941:         count++;
  942: 
  943: #ifndef EBCDIC  /* ASCII/UTF-8 coding */
  944:         if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
  945:         c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
  946: #else           /* EBCDIC coding */
  947:         if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
  948:         c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
  949: #endif
  950:         }
  951: 
  952:       if (*pt == CHAR_RIGHT_CURLY_BRACKET)
  953:         {
  954:         if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
  955:         ptr = pt;
  956:         break;
  957:         }
  958: 
  959:       /* If the sequence of hex digits does not end with '}', then we don't
  960:       recognize this construct; fall through to the normal \x handling. */
  961:       }
  962: 
  963:     /* Read just a single-byte hex-defined char */
  964: 
  965:     c = 0;
  966:     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
  967:       {
  968:       int cc;                                  /* Some compilers don't like */
  969:       cc = *(++ptr);                           /* ++ in initializers */
  970: #ifndef EBCDIC  /* ASCII/UTF-8 coding */
  971:       if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
  972:       c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
  973: #else           /* EBCDIC coding */
  974:       if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
  975:       c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
  976: #endif
  977:       }
  978:     break;
  979: 
  980:     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
  981:     An error is given if the byte following \c is not an ASCII character. This
  982:     coding is ASCII-specific, but then the whole concept of \cx is
  983:     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
  984: 
  985:     case CHAR_c:
  986:     c = *(++ptr);
  987:     if (c == 0)
  988:       {
  989:       *errorcodeptr = ERR2;
  990:       break;
  991:       }
  992: #ifndef EBCDIC    /* ASCII/UTF-8 coding */
  993:     if (c > 127)  /* Excludes all non-ASCII in either mode */
  994:       {
  995:       *errorcodeptr = ERR68;
  996:       break;
  997:       }
  998:     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
  999:     c ^= 0x40;
 1000: #else             /* EBCDIC coding */
 1001:     if (c >= CHAR_a && c <= CHAR_z) c += 64;
 1002:     c ^= 0xC0;
 1003: #endif
 1004:     break;
 1005: 
 1006:     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
 1007:     other alphanumeric following \ is an error if PCRE_EXTRA was set;
 1008:     otherwise, for Perl compatibility, it is a literal. This code looks a bit
 1009:     odd, but there used to be some cases other than the default, and there may
 1010:     be again in future, so I haven't "optimized" it. */
 1011: 
 1012:     default:
 1013:     if ((options & PCRE_EXTRA) != 0) switch(c)
 1014:       {
 1015:       default:
 1016:       *errorcodeptr = ERR3;
 1017:       break;
 1018:       }
 1019:     break;
 1020:     }
 1021:   }
 1022: 
 1023: /* Perl supports \N{name} for character names, as well as plain \N for "not
 1024: newline". PCRE does not support \N{name}. However, it does support
 1025: quantification such as \N{2,3}. */
 1026: 
 1027: if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
 1028:      !is_counted_repeat(ptr+2))
 1029:   *errorcodeptr = ERR37;
 1030: 
 1031: /* If PCRE_UCP is set, we change the values for \d etc. */
 1032: 
 1033: if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
 1034:   c -= (ESC_DU - ESC_D);
 1035: 
 1036: /* Set the pointer to the final character before returning. */
 1037: 
 1038: *ptrptr = ptr;
 1039: return c;
 1040: }
 1041: 
 1042: 
 1043: 
 1044: #ifdef SUPPORT_UCP
 1045: /*************************************************
 1046: *               Handle \P and \p                 *
 1047: *************************************************/
 1048: 
 1049: /* This function is called after \P or \p has been encountered, provided that
 1050: PCRE is compiled with support for Unicode properties. On entry, ptrptr is
 1051: pointing at the P or p. On exit, it is pointing at the final character of the
 1052: escape sequence.
 1053: 
 1054: Argument:
 1055:   ptrptr         points to the pattern position pointer
 1056:   negptr         points to a boolean that is set TRUE for negation else FALSE
 1057:   dptr           points to an int that is set to the detailed property value
 1058:   errorcodeptr   points to the error code variable
 1059: 
 1060: Returns:         type value from ucp_type_table, or -1 for an invalid type
 1061: */
 1062: 
 1063: static int
 1064: get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
 1065: {
 1066: int c, i, bot, top;
 1067: const uschar *ptr = *ptrptr;
 1068: char name[32];
 1069: 
 1070: c = *(++ptr);
 1071: if (c == 0) goto ERROR_RETURN;
 1072: 
 1073: *negptr = FALSE;
 1074: 
 1075: /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
 1076: negation. */
 1077: 
 1078: if (c == CHAR_LEFT_CURLY_BRACKET)
 1079:   {
 1080:   if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
 1081:     {
 1082:     *negptr = TRUE;
 1083:     ptr++;
 1084:     }
 1085:   for (i = 0; i < (int)sizeof(name) - 1; i++)
 1086:     {
 1087:     c = *(++ptr);
 1088:     if (c == 0) goto ERROR_RETURN;
 1089:     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
 1090:     name[i] = c;
 1091:     }
 1092:   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
 1093:   name[i] = 0;
 1094:   }
 1095: 
 1096: /* Otherwise there is just one following character */
 1097: 
 1098: else
 1099:   {
 1100:   name[0] = c;
 1101:   name[1] = 0;
 1102:   }
 1103: 
 1104: *ptrptr = ptr;
 1105: 
 1106: /* Search for a recognized property name using binary chop */
 1107: 
 1108: bot = 0;
 1109: top = _pcre_utt_size;
 1110: 
 1111: while (bot < top)
 1112:   {
 1113:   i = (bot + top) >> 1;
 1114:   c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
 1115:   if (c == 0)
 1116:     {
 1117:     *dptr = _pcre_utt[i].value;
 1118:     return _pcre_utt[i].type;
 1119:     }
 1120:   if (c > 0) bot = i + 1; else top = i;
 1121:   }
 1122: 
 1123: *errorcodeptr = ERR47;
 1124: *ptrptr = ptr;
 1125: return -1;
 1126: 
 1127: ERROR_RETURN:
 1128: *errorcodeptr = ERR46;
 1129: *ptrptr = ptr;
 1130: return -1;
 1131: }
 1132: #endif
 1133: 
 1134: 
 1135: 
 1136: 
 1137: /*************************************************
 1138: *         Read repeat counts                     *
 1139: *************************************************/
 1140: 
 1141: /* Read an item of the form {n,m} and return the values. This is called only
 1142: after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
 1143: so the syntax is guaranteed to be correct, but we need to check the values.
 1144: 
 1145: Arguments:
 1146:   p              pointer to first char after '{'
 1147:   minp           pointer to int for min
 1148:   maxp           pointer to int for max
 1149:                  returned as -1 if no max
 1150:   errorcodeptr   points to error code variable
 1151: 
 1152: Returns:         pointer to '}' on success;
 1153:                  current ptr on error, with errorcodeptr set non-zero
 1154: */
 1155: 
 1156: static const uschar *
 1157: read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
 1158: {
 1159: int min = 0;
 1160: int max = -1;
 1161: 
 1162: /* Read the minimum value and do a paranoid check: a negative value indicates
 1163: an integer overflow. */
 1164: 
 1165: while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
 1166: if (min < 0 || min > 65535)
 1167:   {
 1168:   *errorcodeptr = ERR5;
 1169:   return p;
 1170:   }
 1171: 
 1172: /* Read the maximum value if there is one, and again do a paranoid on its size.
 1173: Also, max must not be less than min. */
 1174: 
 1175: if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
 1176:   {
 1177:   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
 1178:     {
 1179:     max = 0;
 1180:     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
 1181:     if (max < 0 || max > 65535)
 1182:       {
 1183:       *errorcodeptr = ERR5;
 1184:       return p;
 1185:       }
 1186:     if (max < min)
 1187:       {
 1188:       *errorcodeptr = ERR4;
 1189:       return p;
 1190:       }
 1191:     }
 1192:   }
 1193: 
 1194: /* Fill in the required variables, and pass back the pointer to the terminating
 1195: '}'. */
 1196: 
 1197: *minp = min;
 1198: *maxp = max;
 1199: return p;
 1200: }
 1201: 
 1202: 
 1203: 
 1204: /*************************************************
 1205: *  Subroutine for finding forward reference      *
 1206: *************************************************/
 1207: 
 1208: /* This recursive function is called only from find_parens() below. The
 1209: top-level call starts at the beginning of the pattern. All other calls must
 1210: start at a parenthesis. It scans along a pattern's text looking for capturing
 1211: subpatterns, and counting them. If it finds a named pattern that matches the
 1212: name it is given, it returns its number. Alternatively, if the name is NULL, it
 1213: returns when it reaches a given numbered subpattern. Recursion is used to keep
 1214: track of subpatterns that reset the capturing group numbers - the (?| feature.
 1215: 
 1216: This function was originally called only from the second pass, in which we know
 1217: that if (?< or (?' or (?P< is encountered, the name will be correctly
 1218: terminated because that is checked in the first pass. There is now one call to
 1219: this function in the first pass, to check for a recursive back reference by
 1220: name (so that we can make the whole group atomic). In this case, we need check
 1221: only up to the current position in the pattern, and that is still OK because
 1222: and previous occurrences will have been checked. To make this work, the test
 1223: for "end of pattern" is a check against cd->end_pattern in the main loop,
 1224: instead of looking for a binary zero. This means that the special first-pass
 1225: call can adjust cd->end_pattern temporarily. (Checks for binary zero while
 1226: processing items within the loop are OK, because afterwards the main loop will
 1227: terminate.)
 1228: 
 1229: Arguments:
 1230:   ptrptr       address of the current character pointer (updated)
 1231:   cd           compile background data
 1232:   name         name to seek, or NULL if seeking a numbered subpattern
 1233:   lorn         name length, or subpattern number if name is NULL
 1234:   xmode        TRUE if we are in /x mode
 1235:   utf8         TRUE if we are in UTF-8 mode
 1236:   count        pointer to the current capturing subpattern number (updated)
 1237: 
 1238: Returns:       the number of the named subpattern, or -1 if not found
 1239: */
 1240: 
 1241: static int
 1242: find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
 1243:   BOOL xmode, BOOL utf8, int *count)
 1244: {
 1245: uschar *ptr = *ptrptr;
 1246: int start_count = *count;
 1247: int hwm_count = start_count;
 1248: BOOL dup_parens = FALSE;
 1249: 
 1250: /* If the first character is a parenthesis, check on the type of group we are
 1251: dealing with. The very first call may not start with a parenthesis. */
 1252: 
 1253: if (ptr[0] == CHAR_LEFT_PARENTHESIS)
 1254:   {
 1255:   /* Handle specials such as (*SKIP) or (*UTF8) etc. */
 1256: 
 1257:   if (ptr[1] == CHAR_ASTERISK) ptr += 2;
 1258: 
 1259:   /* Handle a normal, unnamed capturing parenthesis. */
 1260: 
 1261:   else if (ptr[1] != CHAR_QUESTION_MARK)
 1262:     {
 1263:     *count += 1;
 1264:     if (name == NULL && *count == lorn) return *count;
 1265:     ptr++;
 1266:     }
 1267: 
 1268:   /* All cases now have (? at the start. Remember when we are in a group
 1269:   where the parenthesis numbers are duplicated. */
 1270: 
 1271:   else if (ptr[2] == CHAR_VERTICAL_LINE)
 1272:     {
 1273:     ptr += 3;
 1274:     dup_parens = TRUE;
 1275:     }
 1276: 
 1277:   /* Handle comments; all characters are allowed until a ket is reached. */
 1278: 
 1279:   else if (ptr[2] == CHAR_NUMBER_SIGN)
 1280:     {
 1281:     for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
 1282:     goto FAIL_EXIT;
 1283:     }
 1284: 
 1285:   /* Handle a condition. If it is an assertion, just carry on so that it
 1286:   is processed as normal. If not, skip to the closing parenthesis of the
 1287:   condition (there can't be any nested parens). */
 1288: 
 1289:   else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
 1290:     {
 1291:     ptr += 2;
 1292:     if (ptr[1] != CHAR_QUESTION_MARK)
 1293:       {
 1294:       while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
 1295:       if (*ptr != 0) ptr++;
 1296:       }
 1297:     }
 1298: 
 1299:   /* Start with (? but not a condition. */
 1300: 
 1301:   else
 1302:     {
 1303:     ptr += 2;
 1304:     if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
 1305: 
 1306:     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
 1307: 
 1308:     if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
 1309:         ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
 1310:       {
 1311:       int term;
 1312:       const uschar *thisname;
 1313:       *count += 1;
 1314:       if (name == NULL && *count == lorn) return *count;
 1315:       term = *ptr++;
 1316:       if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
 1317:       thisname = ptr;
 1318:       while (*ptr != term) ptr++;
 1319:       if (name != NULL && lorn == ptr - thisname &&
 1320:           strncmp((const char *)name, (const char *)thisname, lorn) == 0)
 1321:         return *count;
 1322:       term++;
 1323:       }
 1324:     }
 1325:   }
 1326: 
 1327: /* Past any initial parenthesis handling, scan for parentheses or vertical
 1328: bars. Stop if we get to cd->end_pattern. Note that this is important for the
 1329: first-pass call when this value is temporarily adjusted to stop at the current
 1330: position. So DO NOT change this to a test for binary zero. */
 1331: 
 1332: for (; ptr < cd->end_pattern; ptr++)
 1333:   {
 1334:   /* Skip over backslashed characters and also entire \Q...\E */
 1335: 
 1336:   if (*ptr == CHAR_BACKSLASH)
 1337:     {
 1338:     if (*(++ptr) == 0) goto FAIL_EXIT;
 1339:     if (*ptr == CHAR_Q) for (;;)
 1340:       {
 1341:       while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
 1342:       if (*ptr == 0) goto FAIL_EXIT;
 1343:       if (*(++ptr) == CHAR_E) break;
 1344:       }
 1345:     continue;
 1346:     }
 1347: 
 1348:   /* Skip over character classes; this logic must be similar to the way they
 1349:   are handled for real. If the first character is '^', skip it. Also, if the
 1350:   first few characters (either before or after ^) are \Q\E or \E we skip them
 1351:   too. This makes for compatibility with Perl. Note the use of STR macros to
 1352:   encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
 1353: 
 1354:   if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
 1355:     {
 1356:     BOOL negate_class = FALSE;
 1357:     for (;;)
 1358:       {
 1359:       if (ptr[1] == CHAR_BACKSLASH)
 1360:         {
 1361:         if (ptr[2] == CHAR_E)
 1362:           ptr+= 2;
 1363:         else if (strncmp((const char *)ptr+2,
 1364:                  STR_Q STR_BACKSLASH STR_E, 3) == 0)
 1365:           ptr += 4;
 1366:         else
 1367:           break;
 1368:         }
 1369:       else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
 1370:         {
 1371:         negate_class = TRUE;
 1372:         ptr++;
 1373:         }
 1374:       else break;
 1375:       }
 1376: 
 1377:     /* If the next character is ']', it is a data character that must be
 1378:     skipped, except in JavaScript compatibility mode. */
 1379: 
 1380:     if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
 1381:         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
 1382:       ptr++;
 1383: 
 1384:     while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
 1385:       {
 1386:       if (*ptr == 0) return -1;
 1387:       if (*ptr == CHAR_BACKSLASH)
 1388:         {
 1389:         if (*(++ptr) == 0) goto FAIL_EXIT;
 1390:         if (*ptr == CHAR_Q) for (;;)
 1391:           {
 1392:           while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
 1393:           if (*ptr == 0) goto FAIL_EXIT;
 1394:           if (*(++ptr) == CHAR_E) break;
 1395:           }
 1396:         continue;
 1397:         }
 1398:       }
 1399:     continue;
 1400:     }
 1401: 
 1402:   /* Skip comments in /x mode */
 1403: 
 1404:   if (xmode && *ptr == CHAR_NUMBER_SIGN)
 1405:     {
 1406:     ptr++;
 1407:     while (*ptr != 0)
 1408:       {
 1409:       if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
 1410:       ptr++;
 1411: #ifdef SUPPORT_UTF8
 1412:       if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
 1413: #endif
 1414:       }
 1415:     if (*ptr == 0) goto FAIL_EXIT;
 1416:     continue;
 1417:     }
 1418: 
 1419:   /* Check for the special metacharacters */
 1420: 
 1421:   if (*ptr == CHAR_LEFT_PARENTHESIS)
 1422:     {
 1423:     int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
 1424:     if (rc > 0) return rc;
 1425:     if (*ptr == 0) goto FAIL_EXIT;
 1426:     }
 1427: 
 1428:   else if (*ptr == CHAR_RIGHT_PARENTHESIS)
 1429:     {
 1430:     if (dup_parens && *count < hwm_count) *count = hwm_count;
 1431:     goto FAIL_EXIT;
 1432:     }
 1433: 
 1434:   else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
 1435:     {
 1436:     if (*count > hwm_count) hwm_count = *count;
 1437:     *count = start_count;
 1438:     }
 1439:   }
 1440: 
 1441: FAIL_EXIT:
 1442: *ptrptr = ptr;
 1443: return -1;
 1444: }
 1445: 
 1446: 
 1447: 
 1448: 
 1449: /*************************************************
 1450: *       Find forward referenced subpattern       *
 1451: *************************************************/
 1452: 
 1453: /* This function scans along a pattern's text looking for capturing
 1454: subpatterns, and counting them. If it finds a named pattern that matches the
 1455: name it is given, it returns its number. Alternatively, if the name is NULL, it
 1456: returns when it reaches a given numbered subpattern. This is used for forward
 1457: references to subpatterns. We used to be able to start this scan from the
 1458: current compiling point, using the current count value from cd->bracount, and
 1459: do it all in a single loop, but the addition of the possibility of duplicate
 1460: subpattern numbers means that we have to scan from the very start, in order to
 1461: take account of such duplicates, and to use a recursive function to keep track
 1462: of the different types of group.
 1463: 
 1464: Arguments:
 1465:   cd           compile background data
 1466:   name         name to seek, or NULL if seeking a numbered subpattern
 1467:   lorn         name length, or subpattern number if name is NULL
 1468:   xmode        TRUE if we are in /x mode
 1469:   utf8         TRUE if we are in UTF-8 mode
 1470: 
 1471: Returns:       the number of the found subpattern, or -1 if not found
 1472: */
 1473: 
 1474: static int
 1475: find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
 1476:   BOOL utf8)
 1477: {
 1478: uschar *ptr = (uschar *)cd->start_pattern;
 1479: int count = 0;
 1480: int rc;
 1481: 
 1482: /* If the pattern does not start with an opening parenthesis, the first call
 1483: to find_parens_sub() will scan right to the end (if necessary). However, if it
 1484: does start with a parenthesis, find_parens_sub() will return when it hits the
 1485: matching closing parens. That is why we have to have a loop. */
 1486: 
 1487: for (;;)
 1488:   {
 1489:   rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
 1490:   if (rc > 0 || *ptr++ == 0) break;
 1491:   }
 1492: 
 1493: return rc;
 1494: }
 1495: 
 1496: 
 1497: 
 1498: 
 1499: /*************************************************
 1500: *      Find first significant op code            *
 1501: *************************************************/
 1502: 
 1503: /* This is called by several functions that scan a compiled expression looking
 1504: for a fixed first character, or an anchoring op code etc. It skips over things
 1505: that do not influence this. For some calls, it makes sense to skip negative
 1506: forward and all backward assertions, and also the \b assertion; for others it
 1507: does not.
 1508: 
 1509: Arguments:
 1510:   code         pointer to the start of the group
 1511:   skipassert   TRUE if certain assertions are to be skipped
 1512: 
 1513: Returns:       pointer to the first significant opcode
 1514: */
 1515: 
 1516: static const uschar*
 1517: first_significant_code(const uschar *code, BOOL skipassert)
 1518: {
 1519: for (;;)
 1520:   {
 1521:   switch ((int)*code)
 1522:     {
 1523:     case OP_ASSERT_NOT:
 1524:     case OP_ASSERTBACK:
 1525:     case OP_ASSERTBACK_NOT:
 1526:     if (!skipassert) return code;
 1527:     do code += GET(code, 1); while (*code == OP_ALT);
 1528:     code += _pcre_OP_lengths[*code];
 1529:     break;
 1530: 
 1531:     case OP_WORD_BOUNDARY:
 1532:     case OP_NOT_WORD_BOUNDARY:
 1533:     if (!skipassert) return code;
 1534:     /* Fall through */
 1535: 
 1536:     case OP_CALLOUT:
 1537:     case OP_CREF:
 1538:     case OP_NCREF:
 1539:     case OP_RREF:
 1540:     case OP_NRREF:
 1541:     case OP_DEF:
 1542:     code += _pcre_OP_lengths[*code];
 1543:     break;
 1544: 
 1545:     default:
 1546:     return code;
 1547:     }
 1548:   }
 1549: /* Control never reaches here */
 1550: }
 1551: 
 1552: 
 1553: 
 1554: 
 1555: /*************************************************
 1556: *        Find the fixed length of a branch       *
 1557: *************************************************/
 1558: 
 1559: /* Scan a branch and compute the fixed length of subject that will match it,
 1560: if the length is fixed. This is needed for dealing with backward assertions.
 1561: In UTF8 mode, the result is in characters rather than bytes. The branch is
 1562: temporarily terminated with OP_END when this function is called.
 1563: 
 1564: This function is called when a backward assertion is encountered, so that if it
 1565: fails, the error message can point to the correct place in the pattern.
 1566: However, we cannot do this when the assertion contains subroutine calls,
 1567: because they can be forward references. We solve this by remembering this case
 1568: and doing the check at the end; a flag specifies which mode we are running in.
 1569: 
 1570: Arguments:
 1571:   code     points to the start of the pattern (the bracket)
 1572:   utf8     TRUE in UTF-8 mode
 1573:   atend    TRUE if called when the pattern is complete
 1574:   cd       the "compile data" structure
 1575: 
 1576: Returns:   the fixed length,
 1577:              or -1 if there is no fixed length,
 1578:              or -2 if \C was encountered (in UTF-8 mode only)
 1579:              or -3 if an OP_RECURSE item was encountered and atend is FALSE
 1580:              or -4 if an unknown opcode was encountered (internal error)
 1581: */
 1582: 
 1583: static int
 1584: find_fixedlength(uschar *code, BOOL utf8, BOOL atend, compile_data *cd)
 1585: {
 1586: int length = -1;
 1587: 
 1588: register int branchlength = 0;
 1589: register uschar *cc = code + 1 + LINK_SIZE;
 1590: 
 1591: /* Scan along the opcodes for this branch. If we get to the end of the
 1592: branch, check the length against that of the other branches. */
 1593: 
 1594: for (;;)
 1595:   {
 1596:   int d;
 1597:   uschar *ce, *cs;
 1598:   register int op = *cc;
 1599:   switch (op)
 1600:     {
 1601:     /* We only need to continue for OP_CBRA (normal capturing bracket) and
 1602:     OP_BRA (normal non-capturing bracket) because the other variants of these
 1603:     opcodes are all concerned with unlimited repeated groups, which of course
 1604:     are not of fixed length. */
 1605: 
 1606:     case OP_CBRA:
 1607:     case OP_BRA:
 1608:     case OP_ONCE:
 1609:     case OP_ONCE_NC:
 1610:     case OP_COND:
 1611:     d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd);
 1612:     if (d < 0) return d;
 1613:     branchlength += d;
 1614:     do cc += GET(cc, 1); while (*cc == OP_ALT);
 1615:     cc += 1 + LINK_SIZE;
 1616:     break;
 1617: 
 1618:     /* Reached end of a branch; if it's a ket it is the end of a nested call.
 1619:     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
 1620:     an ALT. If it is END it's the end of the outer call. All can be handled by
 1621:     the same code. Note that we must not include the OP_KETRxxx opcodes here,
 1622:     because they all imply an unlimited repeat. */
 1623: 
 1624:     case OP_ALT:
 1625:     case OP_KET:
 1626:     case OP_END:
 1627:     case OP_ACCEPT:
 1628:     case OP_ASSERT_ACCEPT:
 1629:     if (length < 0) length = branchlength;
 1630:       else if (length != branchlength) return -1;
 1631:     if (*cc != OP_ALT) return length;
 1632:     cc += 1 + LINK_SIZE;
 1633:     branchlength = 0;
 1634:     break;
 1635: 
 1636:     /* A true recursion implies not fixed length, but a subroutine call may
 1637:     be OK. If the subroutine is a forward reference, we can't deal with
 1638:     it until the end of the pattern, so return -3. */
 1639: 
 1640:     case OP_RECURSE:
 1641:     if (!atend) return -3;
 1642:     cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
 1643:     do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */
 1644:     if (cc > cs && cc < ce) return -1;                /* Recursion */
 1645:     d = find_fixedlength(cs + 2, utf8, atend, cd);
 1646:     if (d < 0) return d;
 1647:     branchlength += d;
 1648:     cc += 1 + LINK_SIZE;
 1649:     break;
 1650: 
 1651:     /* Skip over assertive subpatterns */
 1652: 
 1653:     case OP_ASSERT:
 1654:     case OP_ASSERT_NOT:
 1655:     case OP_ASSERTBACK:
 1656:     case OP_ASSERTBACK_NOT:
 1657:     do cc += GET(cc, 1); while (*cc == OP_ALT);
 1658:     /* Fall through */
 1659: 
 1660:     /* Skip over things that don't match chars */
 1661: 
 1662:     case OP_MARK:
 1663:     case OP_PRUNE_ARG:
 1664:     case OP_SKIP_ARG:
 1665:     case OP_THEN_ARG:
 1666:     cc += cc[1] + _pcre_OP_lengths[*cc];
 1667:     break;
 1668: 
 1669:     case OP_CALLOUT:
 1670:     case OP_CIRC:
 1671:     case OP_CIRCM:
 1672:     case OP_CLOSE:
 1673:     case OP_COMMIT:
 1674:     case OP_CREF:
 1675:     case OP_DEF:
 1676:     case OP_DOLL:
 1677:     case OP_DOLLM:
 1678:     case OP_EOD:
 1679:     case OP_EODN:
 1680:     case OP_FAIL:
 1681:     case OP_NCREF:
 1682:     case OP_NRREF:
 1683:     case OP_NOT_WORD_BOUNDARY:
 1684:     case OP_PRUNE:
 1685:     case OP_REVERSE:
 1686:     case OP_RREF:
 1687:     case OP_SET_SOM:
 1688:     case OP_SKIP:
 1689:     case OP_SOD:
 1690:     case OP_SOM:
 1691:     case OP_THEN:
 1692:     case OP_WORD_BOUNDARY:
 1693:     cc += _pcre_OP_lengths[*cc];
 1694:     break;
 1695: 
 1696:     /* Handle literal characters */
 1697: 
 1698:     case OP_CHAR:
 1699:     case OP_CHARI:
 1700:     case OP_NOT:
 1701:     case OP_NOTI:
 1702:     branchlength++;
 1703:     cc += 2;
 1704: #ifdef SUPPORT_UTF8
 1705:     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
 1706: #endif
 1707:     break;
 1708: 
 1709:     /* Handle exact repetitions. The count is already in characters, but we
 1710:     need to skip over a multibyte character in UTF8 mode.  */
 1711: 
 1712:     case OP_EXACT:
 1713:     case OP_EXACTI:
 1714:     case OP_NOTEXACT:
 1715:     case OP_NOTEXACTI:
 1716:     branchlength += GET2(cc,1);
 1717:     cc += 4;
 1718: #ifdef SUPPORT_UTF8
 1719:     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
 1720: #endif
 1721:     break;
 1722: 
 1723:     case OP_TYPEEXACT:
 1724:     branchlength += GET2(cc,1);
 1725:     if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
 1726:     cc += 4;
 1727:     break;
 1728: 
 1729:     /* Handle single-char matchers */
 1730: 
 1731:     case OP_PROP:
 1732:     case OP_NOTPROP:
 1733:     cc += 2;
 1734:     /* Fall through */
 1735: 
 1736:     case OP_HSPACE:
 1737:     case OP_VSPACE:
 1738:     case OP_NOT_HSPACE:
 1739:     case OP_NOT_VSPACE:
 1740:     case OP_NOT_DIGIT:
 1741:     case OP_DIGIT:
 1742:     case OP_NOT_WHITESPACE:
 1743:     case OP_WHITESPACE:
 1744:     case OP_NOT_WORDCHAR:
 1745:     case OP_WORDCHAR:
 1746:     case OP_ANY:
 1747:     case OP_ALLANY:
 1748:     branchlength++;
 1749:     cc++;
 1750:     break;
 1751: 
 1752:     /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
 1753:     otherwise \C is coded as OP_ALLANY. */
 1754: 
 1755:     case OP_ANYBYTE:
 1756:     return -2;
 1757: 
 1758:     /* Check a class for variable quantification */
 1759: 
 1760: #ifdef SUPPORT_UTF8
 1761:     case OP_XCLASS:
 1762:     cc += GET(cc, 1) - 33;
 1763:     /* Fall through */
 1764: #endif
 1765: 
 1766:     case OP_CLASS:
 1767:     case OP_NCLASS:
 1768:     cc += 33;
 1769: 
 1770:     switch (*cc)
 1771:       {
 1772:       case OP_CRPLUS:
 1773:       case OP_CRMINPLUS:
 1774:       case OP_CRSTAR:
 1775:       case OP_CRMINSTAR:
 1776:       case OP_CRQUERY:
 1777:       case OP_CRMINQUERY:
 1778:       return -1;
 1779: 
 1780:       case OP_CRRANGE:
 1781:       case OP_CRMINRANGE:
 1782:       if (GET2(cc,1) != GET2(cc,3)) return -1;
 1783:       branchlength += GET2(cc,1);
 1784:       cc += 5;
 1785:       break;
 1786: 
 1787:       default:
 1788:       branchlength++;
 1789:       }
 1790:     break;
 1791: 
 1792:     /* Anything else is variable length */
 1793: 
 1794:     case OP_ANYNL:
 1795:     case OP_BRAMINZERO:
 1796:     case OP_BRAPOS:
 1797:     case OP_BRAPOSZERO:
 1798:     case OP_BRAZERO:
 1799:     case OP_CBRAPOS:
 1800:     case OP_EXTUNI:
 1801:     case OP_KETRMAX:
 1802:     case OP_KETRMIN:
 1803:     case OP_KETRPOS:
 1804:     case OP_MINPLUS:
 1805:     case OP_MINPLUSI:
 1806:     case OP_MINQUERY:
 1807:     case OP_MINQUERYI:
 1808:     case OP_MINSTAR:
 1809:     case OP_MINSTARI:
 1810:     case OP_MINUPTO:
 1811:     case OP_MINUPTOI:
 1812:     case OP_NOTMINPLUS:
 1813:     case OP_NOTMINPLUSI:
 1814:     case OP_NOTMINQUERY:
 1815:     case OP_NOTMINQUERYI:
 1816:     case OP_NOTMINSTAR:
 1817:     case OP_NOTMINSTARI:
 1818:     case OP_NOTMINUPTO:
 1819:     case OP_NOTMINUPTOI:
 1820:     case OP_NOTPLUS:
 1821:     case OP_NOTPLUSI:
 1822:     case OP_NOTPOSPLUS:
 1823:     case OP_NOTPOSPLUSI:
 1824:     case OP_NOTPOSQUERY:
 1825:     case OP_NOTPOSQUERYI:
 1826:     case OP_NOTPOSSTAR:
 1827:     case OP_NOTPOSSTARI:
 1828:     case OP_NOTPOSUPTO:
 1829:     case OP_NOTPOSUPTOI:
 1830:     case OP_NOTQUERY:
 1831:     case OP_NOTQUERYI:
 1832:     case OP_NOTSTAR:
 1833:     case OP_NOTSTARI:
 1834:     case OP_NOTUPTO:
 1835:     case OP_NOTUPTOI:
 1836:     case OP_PLUS:
 1837:     case OP_PLUSI:
 1838:     case OP_POSPLUS:
 1839:     case OP_POSPLUSI:
 1840:     case OP_POSQUERY:
 1841:     case OP_POSQUERYI:
 1842:     case OP_POSSTAR:
 1843:     case OP_POSSTARI:
 1844:     case OP_POSUPTO:
 1845:     case OP_POSUPTOI:
 1846:     case OP_QUERY:
 1847:     case OP_QUERYI:
 1848:     case OP_REF:
 1849:     case OP_REFI:
 1850:     case OP_SBRA:
 1851:     case OP_SBRAPOS:
 1852:     case OP_SCBRA:
 1853:     case OP_SCBRAPOS:
 1854:     case OP_SCOND:
 1855:     case OP_SKIPZERO:
 1856:     case OP_STAR:
 1857:     case OP_STARI:
 1858:     case OP_TYPEMINPLUS:
 1859:     case OP_TYPEMINQUERY:
 1860:     case OP_TYPEMINSTAR:
 1861:     case OP_TYPEMINUPTO:
 1862:     case OP_TYPEPLUS:
 1863:     case OP_TYPEPOSPLUS:
 1864:     case OP_TYPEPOSQUERY:
 1865:     case OP_TYPEPOSSTAR:
 1866:     case OP_TYPEPOSUPTO:
 1867:     case OP_TYPEQUERY:
 1868:     case OP_TYPESTAR:
 1869:     case OP_TYPEUPTO:
 1870:     case OP_UPTO:
 1871:     case OP_UPTOI:
 1872:     return -1;
 1873: 
 1874:     /* Catch unrecognized opcodes so that when new ones are added they
 1875:     are not forgotten, as has happened in the past. */
 1876: 
 1877:     default:
 1878:     return -4;
 1879:     }
 1880:   }
 1881: /* Control never gets here */
 1882: }
 1883: 
 1884: 
 1885: 
 1886: 
 1887: /*************************************************
 1888: *    Scan compiled regex for specific bracket    *
 1889: *************************************************/
 1890: 
 1891: /* This little function scans through a compiled pattern until it finds a
 1892: capturing bracket with the given number, or, if the number is negative, an
 1893: instance of OP_REVERSE for a lookbehind. The function is global in the C sense
 1894: so that it can be called from pcre_study() when finding the minimum matching
 1895: length.
 1896: 
 1897: Arguments:
 1898:   code        points to start of expression
 1899:   utf8        TRUE in UTF-8 mode
 1900:   number      the required bracket number or negative to find a lookbehind
 1901: 
 1902: Returns:      pointer to the opcode for the bracket, or NULL if not found
 1903: */
 1904: 
 1905: const uschar *
 1906: _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
 1907: {
 1908: for (;;)
 1909:   {
 1910:   register int c = *code;
 1911: 
 1912:   if (c == OP_END) return NULL;
 1913: 
 1914:   /* XCLASS is used for classes that cannot be represented just by a bit
 1915:   map. This includes negated single high-valued characters. The length in
 1916:   the table is zero; the actual length is stored in the compiled code. */
 1917: 
 1918:   if (c == OP_XCLASS) code += GET(code, 1);
 1919: 
 1920:   /* Handle recursion */
 1921: 
 1922:   else if (c == OP_REVERSE)
 1923:     {
 1924:     if (number < 0) return (uschar *)code;
 1925:     code += _pcre_OP_lengths[c];
 1926:     }
 1927: 
 1928:   /* Handle capturing bracket */
 1929: 
 1930:   else if (c == OP_CBRA || c == OP_SCBRA ||
 1931:            c == OP_CBRAPOS || c == OP_SCBRAPOS)
 1932:     {
 1933:     int n = GET2(code, 1+LINK_SIZE);
 1934:     if (n == number) return (uschar *)code;
 1935:     code += _pcre_OP_lengths[c];
 1936:     }
 1937: 
 1938:   /* Otherwise, we can get the item's length from the table, except that for
 1939:   repeated character types, we have to test for \p and \P, which have an extra
 1940:   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
 1941:   must add in its length. */
 1942: 
 1943:   else
 1944:     {
 1945:     switch(c)
 1946:       {
 1947:       case OP_TYPESTAR:
 1948:       case OP_TYPEMINSTAR:
 1949:       case OP_TYPEPLUS:
 1950:       case OP_TYPEMINPLUS:
 1951:       case OP_TYPEQUERY:
 1952:       case OP_TYPEMINQUERY:
 1953:       case OP_TYPEPOSSTAR:
 1954:       case OP_TYPEPOSPLUS:
 1955:       case OP_TYPEPOSQUERY:
 1956:       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
 1957:       break;
 1958: 
 1959:       case OP_TYPEUPTO:
 1960:       case OP_TYPEMINUPTO:
 1961:       case OP_TYPEEXACT:
 1962:       case OP_TYPEPOSUPTO:
 1963:       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
 1964:       break;
 1965: 
 1966:       case OP_MARK:
 1967:       case OP_PRUNE_ARG:
 1968:       case OP_SKIP_ARG:
 1969:       code += code[1];
 1970:       break;
 1971: 
 1972:       case OP_THEN_ARG:
 1973:       code += code[1];
 1974:       break;
 1975:       }
 1976: 
 1977:     /* Add in the fixed length from the table */
 1978: 
 1979:     code += _pcre_OP_lengths[c];
 1980: 
 1981:   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
 1982:   a multi-byte character. The length in the table is a minimum, so we have to
 1983:   arrange to skip the extra bytes. */
 1984: 
 1985: #ifdef SUPPORT_UTF8
 1986:     if (utf8) switch(c)
 1987:       {
 1988:       case OP_CHAR:
 1989:       case OP_CHARI:
 1990:       case OP_EXACT:
 1991:       case OP_EXACTI:
 1992:       case OP_UPTO:
 1993:       case OP_UPTOI:
 1994:       case OP_MINUPTO:
 1995:       case OP_MINUPTOI:
 1996:       case OP_POSUPTO:
 1997:       case OP_POSUPTOI:
 1998:       case OP_STAR:
 1999:       case OP_STARI:
 2000:       case OP_MINSTAR:
 2001:       case OP_MINSTARI:
 2002:       case OP_POSSTAR:
 2003:       case OP_POSSTARI:
 2004:       case OP_PLUS:
 2005:       case OP_PLUSI:
 2006:       case OP_MINPLUS:
 2007:       case OP_MINPLUSI:
 2008:       case OP_POSPLUS:
 2009:       case OP_POSPLUSI:
 2010:       case OP_QUERY:
 2011:       case OP_QUERYI:
 2012:       case OP_MINQUERY:
 2013:       case OP_MINQUERYI:
 2014:       case OP_POSQUERY:
 2015:       case OP_POSQUERYI:
 2016:       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
 2017:       break;
 2018:       }
 2019: #else
 2020:     (void)(utf8);  /* Keep compiler happy by referencing function argument */
 2021: #endif
 2022:     }
 2023:   }
 2024: }
 2025: 
 2026: 
 2027: 
 2028: /*************************************************
 2029: *   Scan compiled regex for recursion reference  *
 2030: *************************************************/
 2031: 
 2032: /* This little function scans through a compiled pattern until it finds an
 2033: instance of OP_RECURSE.
 2034: 
 2035: Arguments:
 2036:   code        points to start of expression
 2037:   utf8        TRUE in UTF-8 mode
 2038: 
 2039: Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
 2040: */
 2041: 
 2042: static const uschar *
 2043: find_recurse(const uschar *code, BOOL utf8)
 2044: {
 2045: for (;;)
 2046:   {
 2047:   register int c = *code;
 2048:   if (c == OP_END) return NULL;
 2049:   if (c == OP_RECURSE) return code;
 2050: 
 2051:   /* XCLASS is used for classes that cannot be represented just by a bit
 2052:   map. This includes negated single high-valued characters. The length in
 2053:   the table is zero; the actual length is stored in the compiled code. */
 2054: 
 2055:   if (c == OP_XCLASS) code += GET(code, 1);
 2056: 
 2057:   /* Otherwise, we can get the item's length from the table, except that for
 2058:   repeated character types, we have to test for \p and \P, which have an extra
 2059:   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
 2060:   must add in its length. */
 2061: 
 2062:   else
 2063:     {
 2064:     switch(c)
 2065:       {
 2066:       case OP_TYPESTAR:
 2067:       case OP_TYPEMINSTAR:
 2068:       case OP_TYPEPLUS:
 2069:       case OP_TYPEMINPLUS:
 2070:       case OP_TYPEQUERY:
 2071:       case OP_TYPEMINQUERY:
 2072:       case OP_TYPEPOSSTAR:
 2073:       case OP_TYPEPOSPLUS:
 2074:       case OP_TYPEPOSQUERY:
 2075:       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
 2076:       break;
 2077: 
 2078:       case OP_TYPEPOSUPTO:
 2079:       case OP_TYPEUPTO:
 2080:       case OP_TYPEMINUPTO:
 2081:       case OP_TYPEEXACT:
 2082:       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
 2083:       break;
 2084: 
 2085:       case OP_MARK:
 2086:       case OP_PRUNE_ARG:
 2087:       case OP_SKIP_ARG:
 2088:       code += code[1];
 2089:       break;
 2090: 
 2091:       case OP_THEN_ARG:
 2092:       code += code[1];
 2093:       break;
 2094:       }
 2095: 
 2096:     /* Add in the fixed length from the table */
 2097: 
 2098:     code += _pcre_OP_lengths[c];
 2099: 
 2100:     /* In UTF-8 mode, opcodes that are followed by a character may be followed
 2101:     by a multi-byte character. The length in the table is a minimum, so we have
 2102:     to arrange to skip the extra bytes. */
 2103: 
 2104: #ifdef SUPPORT_UTF8
 2105:     if (utf8) switch(c)
 2106:       {
 2107:       case OP_CHAR:
 2108:       case OP_CHARI:
 2109:       case OP_EXACT:
 2110:       case OP_EXACTI:
 2111:       case OP_UPTO:
 2112:       case OP_UPTOI:
 2113:       case OP_MINUPTO:
 2114:       case OP_MINUPTOI:
 2115:       case OP_POSUPTO:
 2116:       case OP_POSUPTOI:
 2117:       case OP_STAR:
 2118:       case OP_STARI:
 2119:       case OP_MINSTAR:
 2120:       case OP_MINSTARI:
 2121:       case OP_POSSTAR:
 2122:       case OP_POSSTARI:
 2123:       case OP_PLUS:
 2124:       case OP_PLUSI:
 2125:       case OP_MINPLUS:
 2126:       case OP_MINPLUSI:
 2127:       case OP_POSPLUS:
 2128:       case OP_POSPLUSI:
 2129:       case OP_QUERY:
 2130:       case OP_QUERYI:
 2131:       case OP_MINQUERY:
 2132:       case OP_MINQUERYI:
 2133:       case OP_POSQUERY:
 2134:       case OP_POSQUERYI:
 2135:       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
 2136:       break;
 2137:       }
 2138: #else
 2139:     (void)(utf8);  /* Keep compiler happy by referencing function argument */
 2140: #endif
 2141:     }
 2142:   }
 2143: }
 2144: 
 2145: 
 2146: 
/*************************************************
*    Scan compiled branch for non-emptiness      *
*************************************************/

/* This function scans through a branch of a compiled pattern to see whether it
can match the empty string or not. It is called from could_be_empty()
below and from compile_branch() when checking for an unlimited repeat of a
group that can match nothing. Note that first_significant_code() skips over
backward and negative forward assertions when its final argument is TRUE. If we
hit an unclosed bracket, we return "empty" - this means we've struck an inner
bracket whose current branch will already have been scanned.

The scan starts at the item following the opcode that "code" points to. The
function returns FALSE as soon as it meets an item that must consume at least
one character, and TRUE if it reaches the end of the branch or endcode without
meeting one. It calls itself recursively for nested groups and for subroutine
calls that refer back to a completed group.

Arguments:
  code        points to start of search
  endcode     points to where to stop
  utf8        TRUE if in UTF8 mode
  cd          contains pointers to tables etc.

Returns:      TRUE if what is matched could be empty
*/

static BOOL
could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
  compile_data *cd)
{
register int c;

/* The loop's increment expression advances by the table length of opcode c.
Every path that moves "code" itself and then uses "continue" therefore has to
refresh c first, so that the increment steps over the item now under "code". */

for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE);
     code < endcode;
     code = first_significant_code(code + _pcre_OP_lengths[c], TRUE))
  {
  const uschar *ccode;

  c = *code;

  /* Skip over forward assertions; the other assertions are skipped by
  first_significant_code() with a TRUE final argument. After the loop, code is
  at the assertion's closing item, which the for-increment then steps over. */

  if (c == OP_ASSERT)
    {
    do code += GET(code, 1); while (*code == OP_ALT);
    c = *code;
    continue;
    }

  /* For a recursion/subroutine call, if its end has been reached, which
  implies a backward reference subroutine call, we can scan it. If it's a
  forward reference subroutine call, we can't. To detect forward reference
  we have to scan up the list that is kept in the workspace. This function is
  called only when doing the real compile, not during the pre-compile that
  measures the size of the compiled pattern. */

  if (c == OP_RECURSE)
    {
    const uschar *scode;
    BOOL empty_branch;

    /* Test for forward reference: look for an entry in the workspace list
    equal to the offset of this item's operand from the start of the code. */

    for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
      if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;

    /* Not a forward reference, test for completed backward reference */

    empty_branch = FALSE;
    scode = cd->start_code + GET(code, 1);
    if (GET(scode, 1) == 0) return TRUE;    /* Zero link: group still unclosed */

    /* Completed backwards reference: one branch that can be empty suffices */

    do
      {
      if (could_be_empty_branch(scode, endcode, utf8, cd))
        {
        empty_branch = TRUE;
        break;
        }
      scode += GET(scode, 1);
      }
    while (*scode == OP_ALT);

    if (!empty_branch) return FALSE;  /* All branches are non-empty */
    continue;                         /* code unmoved, c is still OP_RECURSE */
    }

  /* Groups with zero repeats can of course be empty; skip them. The zero
  item is stepped over first, then the whole group that follows it. */

  if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
      c == OP_BRAPOSZERO)
    {
    code += _pcre_OP_lengths[c];
    do code += GET(code, 1); while (*code == OP_ALT);
    c = *code;
    continue;
    }

  /* A nested group that is already marked as "could be empty" can just be
  skipped. */

  if (c == OP_SBRA  || c == OP_SBRAPOS ||
      c == OP_SCBRA || c == OP_SCBRAPOS)
    {
    do code += GET(code, 1); while (*code == OP_ALT);
    c = *code;
    continue;
    }

  /* For other groups, scan the branches. */

  if (c == OP_BRA  || c == OP_BRAPOS ||
      c == OP_CBRA || c == OP_CBRAPOS ||
      c == OP_ONCE || c == OP_ONCE_NC ||
      c == OP_COND)
    {
    BOOL empty_branch;
    if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */

    /* If a conditional group has only one branch, there is a second, implied,
    empty branch, so just skip over the conditional, because it could be empty.
    Otherwise, scan the individual branches of the group. */

    if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
      code += GET(code, 1);
    else
      {
      /* Every branch is walked so that code ends up at the closing item, but
      the recursive scan stops being made once one empty branch is known. */

      empty_branch = FALSE;
      do
        {
        if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
          empty_branch = TRUE;
        code += GET(code, 1);
        }
      while (*code == OP_ALT);
      if (!empty_branch) return FALSE;   /* All branches are non-empty */
      }

    c = *code;
    continue;
    }

  /* Handle the other opcodes */

  switch (c)
    {
    /* Check for quantifiers after a class. XCLASS is used for classes that
    cannot be represented just by a bit map. This includes negated single
    high-valued characters. The length in _pcre_OP_lengths[] is zero; the
    actual length is stored in the compiled code, so we must update "code"
    here. */

#ifdef SUPPORT_UTF8
    case OP_XCLASS:
    ccode = code += GET(code, 1);   /* Both now point just past the class */
    goto CHECK_CLASS_REPEAT;
#endif

    case OP_CLASS:
    case OP_NCLASS:
    ccode = code + 33;              /* Item that follows the class */

#ifdef SUPPORT_UTF8
    CHECK_CLASS_REPEAT:
#endif

    switch (*ccode)
      {
      case OP_CRSTAR:            /* These could be empty; continue */
      case OP_CRMINSTAR:
      case OP_CRQUERY:
      case OP_CRMINQUERY:
      break;

      default:                   /* Non-repeat => class must match */
      case OP_CRPLUS:            /* These repeats aren't empty */
      case OP_CRMINPLUS:
      return FALSE;

      case OP_CRRANGE:
      case OP_CRMINRANGE:
      if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
      break;
      }
    break;

    /* Opcodes that must match a character */

    case OP_PROP:
    case OP_NOTPROP:
    case OP_EXTUNI:
    case OP_NOT_DIGIT:
    case OP_DIGIT:
    case OP_NOT_WHITESPACE:
    case OP_WHITESPACE:
    case OP_NOT_WORDCHAR:
    case OP_WORDCHAR:
    case OP_ANY:
    case OP_ALLANY:
    case OP_ANYBYTE:
    case OP_CHAR:
    case OP_CHARI:
    case OP_NOT:
    case OP_NOTI:
    case OP_PLUS:
    case OP_MINPLUS:
    case OP_POSPLUS:
    case OP_EXACT:
    case OP_NOTPLUS:
    case OP_NOTMINPLUS:
    case OP_NOTPOSPLUS:
    case OP_NOTEXACT:
    case OP_TYPEPLUS:
    case OP_TYPEMINPLUS:
    case OP_TYPEPOSPLUS:
    case OP_TYPEEXACT:
    return FALSE;

    /* These are going to continue, as they may be empty, but we have to
    fudge the length for the \p and \P cases, which carry two extra bytes
    beyond the table length. */

    case OP_TYPESTAR:
    case OP_TYPEMINSTAR:
    case OP_TYPEPOSSTAR:
    case OP_TYPEQUERY:
    case OP_TYPEMINQUERY:
    case OP_TYPEPOSQUERY:
    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
    break;

    /* Same for these, where the type follows a 2-byte count */

    case OP_TYPEUPTO:
    case OP_TYPEMINUPTO:
    case OP_TYPEPOSUPTO:
    if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
    break;

    /* End of branch */

    case OP_KET:
    case OP_KETRMAX:
    case OP_KETRMIN:
    case OP_KETRPOS:
    case OP_ALT:
    return TRUE;

    /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
    MINUPTO, and POSUPTO may be followed by a multibyte character; skip its
    additional bytes here, the for-increment adds the table length. */

#ifdef SUPPORT_UTF8
    case OP_STAR:
    case OP_STARI:
    case OP_MINSTAR:
    case OP_MINSTARI:
    case OP_POSSTAR:
    case OP_POSSTARI:
    case OP_QUERY:
    case OP_QUERYI:
    case OP_MINQUERY:
    case OP_MINQUERYI:
    case OP_POSQUERY:
    case OP_POSQUERYI:
    if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
    break;

    case OP_UPTO:
    case OP_UPTOI:
    case OP_MINUPTO:
    case OP_MINUPTOI:
    case OP_POSUPTO:
    case OP_POSUPTOI:
    if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
    break;
#endif

    /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
    string, whose length is held in the byte after the opcode. */

    case OP_MARK:
    case OP_PRUNE_ARG:
    case OP_SKIP_ARG:
    code += code[1];
    break;

    case OP_THEN_ARG:
    code += code[1];
    break;

    /* None of the remaining opcodes are required to match a character. */

    default:
    break;
    }
  }

/* Reached endcode without meeting anything that must match a character. */

return TRUE;
}
 2442: 
 2443: 
 2444: 
 2445: /*************************************************
 2446: *    Scan compiled regex for non-emptiness       *
 2447: *************************************************/
 2448: 
 2449: /* This function is called to check for left recursive calls. We want to check
 2450: the current branch of the current pattern to see if it could match the empty
 2451: string. If it could, we must look outwards for branches at other levels,
 2452: stopping when we pass beyond the bracket which is the subject of the recursion.
 2453: This function is called only during the real compile, not during the
 2454: pre-compile.
 2455: 
 2456: Arguments:
 2457:   code        points to start of the recursion
 2458:   endcode     points to where to stop (current RECURSE item)
 2459:   bcptr       points to the chain of current (unclosed) branch starts
 2460:   utf8        TRUE if in UTF-8 mode
 2461:   cd          pointers to tables etc
 2462: 
 2463: Returns:      TRUE if what is matched could be empty
 2464: */
 2465: 
 2466: static BOOL
 2467: could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
 2468:   BOOL utf8, compile_data *cd)
 2469: {
       /* Walk outwards along the chain of unclosed branches. The walk stops when
       the chain runs out or when a branch start lies before "code". */
 2470: while (bcptr != NULL && bcptr->current_branch >= code)
 2471:   {
         /* A single branch that cannot match the empty string settles the answer. */
 2472:   if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
 2473:     return FALSE;
 2474:   bcptr = bcptr->outer;
 2475:   }
       /* Every branch that was examined (possibly none) could match empty. */
 2476: return TRUE;
 2477: }
 2478: 
 2479: 
 2480: 
 2481: /*************************************************
 2482: *           Check for POSIX class syntax         *
 2483: *************************************************/
 2484: 
 2485: /* This function is called when the sequence "[:" or "[." or "[=" is
 2486: encountered in a character class. It checks whether this is followed by a
 2487: sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
 2488: reach an unescaped ']' without the special preceding character, return FALSE.
 2489: 
 2490: Originally, this function only recognized a sequence of letters between the
 2491: terminators, but it seems that Perl recognizes any sequence of characters,
 2492: though of course unknown POSIX names are subsequently rejected. Perl gives an
 2493: "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
 2494: didn't consider this to be a POSIX class. Likewise for [:1234:].
 2495: 
 2496: The problem in trying to be exactly like Perl is in the handling of escapes. We
 2497: have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
 2498: class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
 2499: below handles the special case of \], but does not try to do any other escape
 2500: processing. This makes it different from Perl for cases such as [:l\ower:]
 2501: where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
 2502: "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
 2503: I think.
 2504: 
 2505: A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
 2506: It seems that the appearance of a nested POSIX class supersedes an apparent
 2507: external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
 2508: a digit.
 2509: 
 2510: In Perl, unescaped square brackets may also appear as part of class names. For
 2511: example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
 2512: [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
 2513: seem right at all. PCRE does not allow closing square brackets in POSIX class
 2514: names.
 2515: 
 2516: Arguments:
 2517:   ptr      pointer to the initial [
 2518:   endptr   where to return the end pointer
 2519: 
 2520: Returns:   TRUE or FALSE
 2521: */
 2522: 
 2523: static BOOL
 2524: check_posix_syntax(const uschar *ptr, const uschar **endptr)
 2525: {
 2526: int terminator;          /* Don't combine these lines; the Solaris cc */
 2527: terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
       /* terminator is the character that follows the initial '['. The scan below
       starts one character after it and runs until the terminating zero byte. */
 2528: for (++ptr; *ptr != 0; ptr++)
 2529:   {
         /* An escaped ']' does not close anything: step onto the bracket here so
         that the loop increment moves past it. No other escape is interpreted. */
 2530:   if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
 2531:     ptr++;
         /* An unescaped ']' reached before "terminator]" means no POSIX item. */
 2532:   else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
 2533:   else
 2534:     {
           /* Success: *endptr is left pointing at the terminator character, not at
           the ']' that follows it. */
 2535:     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
 2536:       {
 2537:       *endptr = ptr;
 2538:       return TRUE;
 2539:       }
           /* A nested "[:", "[." or "[=" that is itself well formed supersedes the
           outer candidate, so the outer one is rejected. Note that the recursive
           call has then already stored a value in *endptr even though FALSE is
           returned from here. */
 2540:     if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
 2541:          (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
 2542:           ptr[1] == CHAR_EQUALS_SIGN) &&
 2543:         check_posix_syntax(ptr, endptr))
 2544:       return FALSE;
 2545:     }
 2546:   }
       /* Ran off the end of the pattern without finding "terminator]". */
 2547: return FALSE;
 2548: }
 2549: 
 2550: 
 2551: 
 2552: 
 2553: /*************************************************
 2554: *          Check POSIX class name                *
 2555: *************************************************/
 2556: 
 2557: /* This function is called to check the name given in a POSIX-style class entry
 2558: such as [:alnum:].
 2559: 
 2560: Arguments:
 2561:   ptr        points to the first letter
 2562:   len        the length of the name
 2563: 
 2564: Returns:     a value representing the name, or -1 if unknown
 2565: */
 2566: 
 2567: static int
 2568: check_posix_name(const uschar *ptr, int len)
 2569: {
       /* pn walks through posix_names in step with yield, which indexes
       posix_name_lengths. A zero length entry ends the table. */
 2570: const char *pn = posix_names;
 2571: register int yield = 0;
 2572: while (posix_name_lengths[yield] != 0)
 2573:   {
         /* The length is compared first, so strncmp() over len bytes is an exact
         match of the whole candidate name against the whole table name. */
 2574:   if (len == posix_name_lengths[yield] &&
 2575:     strncmp((const char *)ptr, pn, len) == 0) return yield;
         /* Advance by the name length plus one; presumably the extra byte is a
         separator/terminator after each name in posix_names - confirm against
         the table definition. */
 2576:   pn += posix_name_lengths[yield] + 1;
 2577:   yield++;
 2578:   }
       /* No table entry matched. */
 2579: return -1;
 2580: }
 2581: 
 2582: 
 2583: /*************************************************
 2584: *    Adjust OP_RECURSE items in repeated group   *
 2585: *************************************************/
 2586: 
 2587: /* OP_RECURSE items contain an offset from the start of the regex to the group
 2588: that is referenced. This means that groups can be replicated for fixed
 2589: repetition simply by copying (because the recursion is allowed to refer to
 2590: earlier groups that are outside the current group). However, when a group is
 2591: optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
 2592: inserted before it, after it has been compiled. This means that any OP_RECURSE
 2593: items within it that refer to the group itself or any contained groups have to
 2594: have their offsets adjusted. That is one of the jobs of this function. Before it
 2595: is called, the partially compiled regex must be temporarily terminated with
 2596: OP_END.
 2597: 
 2598: This function has been extended with the possibility of forward references for
 2599: recursions and subroutine calls. It must also check the list of such references
 2600: for the group we are dealing with. If it finds that one of the recursions in
 2601: the current group is on this list, it adjusts the offset in the list, not the
 2602: value in the reference (which is a group number).
 2603: 
 2604: Arguments:
 2605:   group      points to the start of the group
 2606:   adjust     the amount by which the group is to be moved
 2607:   utf8       TRUE in UTF-8 mode
 2608:   cd         contains pointers to tables etc.
 2609:   save_hwm   the hwm forward reference pointer at the start of the group
 2610: 
 2611: Returns:     nothing
 2612: */
 2613: 
 2614: static void
 2615: adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
 2616:   uschar *save_hwm)
 2617: {
 2618: uschar *ptr = group;
 2619: 
       /* Visit each item that find_recurse() reports from "group" onwards; the loop
       ends when it returns NULL. */
 2620: while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
 2621:   {
 2622:   int offset;
 2623:   uschar *hc;
 2624: 
 2625:   /* See if this recursion is on the forward reference list. If so, adjust the
 2626:   reference. */
 2627: 
         /* The list occupies save_hwm up to cd->hwm in LINK_SIZE-byte entries. Each
         entry is an offset from cd->start_code; an entry belongs to this item when
         it addresses the byte just after the item's opcode (ptr + 1). */
 2628:   for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
 2629:     {
 2630:     offset = GET(hc, 0);
 2631:     if (cd->start_code + offset == ptr + 1)
 2632:       {
 2633:       PUT(hc, 0, offset + adjust);
 2634:       break;
 2635:       }
 2636:     }
 2637: 
 2638:   /* Otherwise, adjust the recursion offset if it's after the start of this
 2639:   group. */
 2640: 
         /* hc >= cd->hwm means the scan above finished without a break, i.e. the
         item was not found on the forward reference list. */
 2641:   if (hc >= cd->hwm)
 2642:     {
 2643:     offset = GET(ptr, 1);
 2644:     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
 2645:     }
 2646: 
         /* Step over this item (one opcode byte plus a LINK_SIZE field) so that the
         next search starts beyond it. */
 2647:   ptr += 1 + LINK_SIZE;
 2648:   }
 2649: }
 2650: 
 2651: 
 2652: 
 2653: /*************************************************
 2654: *        Insert an automatic callout point       *
 2655: *************************************************/
 2656: 
 2657: /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
 2658: callout points before each pattern item.
 2659: 
 2660: Arguments:
 2661:   code           current code pointer
 2662:   ptr            current pattern pointer
 2663:   cd             pointers to tables etc
 2664: 
 2665: Returns:         new code pointer
 2666: */
 2667: 
 2668: static uschar *
 2669: auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
 2670: {
       /* Item layout as written here: opcode byte, one byte holding 255, then two
       LINK_SIZE fields. NOTE(review): 255 looks like the callout number reserved
       for automatic callouts - confirm against the callout documentation. */
 2671: *code++ = OP_CALLOUT;
 2672: *code++ = 255;
 2673: PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
       /* The length field is only a placeholder at this point; it sits at byte
       2 + LINK_SIZE of the item, which is the field complete_callout() writes. */
 2674: PUT(code, LINK_SIZE, 0);                       /* Default length */
       /* Return the address just past both LINK_SIZE fields. */
 2675: return code + 2*LINK_SIZE;
 2676: }
 2677: 
 2678: 
 2679: 
 2680: /*************************************************
 2681: *         Complete a callout item                *
 2682: *************************************************/
 2683: 
 2684: /* A callout item contains the length of the next item in the pattern, which
 2685: we can't fill in till after we have reached the relevant point. This is used
 2686: for both automatic and manual callouts.
 2687: 
 2688: Arguments:
 2689:   previous_callout   points to previous callout item
 2690:   ptr                current pattern pointer
 2691:   cd                 pointers to tables etc
 2692: 
 2693: Returns:             nothing
 2694: */
 2695: 
 2696: static void
 2697: complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
 2698: {
       /* The value at byte 2 of the callout item is the pattern offset recorded
       when the item was created. Subtracting it from the current pattern offset
       gives the length of the pattern text consumed since then. */
 2699: int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
       /* Store that length in the second LINK_SIZE field of the item. */
 2700: PUT(previous_callout, 2 + LINK_SIZE, length);
 2701: }
 2702: 
 2703: 
 2704: 
 2705: #ifdef SUPPORT_UCP
 2706: /*************************************************
 2707: *           Get othercase range                  *
 2708: *************************************************/
 2709: 
 2710: /* This function is passed the start and end of a class range, in UTF-8 mode
 2711: with UCP support. It searches up the characters, looking for internal ranges of
 2712: characters in the "other" case. Each call returns the next one, updating the
 2713: start address.
 2714: 
 2715: Arguments:
 2716:   cptr        points to starting character value; updated
 2717:   d           end value
 2718:   ocptr       where to put start of othercase range
 2719:   odptr       where to put end of othercase range
 2720: 
 2721: Yield:        TRUE when range returned; FALSE when no more
 2722: */
 2723: 
 2724: static BOOL
 2725: get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
 2726:   unsigned int *odptr)
 2727: {
 2728: unsigned int c, othercase, next;
 2729: 
       /* Find the first character in *cptr..d whose other case differs from the
       character itself. othercase holds that other-case value on exit. */
 2730: for (c = *cptr; c <= d; c++)
 2731:   { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
 2732: 
       /* Nothing in the remaining range has a distinct other case. */
 2733: if (c > d) return FALSE;
 2734: 
 2735: *ocptr = othercase;
 2736: next = othercase + 1;
 2737: 
       /* Extend the run for as long as successive characters map to successive
       other-case values. */
 2738: for (++c; c <= d; c++)
 2739:   {
 2740:   if (UCD_OTHERCASE(c) != next) break;
 2741:   next++;
 2742:   }
 2743: 
       /* next is one beyond the last other-case value in the run; c is the first
       character not covered, which is where the following call will resume. */
 2744: *odptr = next - 1;
 2745: *cptr = c;
 2746: 
 2747: return TRUE;
 2748: }
 2749: 
 2750: 
 2751: 
 2752: /*************************************************
 2753: *        Check a character and a property        *
 2754: *************************************************/
 2755: 
 2756: /* This function is called by check_auto_possessive() when a property item
 2757: is adjacent to a fixed character.
 2758: 
 2759: Arguments:
 2760:   c            the character
 2761:   ptype        the property type
 2762:   pdata        the data for the type
 2763:   negated      TRUE if it's a negated property (\P or \p{^)
 2764: 
 2765: Returns:       TRUE if auto-possessifying is OK
 2766: */
 2767: 
 2768: static BOOL
 2769: check_char_prop(int c, int ptype, int pdata, BOOL negated)
 2770: {
       /* Every case below has the form (c has the property) == negated. With
       negated FALSE the result is therefore TRUE exactly when c does NOT have the
       property; with negated TRUE it is TRUE exactly when c does have it. */
 2771: const ucd_record *prop = GET_UCD(c);
 2772: switch(ptype)
 2773:   {
 2774:   case PT_LAMP:
 2775:   return (prop->chartype == ucp_Lu ||
 2776:           prop->chartype == ucp_Ll ||
 2777:           prop->chartype == ucp_Lt) == negated;
 2778: 
         /* pdata is compared with the general category derived from chartype. */
 2779:   case PT_GC:
 2780:   return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
 2781: 
         /* pdata is compared with chartype itself. */
 2782:   case PT_PC:
 2783:   return (pdata == prop->chartype) == negated;
 2784: 
         /* pdata is compared with the character's script. */
 2785:   case PT_SC:
 2786:   return (pdata == prop->script) == negated;
 2787: 
 2788:   /* These are specials */
 2789: 
 2790:   case PT_ALNUM:
 2791:   return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
 2792:           _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
 2793: 
 2794:   case PT_SPACE:    /* Perl space */
 2795:   return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
 2796:           c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
 2797:           == negated;
 2798: 
         /* Same as PT_SPACE with CHAR_VT added. */
 2799:   case PT_PXSPACE:  /* POSIX space */
 2800:   return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
 2801:           c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
 2802:           c == CHAR_FF || c == CHAR_CR)
 2803:           == negated;
 2804: 
         /* Same as PT_ALNUM with CHAR_UNDERSCORE added. */
 2805:   case PT_WORD:
 2806:   return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
 2807:           _pcre_ucp_gentype[prop->chartype] == ucp_N ||
 2808:           c == CHAR_UNDERSCORE) == negated;
 2809:   }
       /* Any property type not listed above: decline to possessify. */
 2810: return FALSE;
 2811: }
 2812: #endif  /* SUPPORT_UCP */
 2813: 
 2814: 
 2815: 
 2816: /*************************************************
 2817: *     Check if auto-possessifying is possible    *
 2818: *************************************************/
 2819: 
 2820: /* This function is called for unlimited repeats of certain items, to see
 2821: whether the next thing could possibly match the repeated item. If not, it makes
 2822: sense to automatically possessify the repeated item.
 2823: 
 2824: Arguments:
 2825:   previous      pointer to the repeated opcode
 2826:   utf8          TRUE in UTF-8 mode
 2827:   ptr           next character in pattern
 2828:   options       options bits
 2829:   cd            contains pointers to tables etc.
 2830: 
 2831: Returns:        TRUE if possessifying is wanted
 2832: */
 2833: 
 2834: static BOOL
 2835: check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
 2836:   int options, compile_data *cd)
 2837: {
 2838: int c, next;
       /* After this, previous addresses the data that follows the opcode byte. */
 2839: int op_code = *previous++;
 2840: 
 2841: /* Skip whitespace and comments in extended mode */
 2842: 
 2843: if ((options & PCRE_EXTENDED) != 0)
 2844:   {
 2845:   for (;;)
 2846:     {
 2847:     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
 2848:     if (*ptr == CHAR_NUMBER_SIGN)
 2849:       {
 2850:       ptr++;
             /* Consume the comment up to and including the next newline, or up to
             the end of the pattern if there is none. */
 2851:       while (*ptr != 0)
 2852:         {
 2853:         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
 2854:         ptr++;
 2855: #ifdef SUPPORT_UTF8
 2856:         if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
 2857: #endif
 2858:         }
 2859:       }
 2860:     else break;
 2861:     }
 2862:   }
 2863: 
 2864: /* If the next item is one that we can handle, get its value. A non-negative
 2865: value is a character, a negative value is an escape value. */
 2866: 
 2867: if (*ptr == CHAR_BACKSLASH)
 2868:   {
 2869:   int temperrorcode = 0;
 2870:   next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
         /* A malformed escape is not diagnosed here; possessification is simply
         declined. */
 2871:   if (temperrorcode != 0) return FALSE;
 2872:   ptr++;    /* Point after the escape sequence */
 2873:   }
 2874: 
 2875: else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
 2876:   {
 2877: #ifdef SUPPORT_UTF8
 2878:   if (utf8) { GETCHARINC(next, ptr); } else
 2879: #endif
 2880:   next = *ptr++;
 2881:   }
 2882: 
       /* The next item starts with a metacharacter other than backslash. */
 2883: else return FALSE;
 2884: 
 2885: /* Skip whitespace and comments in extended mode */
 2886: 
 2887: if ((options & PCRE_EXTENDED) != 0)
 2888:   {
 2889:   for (;;)
 2890:     {
 2891:     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
 2892:     if (*ptr == CHAR_NUMBER_SIGN)
 2893:       {
 2894:       ptr++;
 2895:       while (*ptr != 0)
 2896:         {
 2897:         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
 2898:         ptr++;
 2899: #ifdef SUPPORT_UTF8
 2900:         if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
 2901: #endif
 2902:         }
 2903:       }
 2904:     else break;
 2905:     }
 2906:   }
 2907: 
 2908: /* If the next thing is itself optional, we have to give up. */
 2909: 
       /* "Optional" is recognized as a following '*', '?' or the three characters
       "{0,". */
 2910: if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
 2911:   strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
 2912:     return FALSE;
 2913: 
 2914: /* Now compare the next item with the previous opcode. First, handle cases when
 2915: the next item is a character. */
 2916: 
       /* Every path through this switch returns, so the code after it is reached
       only when next is negative. */
 2917: if (next >= 0) switch(op_code)
 2918:   {
 2919:   case OP_CHAR:
 2920: #ifdef SUPPORT_UTF8
 2921:   GETCHARTEST(c, previous);
 2922: #else
 2923:   c = *previous;
 2924: #endif
 2925:   return c != next;
 2926: 
 2927:   /* For CHARI (caseless character) we must check the other case. If we have
 2928:   Unicode property support, we can use it to test the other case of
 2929:   high-valued characters. */
 2930: 
 2931:   case OP_CHARI:
 2932: #ifdef SUPPORT_UTF8
 2933:   GETCHARTEST(c, previous);
 2934: #else
 2935:   c = *previous;
 2936: #endif
 2937:   if (c == next) return FALSE;
 2938: #ifdef SUPPORT_UTF8
 2939:   if (utf8)
 2940:     {
 2941:     unsigned int othercase;
           /* Values below 128 use the fcc table; higher values use the Unicode data
           when available, otherwise NOTACHAR is used as the other case. */
 2942:     if (next < 128) othercase = cd->fcc[next]; else
 2943: #ifdef SUPPORT_UCP
 2944:     othercase = UCD_OTHERCASE((unsigned int)next);
 2945: #else
 2946:     othercase = NOTACHAR;
 2947: #endif
 2948:     return (unsigned int)c != othercase;
 2949:     }
 2950:   else
 2951: #endif  /* SUPPORT_UTF8 */
 2952:   return (c != cd->fcc[next]);  /* Non-UTF-8 mode */
 2953: 
 2954:   /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
 2955:   opcodes are not used for multi-byte characters, because they are coded using
 2956:   an XCLASS instead. */
 2957: 
         /* The sense is inverted relative to OP_CHAR: TRUE only when the next
         character equals the excluded character. */
 2958:   case OP_NOT:
 2959:   return (c = *previous) == next;
 2960: 
 2961:   case OP_NOTI:
 2962:   if ((c = *previous) == next) return TRUE;
 2963: #ifdef SUPPORT_UTF8
 2964:   if (utf8)
 2965:     {
 2966:     unsigned int othercase;
 2967:     if (next < 128) othercase = cd->fcc[next]; else
 2968: #ifdef SUPPORT_UCP
 2969:     othercase = UCD_OTHERCASE(next);
 2970: #else
 2971:     othercase = NOTACHAR;
 2972: #endif
 2973:     return (unsigned int)c == othercase;
 2974:     }
 2975:   else
 2976: #endif  /* SUPPORT_UTF8 */
 2977:   return (c == cd->fcc[next]);  /* Non-UTF-8 mode */
 2978: 
 2979:   /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
 2980:   When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
 2981: 
         /* In the six cases below the ctypes table is consulted only for values up
         to 127; any larger value is treated as not being in the class. */
 2982:   case OP_DIGIT:
 2983:   return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
 2984: 
 2985:   case OP_NOT_DIGIT:
 2986:   return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
 2987: 
 2988:   case OP_WHITESPACE:
 2989:   return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
 2990: 
 2991:   case OP_NOT_WHITESPACE:
 2992:   return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
 2993: 
 2994:   case OP_WORDCHAR:
 2995:   return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
 2996: 
 2997:   case OP_NOT_WORDCHAR:
 2998:   return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
 2999: 
         /* For a listed value the answer is TRUE only for OP_NOT_HSPACE; for any
         other value it is TRUE only for OP_HSPACE. */
 3000:   case OP_HSPACE:
 3001:   case OP_NOT_HSPACE:
 3002:   switch(next)
 3003:     {
 3004:     case 0x09:
 3005:     case 0x20:
 3006:     case 0xa0:
 3007:     case 0x1680:
 3008:     case 0x180e:
 3009:     case 0x2000:
 3010:     case 0x2001:
 3011:     case 0x2002:
 3012:     case 0x2003:
 3013:     case 0x2004:
 3014:     case 0x2005:
 3015:     case 0x2006:
 3016:     case 0x2007:
 3017:     case 0x2008:
 3018:     case 0x2009:
 3019:     case 0x200A:
 3020:     case 0x202f:
 3021:     case 0x205f:
 3022:     case 0x3000:
 3023:     return op_code == OP_NOT_HSPACE;
 3024:     default:
 3025:     return op_code != OP_NOT_HSPACE;
 3026:     }
 3027: 
         /* OP_ANYNL shares the handling of OP_VSPACE here: for a listed value the
         answer is TRUE only for OP_NOT_VSPACE. */
 3028:   case OP_ANYNL:
 3029:   case OP_VSPACE:
 3030:   case OP_NOT_VSPACE:
 3031:   switch(next)
 3032:     {
 3033:     case 0x0a:
 3034:     case 0x0b:
 3035:     case 0x0c:
 3036:     case 0x0d:
 3037:     case 0x85:
 3038:     case 0x2028:
 3039:     case 0x2029:
 3040:     return op_code == OP_NOT_VSPACE;
 3041:     default:
 3042:     return op_code != OP_NOT_VSPACE;
 3043:     }
 3044: 
 3045: #ifdef SUPPORT_UCP
         /* previous[0] and previous[1] are passed as the property type and data. */
 3046:   case OP_PROP:
 3047:   return check_char_prop(next, previous[0], previous[1], FALSE);
 3048: 
 3049:   case OP_NOTPROP:
 3050:   return check_char_prop(next, previous[0], previous[1], TRUE);
 3051: #endif
 3052: 
 3053:   default:
 3054:   return FALSE;
 3055:   }
 3056: 
 3057: 
 3058: /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
 3059: is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
 3060: generated only when PCRE_UCP is *not* set, that is, when only ASCII
 3061: characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
 3062: replaced by OP_PROP codes when PCRE_UCP is set. */
 3063: 
 3064: switch(op_code)
 3065:   {
         /* The repeated item is a literal character: the roles are the mirror image
         of the cases above, with c tested against the class named by the escape. */
 3066:   case OP_CHAR:
 3067:   case OP_CHARI:
 3068: #ifdef SUPPORT_UTF8
 3069:   GETCHARTEST(c, previous);
 3070: #else
 3071:   c = *previous;
 3072: #endif
 3073:   switch(-next)
 3074:     {
 3075:     case ESC_d:
 3076:     return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
 3077: 
 3078:     case ESC_D:
 3079:     return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
 3080: 
 3081:     case ESC_s:
 3082:     return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
 3083: 
 3084:     case ESC_S:
 3085:     return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
 3086: 
 3087:     case ESC_w:
 3088:     return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
 3089: 
 3090:     case ESC_W:
 3091:     return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
 3092: 
 3093:     case ESC_h:
 3094:     case ESC_H:
 3095:     switch(c)
 3096:       {
 3097:       case 0x09:
 3098:       case 0x20:
 3099:       case 0xa0:
 3100:       case 0x1680:
 3101:       case 0x180e:
 3102:       case 0x2000:
 3103:       case 0x2001:
 3104:       case 0x2002:
 3105:       case 0x2003:
 3106:       case 0x2004:
 3107:       case 0x2005:
 3108:       case 0x2006:
 3109:       case 0x2007:
 3110:       case 0x2008:
 3111:       case 0x2009:
 3112:       case 0x200A:
 3113:       case 0x202f:
 3114:       case 0x205f:
 3115:       case 0x3000:
 3116:       return -next != ESC_h;
 3117:       default:
 3118:       return -next == ESC_h;
 3119:       }
 3120: 
 3121:     case ESC_v:
 3122:     case ESC_V:
 3123:     switch(c)
 3124:       {
 3125:       case 0x0a:
 3126:       case 0x0b:
 3127:       case 0x0c:
 3128:       case 0x0d:
 3129:       case 0x85:
 3130:       case 0x2028:
 3131:       case 0x2029:
 3132:       return -next != ESC_v;
 3133:       default:
 3134:       return -next == ESC_v;
 3135:       }
 3136: 
 3137:     /* When PCRE_UCP is set, these values get generated for \d etc. Find
 3138:     their substitutions and process them. The result will always be either
 3139:     -ESC_p or -ESC_P. Then fall through to process those values. */
 3140: 
 3141: #ifdef SUPPORT_UCP
 3142:     case ESC_du:
 3143:     case ESC_DU:
 3144:     case ESC_wu:
 3145:     case ESC_WU:
 3146:     case ESC_su:
 3147:     case ESC_SU:
 3148:       {
 3149:       int temperrorcode = 0;
             /* ptr is re-pointed at the substitutes[] entry and the position in the
             original pattern is dropped. That is harmless because every path
             from here on returns. */
 3150:       ptr = substitutes[-next - ESC_DU];
 3151:       next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
 3152:       if (temperrorcode != 0) return FALSE;
 3153:       ptr++;    /* For compatibility */
 3154:       }
 3155:     /* Fall through */
 3156: 
 3157:     case ESC_p:
 3158:     case ESC_P:
 3159:       {
             /* errorcodeptr is a plain local int that receives get_ucp()'s error
             code. Its value is never examined; only a negative return matters. */
 3160:       int ptype, pdata, errorcodeptr;
 3161:       BOOL negated;
 3162: 
 3163:       ptr--;      /* Make ptr point at the p or P */
 3164:       ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
 3165:       if (ptype < 0) return FALSE;
 3166:       ptr++;      /* Point past the final curly ket */
 3167: 
 3168:       /* If the property item is optional, we have to give up. (When generated
 3169:       from \d etc by PCRE_UCP, this test will have been applied much earlier,
 3170:       to the original \d etc. At this point, ptr will point to a zero byte.) */
 3171: 
 3172:       if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
 3173:         strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
 3174:           return FALSE;
 3175: 
 3176:       /* Do the property check. */
 3177: 
             /* The effective negation combines the escape letter (\P) with the
             negated flag set by get_ucp(); the two cancel when both are present. */
 3178:       return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
 3179:       }
 3180: #endif
 3181: 
 3182:     default:
 3183:     return FALSE;
 3184:     }
 3185: 
 3186:   /* In principle, support for Unicode properties should be integrated here as
 3187:   well. It means re-organizing the above code so as to get hold of the property
 3188:   values before switching on the op-code. However, I wonder how many patterns
 3189:   combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
 3190:   these op-codes are never generated.) */
 3191: 
         /* From here on both the repeated item and the next item are character
         types; each case lists the escapes for which TRUE is returned. */
 3192:   case OP_DIGIT:
 3193:   return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
 3194:          next == -ESC_h || next == -ESC_v || next == -ESC_R;
 3195: 
 3196:   case OP_NOT_DIGIT:
 3197:   return next == -ESC_d;
 3198: 
 3199:   case OP_WHITESPACE:
 3200:   return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
 3201: 
 3202:   case OP_NOT_WHITESPACE:
 3203:   return next == -ESC_s || next == -ESC_h || next == -ESC_v;
 3204: 
 3205:   case OP_HSPACE:
 3206:   return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
 3207:          next == -ESC_w || next == -ESC_v || next == -ESC_R;
 3208: 
 3209:   case OP_NOT_HSPACE:
 3210:   return next == -ESC_h;
 3211: 
 3212:   /* Can't have \S in here because VT matches \S (Perl anomaly) */
 3213:   case OP_ANYNL:
 3214:   case OP_VSPACE:
 3215:   return next == -ESC_V || next == -ESC_d || next == -ESC_w;
 3216: 
 3217:   case OP_NOT_VSPACE:
 3218:   return next == -ESC_v || next == -ESC_R;
 3219: 
 3220:   case OP_WORDCHAR:
 3221:   return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
 3222:          next == -ESC_v || next == -ESC_R;
 3223: 
 3224:   case OP_NOT_WORDCHAR:
 3225:   return next == -ESC_w || next == -ESC_d;
 3226: 
 3227:   default:
 3228:   return FALSE;
 3229:   }
 3230: 
 3231: /* Control does not reach here */
 3232: }
 3233: 
 3234: 
 3235: 
 3236: /*************************************************
 3237: *           Compile one branch                   *
 3238: *************************************************/
 3239: 
 3240: /* Scan the pattern, compiling it into a vector. If the options are
 3241: changed during the branch, the pointer is used to change the external options
 3242: bits. This function is used during the pre-compile phase when we are trying
 3243: to find out the amount of memory needed, as well as during the real compile
 3244: phase. The value of lengthptr distinguishes the two phases.
 3245: 
 3246: Arguments:
 3247:   optionsptr     pointer to the option bits
 3248:   codeptr        points to the pointer to the current code point
 3249:   ptrptr         points to the current pattern pointer
 3250:   errorcodeptr   points to error code variable
 3251:   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
 3252:   reqbyteptr     set to the last literal character required, else < 0
 3253:   bcptr          points to current branch chain
 3254:   cond_depth     conditional nesting depth
 3255:   cd             contains pointers to tables etc.
 3256:   lengthptr      NULL during the real compile phase
 3257:                  points to length accumulator during pre-compile phase
 3258: 
 3259: Returns:         TRUE on success
 3260:                  FALSE, with *errorcodeptr set non-zero on error
 3261: */
 3262: 
 3263: static BOOL
 3264: compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
 3265:   int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
 3266:   int cond_depth, compile_data *cd, int *lengthptr)
 3267: {
 3268: int repeat_type, op_type;
 3269: int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
 3270: int bravalue = 0;
 3271: int greedy_default, greedy_non_default;
 3272: int firstbyte, reqbyte;
 3273: int zeroreqbyte, zerofirstbyte;
 3274: int req_caseopt, reqvary, tempreqvary;
 3275: int options = *optionsptr;               /* May change dynamically */
 3276: int after_manual_callout = 0;
 3277: int length_prevgroup = 0;
 3278: register int c;
 3279: register uschar *code = *codeptr;
 3280: uschar *last_code = code;
 3281: uschar *orig_code = code;
 3282: uschar *tempcode;
 3283: BOOL inescq = FALSE;
 3284: BOOL groupsetfirstbyte = FALSE;
 3285: const uschar *ptr = *ptrptr;
 3286: const uschar *tempptr;
 3287: const uschar *nestptr = NULL;
 3288: uschar *previous = NULL;
 3289: uschar *previous_callout = NULL;
 3290: uschar *save_hwm = NULL;
 3291: uschar classbits[32];
 3292: 
 3293: /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
 3294: must not do this for other options (e.g. PCRE_EXTENDED) because they may change
 3295: dynamically as we process the pattern. */
 3296: 
 3297: #ifdef SUPPORT_UTF8
 3298: BOOL class_utf8;
 3299: BOOL utf8 = (options & PCRE_UTF8) != 0;
 3300: uschar *class_utf8data;
 3301: uschar *class_utf8data_base;
 3302: uschar utf8_char[6];
 3303: #else
 3304: BOOL utf8 = FALSE;
 3305: #endif
 3306: 
 3307: #ifdef PCRE_DEBUG
 3308: if (lengthptr != NULL) DPRINTF((">> start branch\n"));
 3309: #endif
 3310: 
 3311: /* Set up the default and non-default settings for greediness */
 3312: 
 3313: greedy_default = ((options & PCRE_UNGREEDY) != 0);
 3314: greedy_non_default = greedy_default ^ 1;
 3315: 
 3316: /* Initialize no first byte, no required byte. REQ_UNSET means "no char
 3317: matching encountered yet". It gets changed to REQ_NONE if we hit something that
 3318: matches a non-fixed char first char; reqbyte just remains unset if we never
 3319: find one.
 3320: 
 3321: When we hit a repeat whose minimum is zero, we may have to adjust these values
 3322: to take the zero repeat into account. This is implemented by setting them to
 3323: zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
 3324: item types that can be repeated set these backoff variables appropriately. */
 3325: 
 3326: firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
 3327: 
 3328: /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
 3329: according to the current setting of the caseless flag. REQ_CASELESS is a bit
 3330: value > 255. It is added into the firstbyte or reqbyte variables to record the
 3331: case status of the value. This is used only for ASCII characters. */
 3332: 
 3333: req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
 3334: 
 3335: /* Switch on next character until the end of the branch */
 3336: 
 3337: for (;; ptr++)
 3338:   {
 3339:   BOOL negate_class;
 3340:   BOOL should_flip_negation;
 3341:   BOOL possessive_quantifier;
 3342:   BOOL is_quantifier;
 3343:   BOOL is_recurse;
 3344:   BOOL reset_bracount;
 3345:   int class_charcount;
 3346:   int class_lastchar;
 3347:   int newoptions;
 3348:   int recno;
 3349:   int refsign;
 3350:   int skipbytes;
 3351:   int subreqbyte;
 3352:   int subfirstbyte;
 3353:   int terminator;
 3354:   int mclength;
 3355:   int tempbracount;
 3356:   uschar mcbuffer[8];
 3357: 
 3358:   /* Get next byte in the pattern */
 3359: 
 3360:   c = *ptr;
 3361: 
 3362:   /* If we are at the end of a nested substitution, revert to the outer level
 3363:   string. Nesting only happens one level deep. */
 3364: 
 3365:   if (c == 0 && nestptr != NULL)
 3366:     {
 3367:     ptr = nestptr;
 3368:     nestptr = NULL;
 3369:     c = *ptr;
 3370:     }
 3371: 
 3372:   /* If we are in the pre-compile phase, accumulate the length used for the
 3373:   previous cycle of this loop. */
 3374: 
 3375:   if (lengthptr != NULL)
 3376:     {
 3377: #ifdef PCRE_DEBUG
 3378:     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
 3379: #endif
 3380:     if (code > cd->start_workspace + cd->workspace_size -
 3381:         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
 3382:       {
 3383:       *errorcodeptr = ERR52;
 3384:       goto FAILED;
 3385:       }
 3386: 
 3387:     /* There is at least one situation where code goes backwards: this is the
 3388:     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
 3389:     the class is simply eliminated. However, it is created first, so we have to
 3390:     allow memory for it. Therefore, don't ever reduce the length at this point.
 3391:     */
 3392: 
 3393:     if (code < last_code) code = last_code;
 3394: 
 3395:     /* Paranoid check for integer overflow */
 3396: 
 3397:     if (OFLOW_MAX - *lengthptr < code - last_code)
 3398:       {
 3399:       *errorcodeptr = ERR20;
 3400:       goto FAILED;
 3401:       }
 3402: 
 3403:     *lengthptr += (int)(code - last_code);
 3404:     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, (int)(code - last_code),
 3405:       c));
 3406: 
 3407:     /* If "previous" is set and it is not at the start of the work space, move
 3408:     it back to there, in order to avoid filling up the work space. Otherwise,
 3409:     if "previous" is NULL, reset the current code pointer to the start. */
 3410: 
 3411:     if (previous != NULL)
 3412:       {
 3413:       if (previous > orig_code)
 3414:         {
 3415:         memmove(orig_code, previous, code - previous);
 3416:         code -= previous - orig_code;
 3417:         previous = orig_code;
 3418:         }
 3419:       }
 3420:     else code = orig_code;
 3421: 
 3422:     /* Remember where this code item starts so we can pick up the length
 3423:     next time round. */
 3424: 
 3425:     last_code = code;
 3426:     }
 3427: 
 3428:   /* In the real compile phase, just check the workspace used by the forward
 3429:   reference list. */
 3430: 
 3431:   else if (cd->hwm > cd->start_workspace + cd->workspace_size -
 3432:            WORK_SIZE_SAFETY_MARGIN)
 3433:     {
 3434:     *errorcodeptr = ERR52;
 3435:     goto FAILED;
 3436:     }
 3437: 
 3438:   /* If in \Q...\E, check for the end; if not, we have a literal */
 3439: 
 3440:   if (inescq && c != 0)
 3441:     {
 3442:     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
 3443:       {
 3444:       inescq = FALSE;
 3445:       ptr++;
 3446:       continue;
 3447:       }
 3448:     else
 3449:       {
 3450:       if (previous_callout != NULL)
 3451:         {
 3452:         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
 3453:           complete_callout(previous_callout, ptr, cd);
 3454:         previous_callout = NULL;
 3455:         }
 3456:       if ((options & PCRE_AUTO_CALLOUT) != 0)
 3457:         {
 3458:         previous_callout = code;
 3459:         code = auto_callout(code, ptr, cd);
 3460:         }
 3461:       goto NORMAL_CHAR;
 3462:       }
 3463:     }
 3464: 
 3465:   /* Fill in length of a previous callout, except when the next thing is
 3466:   a quantifier. */
 3467: 
 3468:   is_quantifier =
 3469:     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
 3470:     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
 3471: 
 3472:   if (!is_quantifier && previous_callout != NULL &&
 3473:        after_manual_callout-- <= 0)
 3474:     {
 3475:     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
 3476:       complete_callout(previous_callout, ptr, cd);
 3477:     previous_callout = NULL;
 3478:     }
 3479: 
 3480:   /* In extended mode, skip white space and comments. */
 3481: 
 3482:   if ((options & PCRE_EXTENDED) != 0)
 3483:     {
 3484:     if ((cd->ctypes[c] & ctype_space) != 0) continue;
 3485:     if (c == CHAR_NUMBER_SIGN)
 3486:       {
 3487:       ptr++;
 3488:       while (*ptr != 0)
 3489:         {
 3490:         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
 3491:         ptr++;
 3492: #ifdef SUPPORT_UTF8
 3493:         if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
 3494: #endif
 3495:         }
 3496:       if (*ptr != 0) continue;
 3497: 
 3498:       /* Else fall through to handle end of string */
 3499:       c = 0;
 3500:       }
 3501:     }
 3502: 
 3503:   /* No auto callout for quantifiers. */
 3504: 
 3505:   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
 3506:     {
 3507:     previous_callout = code;
 3508:     code = auto_callout(code, ptr, cd);
 3509:     }
 3510: 
 3511:   switch(c)
 3512:     {
 3513:     /* ===================================================================*/
 3514:     case 0:                        /* The branch terminates at string end */
 3515:     case CHAR_VERTICAL_LINE:       /* or | or ) */
 3516:     case CHAR_RIGHT_PARENTHESIS:
 3517:     *firstbyteptr = firstbyte;
 3518:     *reqbyteptr = reqbyte;
 3519:     *codeptr = code;
 3520:     *ptrptr = ptr;
 3521:     if (lengthptr != NULL)
 3522:       {
 3523:       if (OFLOW_MAX - *lengthptr < code - last_code)
 3524:         {
 3525:         *errorcodeptr = ERR20;
 3526:         goto FAILED;
 3527:         }
 3528:       *lengthptr += (int)(code - last_code);   /* To include callout length */
 3529:       DPRINTF((">> end branch\n"));
 3530:       }
 3531:     return TRUE;
 3532: 
 3533: 
 3534:     /* ===================================================================*/
 3535:     /* Handle single-character metacharacters. In multiline mode, ^ disables
 3536:     the setting of any following char as a first character. */
 3537: 
 3538:     case CHAR_CIRCUMFLEX_ACCENT:
 3539:     previous = NULL;
 3540:     if ((options & PCRE_MULTILINE) != 0)
 3541:       {
 3542:       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
 3543:       *code++ = OP_CIRCM;
 3544:       }
 3545:     else *code++ = OP_CIRC;
 3546:     break;
 3547: 
 3548:     case CHAR_DOLLAR_SIGN:
 3549:     previous = NULL;
 3550:     *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
 3551:     break;
 3552: 
 3553:     /* There can never be a first char if '.' is first, whatever happens about
 3554:     repeats. The value of reqbyte doesn't change either. */
 3555: 
 3556:     case CHAR_DOT:
 3557:     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
 3558:     zerofirstbyte = firstbyte;
 3559:     zeroreqbyte = reqbyte;
 3560:     previous = code;
 3561:     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
 3562:     break;
 3563: 
 3564: 
 3565:     /* ===================================================================*/
 3566:     /* Character classes. If the included characters are all < 256, we build a
 3567:     32-byte bitmap of the permitted characters, except in the special case
 3568:     where there is only one such character. For negated classes, we build the
 3569:     map as usual, then invert it at the end. However, we use a different opcode
 3570:     so that data characters > 255 can be handled correctly.
 3571: 
 3572:     If the class contains characters outside the 0-255 range, a different
 3573:     opcode is compiled. It may optionally have a bit map for characters < 256,
 3574:     but those above are explicitly listed afterwards. A flag byte tells
 3575:     whether the bitmap is present, and whether this is a negated class or not.
 3576: 
 3577:     In JavaScript compatibility mode, an isolated ']' causes an error. In
 3578:     default (Perl) mode, it is treated as a data character. */
 3579: 
 3580:     case CHAR_RIGHT_SQUARE_BRACKET:
 3581:     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
 3582:       {
 3583:       *errorcodeptr = ERR64;
 3584:       goto FAILED;
 3585:       }
 3586:     goto NORMAL_CHAR;
 3587: 
 3588:     case CHAR_LEFT_SQUARE_BRACKET:
 3589:     previous = code;
 3590: 
 3591:     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
 3592:     they are encountered at the top level, so we'll do that too. */
 3593: 
 3594:     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
 3595:          ptr[1] == CHAR_EQUALS_SIGN) &&
 3596:         check_posix_syntax(ptr, &tempptr))
 3597:       {
 3598:       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
 3599:       goto FAILED;
 3600:       }
 3601: 
 3602:     /* If the first character is '^', set the negation flag and skip it. Also,
 3603:     if the first few characters (either before or after ^) are \Q\E or \E we
 3604:     skip them too. This makes for compatibility with Perl. */
 3605: 
 3606:     negate_class = FALSE;
 3607:     for (;;)
 3608:       {
 3609:       c = *(++ptr);
 3610:       if (c == CHAR_BACKSLASH)
 3611:         {
 3612:         if (ptr[1] == CHAR_E)
 3613:           ptr++;
 3614:         else if (strncmp((const char *)ptr+1,
 3615:                           STR_Q STR_BACKSLASH STR_E, 3) == 0)
 3616:           ptr += 3;
 3617:         else
 3618:           break;
 3619:         }
 3620:       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
 3621:         negate_class = TRUE;
 3622:       else break;
 3623:       }
 3624: 
 3625:     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
 3626:     an initial ']' is taken as a data character -- the code below handles
 3627:     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
 3628:     [^] must match any character, so generate OP_ALLANY. */
 3629: 
 3630:     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
 3631:         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
 3632:       {
 3633:       *code++ = negate_class? OP_ALLANY : OP_FAIL;
 3634:       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
 3635:       zerofirstbyte = firstbyte;
 3636:       break;
 3637:       }
 3638: 
 3639:     /* If a class contains a negative special such as \S, we need to flip the
 3640:     negation flag at the end, so that support for characters > 255 works
 3641:     correctly (they are all included in the class). */
 3642: 
 3643:     should_flip_negation = FALSE;
 3644: 
 3645:     /* Keep a count of chars with values < 256 so that we can optimize the case
 3646:     of just a single character (as long as it's < 256). However, for higher
 3647:     valued UTF-8 characters, we don't yet do any optimization. */
 3648: 
 3649:     class_charcount = 0;
 3650:     class_lastchar = -1;
 3651: 
 3652:     /* Initialize the 32-char bit map to all zeros. We build the map in a
 3653:     temporary bit of memory, in case the class contains only 1 character (less
 3654:     than 256), because in that case the compiled code doesn't use the bit map.
 3655:     */
 3656: 
 3657:     memset(classbits, 0, 32 * sizeof(uschar));
 3658: 
 3659: #ifdef SUPPORT_UTF8
 3660:     class_utf8 = FALSE;                       /* No chars >= 256 */
 3661:     class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
 3662:     class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
 3663: #endif
 3664: 
 3665:     /* Process characters until ] is reached. By writing this as a "do" it
 3666:     means that an initial ] is taken as a data character. At the start of the
 3667:     loop, c contains the first byte of the character. */
 3668: 
 3669:     if (c != 0) do
 3670:       {
 3671:       const uschar *oldptr;
 3672: 
 3673: #ifdef SUPPORT_UTF8
 3674:       if (utf8 && c > 127)
 3675:         {                           /* Braces are required because the */
 3676:         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
 3677:         }
 3678: 
 3679:       /* In the pre-compile phase, accumulate the length of any UTF-8 extra
 3680:       data and reset the pointer. This is so that very large classes that
 3681:       contain a zillion UTF-8 characters no longer overwrite the work space
 3682:       (which is on the stack). */
 3683: 
 3684:       if (lengthptr != NULL)
 3685:         {
 3686:         *lengthptr += (int)(class_utf8data - class_utf8data_base);
 3687:         class_utf8data = class_utf8data_base;
 3688:         }
 3689: 
 3690: #endif
 3691: 
 3692:       /* Inside \Q...\E everything is literal except \E */
 3693: 
 3694:       if (inescq)
 3695:         {
 3696:         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
 3697:           {
 3698:           inescq = FALSE;                   /* Reset literal state */
 3699:           ptr++;                            /* Skip the 'E' */
 3700:           continue;                         /* Carry on with next */
 3701:           }
 3702:         goto CHECK_RANGE;                   /* Could be range if \E follows */
 3703:         }
 3704: 
 3705:       /* Handle POSIX class names. Perl allows a negation extension of the
 3706:       form [:^name:]. A square bracket that doesn't match the syntax is
 3707:       treated as a literal. We also recognize the POSIX constructions
 3708:       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
 3709:       5.6 and 5.8 do. */
 3710: 
 3711:       if (c == CHAR_LEFT_SQUARE_BRACKET &&
 3712:           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
 3713:            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
 3714:         {
 3715:         BOOL local_negate = FALSE;
 3716:         int posix_class, taboffset, tabopt;
 3717:         register const uschar *cbits = cd->cbits;
 3718:         uschar pbits[32];
 3719: 
 3720:         if (ptr[1] != CHAR_COLON)
 3721:           {
 3722:           *errorcodeptr = ERR31;
 3723:           goto FAILED;
 3724:           }
 3725: 
 3726:         ptr += 2;
 3727:         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
 3728:           {
 3729:           local_negate = TRUE;
 3730:           should_flip_negation = TRUE;  /* Note negative special */
 3731:           ptr++;
 3732:           }
 3733: 
 3734:         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
 3735:         if (posix_class < 0)
 3736:           {
 3737:           *errorcodeptr = ERR30;
 3738:           goto FAILED;
 3739:           }
 3740: 
 3741:         /* If matching is caseless, upper and lower are converted to
 3742:         alpha. This relies on the fact that the class table starts with
 3743:         alpha, lower, upper as the first 3 entries. */
 3744: 
 3745:         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
 3746:           posix_class = 0;
 3747: 
 3748:         /* When PCRE_UCP is set, some of the POSIX classes are converted to
 3749:         different escape sequences that use Unicode properties. */
 3750: 
 3751: #ifdef SUPPORT_UCP
 3752:         if ((options & PCRE_UCP) != 0)
 3753:           {
 3754:           int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
 3755:           if (posix_substitutes[pc] != NULL)
 3756:             {
 3757:             nestptr = tempptr + 1;
 3758:             ptr = posix_substitutes[pc] - 1;
 3759:             continue;
 3760:             }
 3761:           }
 3762: #endif
 3763:         /* In the non-UCP case, we build the bit map for the POSIX class in a
 3764:         chunk of local store because we may be adding and subtracting from it,
 3765:         and we don't want to subtract bits that may be in the main map already.
 3766:         At the end we or the result into the bit map that is being built. */
 3767: 
 3768:         posix_class *= 3;
 3769: 
 3770:         /* Copy in the first table (always present) */
 3771: 
 3772:         memcpy(pbits, cbits + posix_class_maps[posix_class],
 3773:           32 * sizeof(uschar));
 3774: 
 3775:         /* If there is a second table, add or remove it as required. */
 3776: 
 3777:         taboffset = posix_class_maps[posix_class + 1];
 3778:         tabopt = posix_class_maps[posix_class + 2];
 3779: 
 3780:         if (taboffset >= 0)
 3781:           {
 3782:           if (tabopt >= 0)
 3783:             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
 3784:           else
 3785:             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
 3786:           }
 3787: 
 3788:         /* Now see if we need to remove any special characters. An option
 3789:         value of 1 removes vertical space and 2 removes underscore. */
 3790: 
 3791:         if (tabopt < 0) tabopt = -tabopt;
 3792:         if (tabopt == 1) pbits[1] &= ~0x3c;
 3793:           else if (tabopt == 2) pbits[11] &= 0x7f;
 3794: 
 3795:         /* Add the POSIX table or its complement into the main table that is
 3796:         being built and we are done. */
 3797: 
 3798:         if (local_negate)
 3799:           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
 3800:         else
 3801:           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
 3802: 
 3803:         ptr = tempptr + 1;
 3804:         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
 3805:         continue;    /* End of POSIX syntax handling */
 3806:         }
 3807: 
 3808:       /* Backslash may introduce a single character, or it may introduce one
 3809:       of the specials, which just set a flag. The sequence \b is a special
 3810:       case. Inside a class (and only there) it is treated as backspace. We
 3811:       assume that other escapes have more than one character in them, so set
 3812:       class_charcount bigger than one. Unrecognized escapes fall through and
 3813:       are either treated as literal characters (by default), or are faulted if
 3814:       PCRE_EXTRA is set. */
 3815: 
 3816:       if (c == CHAR_BACKSLASH)
 3817:         {
 3818:         c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
 3819:         if (*errorcodeptr != 0) goto FAILED;
 3820: 
 3821:         if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
 3822:         else if (-c == ESC_N)            /* \N is not supported in a class */
 3823:           {
 3824:           *errorcodeptr = ERR71;
 3825:           goto FAILED;
 3826:           }
 3827:         else if (-c == ESC_Q)            /* Handle start of quoted string */
 3828:           {
 3829:           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
 3830:             {
 3831:             ptr += 2; /* avoid empty string */
 3832:             }
 3833:           else inescq = TRUE;
 3834:           continue;
 3835:           }
 3836:         else if (-c == ESC_E) continue;  /* Ignore orphan \E */
 3837: 
 3838:         if (c < 0)
 3839:           {
 3840:           register const uschar *cbits = cd->cbits;
 3841:           class_charcount += 2;     /* Greater than 1 is what matters */
 3842: 
 3843:           switch (-c)
 3844:             {
 3845: #ifdef SUPPORT_UCP
 3846:             case ESC_du:     /* These are the values given for \d etc */
 3847:             case ESC_DU:     /* when PCRE_UCP is set. We replace the */
 3848:             case ESC_wu:     /* escape sequence with an appropriate \p */
 3849:             case ESC_WU:     /* or \P to test Unicode properties instead */
 3850:             case ESC_su:     /* of the default ASCII testing. */
 3851:             case ESC_SU:
 3852:             nestptr = ptr;
 3853:             ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
 3854:             class_charcount -= 2;                /* Undo! */
 3855:             continue;
 3856: #endif
 3857:             case ESC_d:
 3858:             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
 3859:             continue;
 3860: 
 3861:             case ESC_D:
 3862:             should_flip_negation = TRUE;
 3863:             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
 3864:             continue;
 3865: 
 3866:             case ESC_w:
 3867:             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
 3868:             continue;
 3869: 
 3870:             case ESC_W:
 3871:             should_flip_negation = TRUE;
 3872:             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
 3873:             continue;
 3874: 
 3875:             /* Perl 5.004 onwards omits VT from \s, but we must preserve it
 3876:             if it was previously set by something earlier in the character
 3877:             class. */
 3878: 
 3879:             case ESC_s:
 3880:             classbits[0] |= cbits[cbit_space];
 3881:             classbits[1] |= cbits[cbit_space+1] & ~0x08;
 3882:             for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
 3883:             continue;
 3884: 
 3885:             case ESC_S:
 3886:             should_flip_negation = TRUE;
 3887:             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
 3888:             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
 3889:             continue;
 3890: 
 3891:             case ESC_h:
 3892:             SETBIT(classbits, 0x09); /* HT */
 3893:             SETBIT(classbits, 0x20); /* SPACE */
 3894:             SETBIT(classbits, 0xa0); /* NBSP */
 3895: #ifdef SUPPORT_UTF8
 3896:             if (utf8)
 3897:               {
 3898:               class_utf8 = TRUE;
 3899:               *class_utf8data++ = XCL_SINGLE;
 3900:               class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
 3901:               *class_utf8data++ = XCL_SINGLE;
 3902:               class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
 3903:               *class_utf8data++ = XCL_RANGE;
 3904:               class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
 3905:               class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
 3906:               *class_utf8data++ = XCL_SINGLE;
 3907:               class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
 3908:               *class_utf8data++ = XCL_SINGLE;
 3909:               class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
 3910:               *class_utf8data++ = XCL_SINGLE;
 3911:               class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
 3912:               }
 3913: #endif
 3914:             continue;
 3915: 
 3916:             case ESC_H:
 3917:             for (c = 0; c < 32; c++)
 3918:               {
 3919:               int x = 0xff;
 3920:               switch (c)
 3921:                 {
 3922:                 case 0x09/8: x ^= 1 << (0x09%8); break;
 3923:                 case 0x20/8: x ^= 1 << (0x20%8); break;
 3924:                 case 0xa0/8: x ^= 1 << (0xa0%8); break;
 3925:                 default: break;
 3926:                 }
 3927:               classbits[c] |= x;
 3928:               }
 3929: 
 3930: #ifdef SUPPORT_UTF8
 3931:             if (utf8)
 3932:               {
 3933:               class_utf8 = TRUE;
 3934:               *class_utf8data++ = XCL_RANGE;
 3935:               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
 3936:               class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
 3937:               *class_utf8data++ = XCL_RANGE;
 3938:               class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
 3939:               class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
 3940:               *class_utf8data++ = XCL_RANGE;
 3941:               class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
 3942:               class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
 3943:               *class_utf8data++ = XCL_RANGE;
 3944:               class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
 3945:               class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
 3946:               *class_utf8data++ = XCL_RANGE;
 3947:               class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
 3948:               class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
 3949:               *class_utf8data++ = XCL_RANGE;
 3950:               class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
 3951:               class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
 3952:               *class_utf8data++ = XCL_RANGE;
 3953:               class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
 3954:               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
 3955:               }
 3956: #endif
 3957:             continue;
 3958: 
 3959:             case ESC_v:
 3960:             SETBIT(classbits, 0x0a); /* LF */
 3961:             SETBIT(classbits, 0x0b); /* VT */
 3962:             SETBIT(classbits, 0x0c); /* FF */
 3963:             SETBIT(classbits, 0x0d); /* CR */
 3964:             SETBIT(classbits, 0x85); /* NEL */
 3965: #ifdef SUPPORT_UTF8
 3966:             if (utf8)
 3967:               {
 3968:               class_utf8 = TRUE;
 3969:               *class_utf8data++ = XCL_RANGE;
 3970:               class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
 3971:               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
 3972:               }
 3973: #endif
 3974:             continue;
 3975: 
 3976:             case ESC_V:
 3977:             for (c = 0; c < 32; c++)
 3978:               {
 3979:               int x = 0xff;
 3980:               switch (c)
 3981:                 {
 3982:                 case 0x0a/8: x ^= 1 << (0x0a%8);
 3983:                              x ^= 1 << (0x0b%8);
 3984:                              x ^= 1 << (0x0c%8);
 3985:                              x ^= 1 << (0x0d%8);
 3986:                              break;
 3987:                 case 0x85/8: x ^= 1 << (0x85%8); break;
 3988:                 default: break;
 3989:                 }
 3990:               classbits[c] |= x;
 3991:               }
 3992: 
 3993: #ifdef SUPPORT_UTF8
 3994:             if (utf8)
 3995:               {
 3996:               class_utf8 = TRUE;
 3997:               *class_utf8data++ = XCL_RANGE;
 3998:               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
 3999:               class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
 4000:               *class_utf8data++ = XCL_RANGE;
 4001:               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
 4002:               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
 4003:               }
 4004: #endif
 4005:             continue;
 4006: 
 4007: #ifdef SUPPORT_UCP
 4008:             case ESC_p:
 4009:             case ESC_P:
 4010:               {
 4011:               BOOL negated;
 4012:               int pdata;
 4013:               int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
 4014:               if (ptype < 0) goto FAILED;
 4015:               class_utf8 = TRUE;
 4016:               *class_utf8data++ = ((-c == ESC_p) != negated)?
 4017:                 XCL_PROP : XCL_NOTPROP;
 4018:               *class_utf8data++ = ptype;
 4019:               *class_utf8data++ = pdata;
 4020:               class_charcount -= 2;   /* Not a < 256 character */
 4021:               continue;
 4022:               }
 4023: #endif
 4024:             /* Unrecognized escapes are faulted if PCRE is running in its
 4025:             strict mode. By default, for compatibility with Perl, they are
 4026:             treated as literals. */
 4027: 
 4028:             default:
 4029:             if ((options & PCRE_EXTRA) != 0)
 4030:               {
 4031:               *errorcodeptr = ERR7;
 4032:               goto FAILED;
 4033:               }
 4034:             class_charcount -= 2;  /* Undo the default count from above */
 4035:             c = *ptr;              /* Get the final character and fall through */
 4036:             break;
 4037:             }
 4038:           }
 4039: 
 4040:         /* Fall through if we have a single character (c >= 0). This may be
 4041:         greater than 256 in UTF-8 mode. */
 4042: 
 4043:         }   /* End of backslash handling */
 4044: 
 4045:       /* A single character may be followed by '-' to form a range. However,
 4046:       Perl does not permit ']' to be the end of the range. A '-' character
 4047:       at the end is treated as a literal. Perl ignores orphaned \E sequences
 4048:       entirely. The code for handling \Q and \E is messy. */
 4049: 
 4050:       CHECK_RANGE:
 4051:       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
 4052:         {
 4053:         inescq = FALSE;
 4054:         ptr += 2;
 4055:         }
 4056: 
 4057:       oldptr = ptr;
 4058: 
 4059:       /* Remember \r or \n */
 4060: 
 4061:       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
 4062: 
 4063:       /* Check for range */
 4064: 
 4065:       if (!inescq && ptr[1] == CHAR_MINUS)
 4066:         {
 4067:         int d;
 4068:         ptr += 2;
 4069:         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
 4070: 
 4071:         /* If we hit \Q (not followed by \E) at this point, go into escaped
 4072:         mode. */
 4073: 
 4074:         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
 4075:           {
 4076:           ptr += 2;
 4077:           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
 4078:             { ptr += 2; continue; }
 4079:           inescq = TRUE;
 4080:           break;
 4081:           }
 4082: 
 4083:         if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
 4084:           {
 4085:           ptr = oldptr;
 4086:           goto LONE_SINGLE_CHARACTER;
 4087:           }
 4088: 
 4089: #ifdef SUPPORT_UTF8
 4090:         if (utf8)
 4091:           {                           /* Braces are required because the */
 4092:           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
 4093:           }
 4094:         else
 4095: #endif
 4096:         d = *ptr;  /* Not UTF-8 mode */
 4097: 
 4098:         /* The second part of a range can be a single-character escape, but
 4099:         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
 4100:         in such circumstances. */
 4101: 
 4102:         if (!inescq && d == CHAR_BACKSLASH)
 4103:           {
 4104:           d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
 4105:           if (*errorcodeptr != 0) goto FAILED;
 4106: 
 4107:           /* \b is backspace; any other special means the '-' was literal */
 4108: 
 4109:           if (d < 0)
 4110:             {
 4111:             if (d == -ESC_b) d = CHAR_BS; else
 4112:               {
 4113:               ptr = oldptr;
 4114:               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
 4115:               }
 4116:             }
 4117:           }
 4118: 
 4119:         /* Check that the two values are in the correct order. Optimize
 4120:         one-character ranges */
 4121: 
 4122:         if (d < c)
 4123:           {
 4124:           *errorcodeptr = ERR8;
 4125:           goto FAILED;
 4126:           }
 4127: 
 4128:         if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
 4129: 
 4130:         /* Remember \r or \n */
 4131: 
 4132:         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
 4133: 
 4134:         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
 4135:         matching, we have to use an XCLASS with extra data items. Caseless
 4136:         matching for characters > 127 is available only if UCP support is
 4137:         available. */
 4138: 
 4139: #ifdef SUPPORT_UTF8
 4140:         if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
 4141:           {
 4142:           class_utf8 = TRUE;
 4143: 
 4144:           /* With UCP support, we can find the other case equivalents of
 4145:           the relevant characters. There may be several ranges. Optimize how
 4146:           they fit with the basic range. */
 4147: 
 4148: #ifdef SUPPORT_UCP
 4149:           if ((options & PCRE_CASELESS) != 0)
 4150:             {
 4151:             unsigned int occ, ocd;
 4152:             unsigned int cc = c;
 4153:             unsigned int origd = d;
 4154:             while (get_othercase_range(&cc, origd, &occ, &ocd))
 4155:               {
 4156:               if (occ >= (unsigned int)c &&
 4157:                   ocd <= (unsigned int)d)
 4158:                 continue;                          /* Skip embedded ranges */
 4159: 
 4160:               if (occ < (unsigned int)c  &&
 4161:                   ocd >= (unsigned int)c - 1)      /* Extend the basic range */
 4162:                 {                                  /* if there is overlap,   */
 4163:                 c = occ;                           /* noting that if occ < c */
 4164:                 continue;                          /* we can't have ocd > d  */
 4165:                 }                                  /* because a subrange is  */
 4166:               if (ocd > (unsigned int)d &&
 4167:                   occ <= (unsigned int)d + 1)      /* always shorter than    */
 4168:                 {                                  /* the basic range.       */
 4169:                 d = ocd;
 4170:                 continue;
 4171:                 }
 4172: 
 4173:               if (occ == ocd)
 4174:                 {
 4175:                 *class_utf8data++ = XCL_SINGLE;
 4176:                 }
 4177:               else
 4178:                 {
 4179:                 *class_utf8data++ = XCL_RANGE;
 4180:                 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
 4181:                 }
 4182:               class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
 4183:               }
 4184:             }
 4185: #endif  /* SUPPORT_UCP */
 4186: 
 4187:           /* Now record the original range, possibly modified for UCP caseless
 4188:           overlapping ranges. */
 4189: 
 4190:           *class_utf8data++ = XCL_RANGE;
 4191:           class_utf8data += _pcre_ord2utf8(c, class_utf8data);
 4192:           class_utf8data += _pcre_ord2utf8(d, class_utf8data);
 4193: 
 4194:           /* With UCP support, we are done. Without UCP support, there is no
 4195:           caseless matching for UTF-8 characters > 127; we can use the bit map
 4196:           for the smaller ones. */
 4197: 
 4198: #ifdef SUPPORT_UCP
 4199:           continue;    /* With next character in the class */
 4200: #else
 4201:           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
 4202: 
 4203:           /* Adjust upper limit and fall through to set up the map */
 4204: 
 4205:           d = 127;
 4206: 
 4207: #endif  /* SUPPORT_UCP */
 4208:           }
 4209: #endif  /* SUPPORT_UTF8 */
 4210: 
 4211:         /* We use the bit map for all cases when not in UTF-8 mode; else
 4212:         ranges that lie entirely within 0-127 when there is UCP support; else
 4213:         for partial ranges without UCP support. */
 4214: 
 4215:         class_charcount += d - c + 1;
 4216:         class_lastchar = d;
 4217: 
 4218:         /* We can save a bit of time by skipping this in the pre-compile. */
 4219: 
 4220:         if (lengthptr == NULL) for (; c <= d; c++)
 4221:           {
 4222:           classbits[c/8] |= (1 << (c&7));
 4223:           if ((options & PCRE_CASELESS) != 0)
 4224:             {
 4225:             int uc = cd->fcc[c];           /* flip case */
 4226:             classbits[uc/8] |= (1 << (uc&7));
 4227:             }
 4228:           }
 4229: 
 4230:         continue;   /* Go get the next char in the class */
 4231:         }
 4232: 
 4233:       /* Handle a lone single character - we can get here for a normal
 4234:       non-escape char, or after \ that introduces a single character or for an
 4235:       apparent range that isn't. */
 4236: 
 4237:       LONE_SINGLE_CHARACTER:
 4238: 
 4239:       /* Handle a character that cannot go in the bit map */
 4240: 
 4241: #ifdef SUPPORT_UTF8
 4242:       if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
 4243:         {
 4244:         class_utf8 = TRUE;
 4245:         *class_utf8data++ = XCL_SINGLE;
 4246:         class_utf8data += _pcre_ord2utf8(c, class_utf8data);
 4247: 
 4248: #ifdef SUPPORT_UCP
 4249:         if ((options & PCRE_CASELESS) != 0)
 4250:           {
 4251:           unsigned int othercase;
 4252:           if ((othercase = UCD_OTHERCASE(c)) != c)
 4253:             {
 4254:             *class_utf8data++ = XCL_SINGLE;
 4255:             class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
 4256:             }
 4257:           }
 4258: #endif  /* SUPPORT_UCP */
 4259: 
 4260:         }
 4261:       else
 4262: #endif  /* SUPPORT_UTF8 */
 4263: 
 4264:       /* Handle a single-byte character */
 4265:         {
 4266:         classbits[c/8] |= (1 << (c&7));
 4267:         if ((options & PCRE_CASELESS) != 0)
 4268:           {
 4269:           c = cd->fcc[c];   /* flip case */
 4270:           classbits[c/8] |= (1 << (c&7));
 4271:           }
 4272:         class_charcount++;
 4273:         class_lastchar = c;
 4274:         }
 4275:       }
 4276: 
 4277:     /* Loop until ']' reached. This "while" is the end of the "do" far above.
 4278:     If we are at the end of an internal nested string, revert to the outer
 4279:     string. */
 4280: 
 4281:     while (((c = *(++ptr)) != 0 ||
 4282:            (nestptr != NULL &&
 4283:              (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&
 4284:            (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
 4285: 
 4286:     /* Check for missing terminating ']' */
 4287: 
 4288:     if (c == 0)
 4289:       {
 4290:       *errorcodeptr = ERR6;
 4291:       goto FAILED;
 4292:       }
 4293: 
 4294:     /* If class_charcount is 1, we saw precisely one character whose value is
 4295:     less than 256. As long as there were no characters >= 128 and there was no
 4296:     use of \p or \P, in other words, no use of any XCLASS features, we can
 4297:     optimize.
 4298: 
 4299:     In UTF-8 mode, we can optimize the negative case only if there were no
 4300:     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
 4301:     operate on single-byte characters only. This is an historical hangover.
 4302:     Maybe one day we can tidy these opcodes to handle multi-byte characters.
 4303: 
 4304:     The optimization throws away the bit map. We turn the item into a
 4305:     1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
 4306:     Note that OP_NOT[I] does not support multibyte characters. In the positive
 4307:     case, it can cause firstbyte to be set. Otherwise, there can be no first
 4308:     char if this item is first, whatever repeat count may follow. In the case
 4309:     of reqbyte, save the previous value for reinstating. */
 4310: 
 4311: #ifdef SUPPORT_UTF8
 4312:     if (class_charcount == 1 && !class_utf8 &&
 4313:       (!utf8 || !negate_class || class_lastchar < 128))
 4314: #else
 4315:     if (class_charcount == 1)
 4316: #endif
 4317:       {
 4318:       zeroreqbyte = reqbyte;
 4319: 
 4320:       /* The OP_NOT[I] opcodes work on one-byte characters only. */
 4321: 
 4322:       if (negate_class)
 4323:         {
 4324:         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
 4325:         zerofirstbyte = firstbyte;
 4326:         *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
 4327:         *code++ = class_lastchar;
 4328:         break;
 4329:         }
 4330: 
 4331:       /* For a single, positive character, get the value into mcbuffer, and
 4332:       then we can handle this with the normal one-character code. */
 4333: 
 4334: #ifdef SUPPORT_UTF8
 4335:       if (utf8 && class_lastchar > 127)
 4336:         mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
 4337:       else
 4338: #endif
 4339:         {
 4340:         mcbuffer[0] = class_lastchar;
 4341:         mclength = 1;
 4342:         }
 4343:       goto ONE_CHAR;
 4344:       }       /* End of 1-char optimization */
 4345: 
 4346:     /* The general case - not the one-char optimization. If this is the first
 4347:     thing in the branch, there can be no first char setting, whatever the
 4348:     repeat count. Any reqbyte setting must remain unchanged after any kind of
 4349:     repeat. */
 4350: 
 4351:     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
 4352:     zerofirstbyte = firstbyte;
 4353:     zeroreqbyte = reqbyte;
 4354: 
 4355:     /* If there are characters with values > 255, we have to compile an
 4356:     extended class, with its own opcode, unless there was a negated special
 4357:     such as \S in the class, and PCRE_UCP is not set, because in that case all
 4358:     characters > 255 are in the class, so any that were explicitly given as
 4359:     well can be ignored. If (when there are explicit characters > 255 that must
 4360:     be listed) there are no characters < 256, we can omit the bitmap in the
 4361:     actual compiled code. */
 4362: 
 4363: #ifdef SUPPORT_UTF8
 4364:     if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))
 4365:       {
 4366:       *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
 4367:       *code++ = OP_XCLASS;
 4368:       code += LINK_SIZE;
 4369:       *code = negate_class? XCL_NOT : 0;
 4370: 
 4371:       /* If the map is required, move up the extra data to make room for it;
 4372:       otherwise just move the code pointer to the end of the extra data. */
 4373: 
 4374:       if (class_charcount > 0)
 4375:         {
 4376:         *code++ |= XCL_MAP;
 4377:         memmove(code + 32, code, class_utf8data - code);
 4378:         memcpy(code, classbits, 32);
 4379:         code = class_utf8data + 32;
 4380:         }
 4381:       else code = class_utf8data;
 4382: 
 4383:       /* Now fill in the complete length of the item */
 4384: 
 4385:       PUT(previous, 1, (int)(code - previous));
 4386:       break;   /* End of class handling */
 4387:       }
 4388: #endif
 4389: 
 4390:     /* If there are no characters > 255, or they are all to be included or
 4391:     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
 4392:     whole class was negated and whether there were negative specials such as \S
 4393:     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
 4394:     negating it if necessary. */
 4395: 
 4396:     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
 4397:     if (negate_class)
 4398:       {
 4399:       if (lengthptr == NULL)    /* Save time in the pre-compile phase */
 4400:         for (c = 0; c < 32; c++) code[c] = ~classbits[c];
 4401:       }
 4402:     else
 4403:       {
 4404:       memcpy(code, classbits, 32);
 4405:       }
 4406:     code += 32;
 4407:     break;
 4408: 
 4409: 
 4410:     /* ===================================================================*/
 4411:     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
 4412:     has been tested above. */
 4413: 
 4414:     case CHAR_LEFT_CURLY_BRACKET:
 4415:     if (!is_quantifier) goto NORMAL_CHAR;
 4416:     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
 4417:     if (*errorcodeptr != 0) goto FAILED;
 4418:     goto REPEAT;
 4419: 
 4420:     case CHAR_ASTERISK:
 4421:     repeat_min = 0;
 4422:     repeat_max = -1;
 4423:     goto REPEAT;
 4424: 
 4425:     case CHAR_PLUS:
 4426:     repeat_min = 1;
 4427:     repeat_max = -1;
 4428:     goto REPEAT;
 4429: 
 4430:     case CHAR_QUESTION_MARK:
 4431:     repeat_min = 0;
 4432:     repeat_max = 1;
 4433: 
 4434:     REPEAT:
 4435:     if (previous == NULL)
 4436:       {
 4437:       *errorcodeptr = ERR9;
 4438:       goto FAILED;
 4439:       }
 4440: 
 4441:     if (repeat_min == 0)
 4442:       {
 4443:       firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
 4444:       reqbyte = zeroreqbyte;        /* Ditto */
 4445:       }
 4446: 
 4447:     /* Remember whether this is a variable length repeat */
 4448: 
 4449:     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
 4450: 
 4451:     op_type = 0;                    /* Default single-char op codes */
 4452:     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
 4453: 
 4454:     /* Save start of previous item, in case we have to move it up in order to
 4455:     insert something before it. */
 4456: 
 4457:     tempcode = previous;
 4458: 
 4459:     /* If the next character is '+', we have a possessive quantifier. This
 4460:     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
 4461:     If the next character is '?' this is a minimizing repeat, by default,
 4462:     but if PCRE_UNGREEDY is set, it works the other way round. We change the
 4463:     repeat type to the non-default. */
 4464: 
 4465:     if (ptr[1] == CHAR_PLUS)
 4466:       {
 4467:       repeat_type = 0;                  /* Force greedy */
 4468:       possessive_quantifier = TRUE;
 4469:       ptr++;
 4470:       }
 4471:     else if (ptr[1] == CHAR_QUESTION_MARK)
 4472:       {
 4473:       repeat_type = greedy_non_default;
 4474:       ptr++;
 4475:       }
 4476:     else repeat_type = greedy_default;
 4477: 
 4478:     /* If previous was a recursion call, wrap it in atomic brackets so that
 4479:     previous becomes the atomic group. All recursions were so wrapped in the
 4480:     past, but it no longer happens for non-repeated recursions. In fact, the
 4481:     repeated ones could be re-implemented independently so as not to need this,
 4482:     but for the moment we rely on the code for repeating groups. */
 4483: 
 4484:     if (*previous == OP_RECURSE)
 4485:       {
 4486:       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
 4487:       *previous = OP_ONCE;
 4488:       PUT(previous, 1, 2 + 2*LINK_SIZE);
 4489:       previous[2 + 2*LINK_SIZE] = OP_KET;
 4490:       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
 4491:       code += 2 + 2 * LINK_SIZE;
 4492:       length_prevgroup = 3 + 3*LINK_SIZE;
 4493: 
 4494:       /* When actually compiling, we need to check whether this was a forward
 4495:       reference, and if so, adjust the offset. */
 4496: 
 4497:       if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
 4498:         {
 4499:         int offset = GET(cd->hwm, -LINK_SIZE);
 4500:         if (offset == previous + 1 - cd->start_code)
 4501:           PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
 4502:         }
 4503:       }
 4504: 
 4505:     /* Now handle repetition for the different types of item. */
 4506: 
 4507:     /* If previous was a character match, abolish the item and generate a
 4508:     repeat item instead. If a char item has a minimum of more than one, ensure
 4509:     that it is set in reqbyte - it might not be if a sequence such as x{3} is
 4510:     the first thing in a branch because the x will have gone into firstbyte
 4511:     instead.  */
 4512: 
 4513:     if (*previous == OP_CHAR || *previous == OP_CHARI)
 4514:       {
 4515:       op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;
 4516: 
 4517:       /* Deal with UTF-8 characters that take up more than one byte. It's
 4518:       easier to write this out separately than try to macrify it. Use c to
 4519:       hold the length of the character in bytes, plus 0x80 to flag that it's a
 4520:       length rather than a small character. */
 4521: 
 4522: #ifdef SUPPORT_UTF8
 4523:       if (utf8 && (code[-1] & 0x80) != 0)
 4524:         {
 4525:         uschar *lastchar = code - 1;
 4526:         while((*lastchar & 0xc0) == 0x80) lastchar--;
 4527:         c = (int)(code - lastchar);     /* Length of UTF-8 character */
 4528:         memcpy(utf8_char, lastchar, c); /* Save the char */
 4529:         c |= 0x80;                      /* Flag c as a length */
 4530:         }
 4531:       else
 4532: #endif
 4533: 
 4534:       /* Handle the case of a single byte - either with no UTF8 support, or
 4535:       with UTF-8 disabled, or for a UTF-8 character < 128. */
 4536: 
 4537:         {
 4538:         c = code[-1];
 4539:         if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
 4540:         }
 4541: 
 4542:       /* If the repetition is unlimited, it pays to see if the next thing on
 4543:       the line is something that cannot possibly match this character. If so,
 4544:       automatically possessifying this item gains some performance in the case
 4545:       where the match fails. */
 4546: 
 4547:       if (!possessive_quantifier &&
 4548:           repeat_max < 0 &&
 4549:           check_auto_possessive(previous, utf8, ptr + 1, options, cd))
 4550:         {
 4551:         repeat_type = 0;    /* Force greedy */
 4552:         possessive_quantifier = TRUE;
 4553:         }
 4554: 
 4555:       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
 4556:       }
 4557: 
 4558:     /* If previous was a single negated character ([^a] or similar), we use
 4559:     one of the special opcodes, replacing it. The code is shared with single-
 4560:     character repeats by setting op_type to add a suitable offset into
 4561:     repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI
 4562:     are currently used only for single-byte chars. */
 4563: 
 4564:     else if (*previous == OP_NOT || *previous == OP_NOTI)
 4565:       {
 4566:       op_type = ((*previous == OP_NOT)? OP_NOTSTAR : OP_NOTSTARI) - OP_STAR;
 4567:       c = previous[1];
 4568:       if (!possessive_quantifier &&
 4569:           repeat_max < 0 &&
 4570:           check_auto_possessive(previous, utf8, ptr + 1, options, cd))
 4571:         {
 4572:         repeat_type = 0;    /* Force greedy */
 4573:         possessive_quantifier = TRUE;
 4574:         }
 4575:       goto OUTPUT_SINGLE_REPEAT;
 4576:       }
 4577: 
 4578:     /* If previous was a character type match (\d or similar), abolish it and
 4579:     create a suitable repeat item. The code is shared with single-character
 4580:     repeats by setting op_type to add a suitable offset into repeat_type. Note
 4581:     that the Unicode property types will be present only when SUPPORT_UCP is
 4582:     defined, but we don't wrap the little bits of code here because it just
 4583:     makes it horribly messy. */
 4584: 
 4585:     else if (*previous < OP_EODN)
 4586:       {
 4587:       uschar *oldcode;
 4588:       int prop_type, prop_value;
 4589:       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
 4590:       c = *previous;
 4591: 
 4592:       if (!possessive_quantifier &&
 4593:           repeat_max < 0 &&
 4594:           check_auto_possessive(previous, utf8, ptr + 1, options, cd))
 4595:         {
 4596:         repeat_type = 0;    /* Force greedy */
 4597:         possessive_quantifier = TRUE;
 4598:         }
 4599: 
 4600:       OUTPUT_SINGLE_REPEAT:
 4601:       if (*previous == OP_PROP || *previous == OP_NOTPROP)
 4602:         {
 4603:         prop_type = previous[1];
 4604:         prop_value = previous[2];
 4605:         }
 4606:       else prop_type = prop_value = -1;
 4607: 
 4608:       oldcode = code;
 4609:       code = previous;                  /* Usually overwrite previous item */
 4610: 
 4611:       /* If the maximum is zero then the minimum must also be zero; Perl allows
 4612:       this case, so we do too - by simply omitting the item altogether. */
 4613: 
 4614:       if (repeat_max == 0) goto END_REPEAT;
 4615: 
 4616:       /*--------------------------------------------------------------------*/
 4617:       /* This code is obsolete from release 8.00; the restriction was finally
 4618:       removed: */
 4619: 
 4620:       /* All real repeats make it impossible to handle partial matching (maybe
 4621:       one day we will be able to remove this restriction). */
 4622: 
 4623:       /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
 4624:       /*--------------------------------------------------------------------*/
 4625: 
 4626:       /* Combine the op_type with the repeat_type */
 4627: 
 4628:       repeat_type += op_type;
 4629: 
 4630:       /* A minimum of zero is handled either as the special case * or ?, or as
 4631:       an UPTO, with the maximum given. */
 4632: 
 4633:       if (repeat_min == 0)
 4634:         {
 4635:         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
 4636:           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
 4637:         else
 4638:           {
 4639:           *code++ = OP_UPTO + repeat_type;
 4640:           PUT2INC(code, 0, repeat_max);
 4641:           }
 4642:         }
 4643: 
 4644:       /* A repeat minimum of 1 is optimized into some special cases. If the
 4645:       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
 4646:       left in place and, if the maximum is greater than 1, we use OP_UPTO with
 4647:       one less than the maximum. */
 4648: 
 4649:       else if (repeat_min == 1)
 4650:         {
 4651:         if (repeat_max == -1)
 4652:           *code++ = OP_PLUS + repeat_type;
 4653:         else
 4654:           {
 4655:           code = oldcode;                 /* leave previous item in place */
 4656:           if (repeat_max == 1) goto END_REPEAT;
 4657:           *code++ = OP_UPTO + repeat_type;
 4658:           PUT2INC(code, 0, repeat_max - 1);
 4659:           }
 4660:         }
 4661: 
 4662:       /* The case {n,n} is just an EXACT, while the general case {n,m} is
 4663:       handled as an EXACT followed by an UPTO. */
 4664: 
 4665:       else
 4666:         {
 4667:         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
 4668:         PUT2INC(code, 0, repeat_min);
 4669: 
 4670:         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
 4671:         we have to insert the character for the previous code. For a repeated
 4672:         Unicode property match, there are two extra bytes that define the
 4673:         required property. In UTF-8 mode, long characters have their length in
 4674:         c, with the 0x80 bit as a flag. */
 4675: 
 4676:         if (repeat_max < 0)
 4677:           {
 4678: #ifdef SUPPORT_UTF8
 4679:           if (utf8 && c >= 128)
 4680:             {
 4681:             memcpy(code, utf8_char, c & 7);
 4682:             code += c & 7;
 4683:             }
 4684:           else
 4685: #endif
 4686:             {
 4687:             *code++ = c;
 4688:             if (prop_type >= 0)
 4689:               {
 4690:               *code++ = prop_type;
 4691:               *code++ = prop_value;
 4692:               }
 4693:             }
 4694:           *code++ = OP_STAR + repeat_type;
 4695:           }
 4696: 
 4697:         /* Else insert an UPTO if the max is greater than the min, again
 4698:         preceded by the character, for the previously inserted code. If the
 4699:         UPTO is just for 1 instance, we can use QUERY instead. */
 4700: 
 4701:         else if (repeat_max != repeat_min)
 4702:           {
 4703: #ifdef SUPPORT_UTF8
 4704:           if (utf8 && c >= 128)
 4705:             {
 4706:             memcpy(code, utf8_char, c & 7);
 4707:             code += c & 7;
 4708:             }
 4709:           else
 4710: #endif
 4711:           *code++ = c;
 4712:           if (prop_type >= 0)
 4713:             {
 4714:             *code++ = prop_type;
 4715:             *code++ = prop_value;
 4716:             }
 4717:           repeat_max -= repeat_min;
 4718: 
 4719:           if (repeat_max == 1)
 4720:             {
 4721:             *code++ = OP_QUERY + repeat_type;
 4722:             }
 4723:           else
 4724:             {
 4725:             *code++ = OP_UPTO + repeat_type;
 4726:             PUT2INC(code, 0, repeat_max);
 4727:             }
 4728:           }
 4729:         }
 4730: 
 4731:       /* The character or character type itself comes last in all cases. */
 4732: 
 4733: #ifdef SUPPORT_UTF8
 4734:       if (utf8 && c >= 128)
 4735:         {
 4736:         memcpy(code, utf8_char, c & 7);
 4737:         code += c & 7;
 4738:         }
 4739:       else
 4740: #endif
 4741:       *code++ = c;
 4742: 
 4743:       /* For a repeated Unicode property match, there are two extra bytes that
 4744:       define the required property. */
 4745: 
 4746: #ifdef SUPPORT_UCP
 4747:       if (prop_type >= 0)
 4748:         {
 4749:         *code++ = prop_type;
 4750:         *code++ = prop_value;
 4751:         }
 4752: #endif
 4753:       }
 4754: 
 4755:     /* If previous was a character class or a back reference, we put the repeat
 4756:     stuff after it, but just skip the item if the repeat was {0,0}. */
 4757: 
 4758:     else if (*previous == OP_CLASS ||
 4759:              *previous == OP_NCLASS ||
 4760: #ifdef SUPPORT_UTF8
 4761:              *previous == OP_XCLASS ||
 4762: #endif
 4763:              *previous == OP_REF ||
 4764:              *previous == OP_REFI)
 4765:       {
 4766:       if (repeat_max == 0)
 4767:         {
 4768:         code = previous;
 4769:         goto END_REPEAT;
 4770:         }
 4771: 
 4772:       /*--------------------------------------------------------------------*/
 4773:       /* This code is obsolete from release 8.00; the restriction was finally
 4774:       removed: */
 4775: 
 4776:       /* All real repeats make it impossible to handle partial matching (maybe
 4777:       one day we will be able to remove this restriction). */
 4778: 
 4779:       /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
 4780:       /*--------------------------------------------------------------------*/
 4781: 
 4782:       if (repeat_min == 0 && repeat_max == -1)
 4783:         *code++ = OP_CRSTAR + repeat_type;
 4784:       else if (repeat_min == 1 && repeat_max == -1)
 4785:         *code++ = OP_CRPLUS + repeat_type;
 4786:       else if (repeat_min == 0 && repeat_max == 1)
 4787:         *code++ = OP_CRQUERY + repeat_type;
 4788:       else
 4789:         {
 4790:         *code++ = OP_CRRANGE + repeat_type;
 4791:         PUT2INC(code, 0, repeat_min);
 4792:         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
 4793:         PUT2INC(code, 0, repeat_max);
 4794:         }
 4795:       }
 4796: 
 4797:     /* If previous was a bracket group, we may have to replicate it in certain
 4798:     cases. Note that at this point we can encounter only the "basic" bracket
 4799:     opcodes such as BRA and CBRA, as this is the place where they get converted
 4800:     into the more special varieties such as BRAPOS and SBRA. A test for >=
 4801:     OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
 4802:     ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
 4803:     repetition of assertions, but now it does, for Perl compatibility. */
 4804: 
 4805:     else if (*previous >= OP_ASSERT && *previous <= OP_COND)
 4806:       {
 4807:       register int i;
 4808:       int len = (int)(code - previous);
 4809:       uschar *bralink = NULL;
 4810:       uschar *brazeroptr = NULL;
 4811: 
 4812:       /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
 4813:       we just ignore the repeat. */
 4814: 
 4815:       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
 4816:         goto END_REPEAT;
 4817: 
 4818:       /* There is no sense in actually repeating assertions. The only potential
 4819:       use of repetition is in cases when the assertion is optional. Therefore,
 4820:       if the minimum is greater than zero, just ignore the repeat. If the
 4821:       maximum is not zero or one, set it to 1. */
 4822: 
 4823:       if (*previous < OP_ONCE)    /* Assertion */
 4824:         {
 4825:         if (repeat_min > 0) goto END_REPEAT;
 4826:         if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
 4827:         }
 4828: 
 4829:       /* The case of a zero minimum is special because of the need to stick
 4830:       OP_BRAZERO in front of it, and because the group appears once in the
 4831:       data, whereas in other cases it appears the minimum number of times. For
 4832:       this reason, it is simplest to treat this case separately, as otherwise
 4833:       the code gets far too messy. There are several special subcases when the
 4834:       minimum is zero. */
 4835: 
 4836:       if (repeat_min == 0)
 4837:         {
 4838:         /* If the maximum is also zero, we used to just omit the group from the
 4839:         output altogether, like this:
 4840: 
 4841:         ** if (repeat_max == 0)
 4842:         **   {
 4843:         **   code = previous;
 4844:         **   goto END_REPEAT;
 4845:         **   }
 4846: 
 4847:         However, that fails when a group or a subgroup within it is referenced
 4848:         as a subroutine from elsewhere in the pattern, so now we stick in
 4849:         OP_SKIPZERO in front of it so that it is skipped on execution. As we
 4850:         don't have a list of which groups are referenced, we cannot do this
 4851:         selectively.
 4852: 
 4853:         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
 4854:         and do no more at this point. However, we do need to adjust any
 4855:         OP_RECURSE calls inside the group that refer to the group itself or any
 4856:         internal or forward referenced group, because the offset is from the
 4857:         start of the whole regex. Temporarily terminate the pattern while doing
 4858:         this. */
 4859: 
 4860:         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
 4861:           {
 4862:           *code = OP_END;
 4863:           adjust_recurse(previous, 1, utf8, cd, save_hwm);
 4864:           memmove(previous+1, previous, len);
 4865:           code++;
 4866:           if (repeat_max == 0)
 4867:             {
 4868:             *previous++ = OP_SKIPZERO;
 4869:             goto END_REPEAT;
 4870:             }
 4871:           brazeroptr = previous;    /* Save for possessive optimizing */
 4872:           *previous++ = OP_BRAZERO + repeat_type;
 4873:           }
 4874: 
 4875:         /* If the maximum is greater than 1 and limited, we have to replicate
 4876:         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
 4877:         The first one has to be handled carefully because it's the original
 4878:         copy, which has to be moved up. The remainder can be handled by code
 4879:         that is common with the non-zero minimum case below. We have to
 4880:           adjust the value of repeat_max, since one less copy is required. Once
 4881:         again, we may have to adjust any OP_RECURSE calls inside the group. */
 4882: 
 4883:         else
 4884:           {
 4885:           int offset;
 4886:           *code = OP_END;
 4887:           adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
 4888:           memmove(previous + 2 + LINK_SIZE, previous, len);
 4889:           code += 2 + LINK_SIZE;
 4890:           *previous++ = OP_BRAZERO + repeat_type;
 4891:           *previous++ = OP_BRA;
 4892: 
 4893:           /* We chain together the bracket offset fields that have to be
 4894:           filled in later when the ends of the brackets are reached. */
 4895: 
 4896:           offset = (bralink == NULL)? 0 : (int)(previous - bralink);
 4897:           bralink = previous;
 4898:           PUTINC(previous, 0, offset);
 4899:           }
 4900: 
 4901:         repeat_max--;
 4902:         }
 4903: 
 4904:       /* If the minimum is greater than zero, replicate the group as many
 4905:       times as necessary, and adjust the maximum to the number of subsequent
 4906:       copies that we need. If we set a first char from the group, and didn't
 4907:       set a required char, copy the latter from the former. If there are any
 4908:       forward reference subroutine calls in the group, there will be entries on
 4909:       the workspace list; replicate these with an appropriate increment. */
 4910: 
 4911:       else
 4912:         {
 4913:         if (repeat_min > 1)
 4914:           {
 4915:           /* In the pre-compile phase, we don't actually do the replication. We
 4916:           just adjust the length as if we had. Do some paranoid checks for
 4917:           potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
 4918:           integer type when available, otherwise double. */
 4919: 
 4920:           if (lengthptr != NULL)
 4921:             {
 4922:             int delta = (repeat_min - 1)*length_prevgroup;
 4923:             if ((INT64_OR_DOUBLE)(repeat_min - 1)*
 4924:                   (INT64_OR_DOUBLE)length_prevgroup >
 4925:                     (INT64_OR_DOUBLE)INT_MAX ||
 4926:                 OFLOW_MAX - *lengthptr < delta)
 4927:               {
 4928:               *errorcodeptr = ERR20;
 4929:               goto FAILED;
 4930:               }
 4931:             *lengthptr += delta;
 4932:             }
 4933: 
 4934:           /* This is compiling for real. If there is a set first byte for
 4935:           the group, and we have not yet set a "required byte", set it. Make
 4936:           sure there is enough workspace for copying forward references before
 4937:           doing the copy. */
 4938: 
 4939:           else
 4940:             {
 4941:             if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
 4942: 
 4943:             for (i = 1; i < repeat_min; i++)
 4944:               {
 4945:               uschar *hc;
 4946:               uschar *this_hwm = cd->hwm;
 4947:               memcpy(code, previous, len);
 4948: 
 4949:               while (cd->hwm > cd->start_workspace + cd->workspace_size -
 4950:                      WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
 4951:                 {
 4952:                 int save_offset = save_hwm - cd->start_workspace;
 4953:                 int this_offset = this_hwm - cd->start_workspace;
 4954:                 *errorcodeptr = expand_workspace(cd);
 4955:                 if (*errorcodeptr != 0) goto FAILED;
 4956:                 save_hwm = (uschar *)cd->start_workspace + save_offset;
 4957:                 this_hwm = (uschar *)cd->start_workspace + this_offset;
 4958:                 }
 4959: 
 4960:               for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
 4961:                 {
 4962:                 PUT(cd->hwm, 0, GET(hc, 0) + len);
 4963:                 cd->hwm += LINK_SIZE;
 4964:                 }
 4965:               save_hwm = this_hwm;
 4966:               code += len;
 4967:               }
 4968:             }
 4969:           }
 4970: 
 4971:         if (repeat_max > 0) repeat_max -= repeat_min;
 4972:         }
 4973: 
 4974:       /* This code is common to both the zero and non-zero minimum cases. If
 4975:       the maximum is limited, it replicates the group in a nested fashion,
 4976:       remembering the bracket starts on a stack. In the case of a zero minimum,
 4977:       the first one was set up above. In all cases the repeat_max now specifies
 4978:       the number of additional copies needed. Again, we must remember to
 4979:       replicate entries on the forward reference list. */
 4980: 
 4981:       if (repeat_max >= 0)
 4982:         {
 4983:         /* In the pre-compile phase, we don't actually do the replication. We
 4984:         just adjust the length as if we had. For each repetition we must add 1
 4985:         to the length for BRAZERO and for all but the last repetition we must
 4986:         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
 4987:         paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
 4988:         a 64-bit integer type when available, otherwise double. */
 4989: 
 4990:         if (lengthptr != NULL && repeat_max > 0)
 4991:           {
 4992:           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
 4993:                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
 4994:           if ((INT64_OR_DOUBLE)repeat_max *
 4995:                 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
 4996:                   > (INT64_OR_DOUBLE)INT_MAX ||
 4997:               OFLOW_MAX - *lengthptr < delta)
 4998:             {
 4999:             *errorcodeptr = ERR20;
 5000:             goto FAILED;
 5001:             }
 5002:           *lengthptr += delta;
 5003:           }
 5004: 
 5005:         /* This is compiling for real */
 5006: 
 5007:         else for (i = repeat_max - 1; i >= 0; i--)
 5008:           {
 5009:           uschar *hc;
 5010:           uschar *this_hwm = cd->hwm;
 5011: 
 5012:           *code++ = OP_BRAZERO + repeat_type;
 5013: 
 5014:           /* All but the final copy start a new nesting, maintaining the
 5015:           chain of brackets outstanding. */
 5016: 
 5017:           if (i != 0)
 5018:             {
 5019:             int offset;
 5020:             *code++ = OP_BRA;
 5021:             offset = (bralink == NULL)? 0 : (int)(code - bralink);
 5022:             bralink = code;
 5023:             PUTINC(code, 0, offset);
 5024:             }
 5025: 
 5026:           memcpy(code, previous, len);
 5027: 
 5028:           /* Ensure there is enough workspace for forward references before
 5029:           copying them. */
 5030: 
 5031:           while (cd->hwm > cd->start_workspace + cd->workspace_size -
 5032:                  WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
 5033:             {
 5034:             int save_offset = save_hwm - cd->start_workspace;
 5035:             int this_offset = this_hwm - cd->start_workspace;
 5036:             *errorcodeptr = expand_workspace(cd);
 5037:             if (*errorcodeptr != 0) goto FAILED;
 5038:             save_hwm = (uschar *)cd->start_workspace + save_offset;
 5039:             this_hwm = (uschar *)cd->start_workspace + this_offset;
 5040:             }
 5041: 
 5042:           for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
 5043:             {
 5044:             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
 5045:             cd->hwm += LINK_SIZE;
 5046:             }
 5047:           save_hwm = this_hwm;
 5048:           code += len;
 5049:           }
 5050: 
 5051:         /* Now chain through the pending brackets, and fill in their length
 5052:         fields (which are holding the chain links pro tem). */
 5053: 
 5054:         while (bralink != NULL)
 5055:           {
 5056:           int oldlinkoffset;
 5057:           int offset = (int)(code - bralink + 1);
 5058:           uschar *bra = code - offset;
 5059:           oldlinkoffset = GET(bra, 1);
 5060:           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
 5061:           *code++ = OP_KET;
 5062:           PUTINC(code, 0, offset);
 5063:           PUT(bra, 1, offset);
 5064:           }
 5065:         }
 5066: 
 5067:       /* If the maximum is unlimited, set a repeater in the final copy. For
 5068:       ONCE brackets, that's all we need to do. However, possessively repeated
 5069:       ONCE brackets can be converted into non-capturing brackets, as the
 5070:       behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
 5071:       deal with possessive ONCEs specially.
 5072: 
 5073:       Otherwise, when we are doing the actual compile phase, check to see
 5074:       whether this group is one that could match an empty string. If so,
 5075:       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
 5076:       that runtime checking can be done. [This check is also applied to ONCE
 5077:       groups at runtime, but in a different way.]
 5078: 
 5079:       Then, if the quantifier was possessive and the bracket is not a
 5080:       conditional, we convert the BRA code to the POS form, and the KET code to
 5081:       KETRPOS. (It turns out to be convenient at runtime to detect this kind of
 5082:       subpattern at both the start and at the end.) The use of special opcodes
 5083:       makes it possible to reduce greatly the stack usage in pcre_exec(). If
 5084:       the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
 5085: 
 5086:       Then, if the minimum number of matches is 1 or 0, cancel the possessive
 5087:       flag so that the default action below, of wrapping everything inside
 5088:       atomic brackets, does not happen. When the minimum is greater than 1,
 5089:       there will be earlier copies of the group, and so we still have to wrap
 5090:       the whole thing. */
 5091: 
 5092:       else
 5093:         {
 5094:         uschar *ketcode = code - 1 - LINK_SIZE;
 5095:         uschar *bracode = ketcode - GET(ketcode, 1);
 5096: 
 5097:         /* Convert possessive ONCE brackets to non-capturing */
 5098: 
 5099:         if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
 5100:             possessive_quantifier) *bracode = OP_BRA;
 5101: 
 5102:         /* For non-possessive ONCE brackets, all we need to do is to
 5103:         set the KET. */
 5104: 
 5105:         if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
 5106:           *ketcode = OP_KETRMAX + repeat_type;
 5107: 
 5108:         /* Handle non-ONCE brackets and possessive ONCEs (which have been
 5109:         converted to non-capturing above). */
 5110: 
 5111:         else
 5112:           {
 5113:           /* In the compile phase, check for empty string matching. */
 5114: 
 5115:           if (lengthptr == NULL)
 5116:             {
 5117:             uschar *scode = bracode;
 5118:             do
 5119:               {
 5120:               if (could_be_empty_branch(scode, ketcode, utf8, cd))
 5121:                 {
 5122:                 *bracode += OP_SBRA - OP_BRA;
 5123:                 break;
 5124:                 }
 5125:               scode += GET(scode, 1);
 5126:               }
 5127:             while (*scode == OP_ALT);
 5128:             }
 5129: 
 5130:           /* Handle possessive quantifiers. */
 5131: 
 5132:           if (possessive_quantifier)
 5133:             {
 5134:             /* For COND brackets, we wrap the whole thing in a possessively
 5135:             repeated non-capturing bracket, because we have not invented POS
 5136:             versions of the COND opcodes. Because we are moving code along, we
 5137:             must ensure that any pending recursive references are updated. */
 5138: 
 5139:             if (*bracode == OP_COND || *bracode == OP_SCOND)
 5140:               {
 5141:               int nlen = (int)(code - bracode);
 5142:               *code = OP_END;
 5143:               adjust_recurse(bracode, 1 + LINK_SIZE, utf8, cd, save_hwm);
 5144:               memmove(bracode + 1+LINK_SIZE, bracode, nlen);
 5145:               code += 1 + LINK_SIZE;
 5146:               nlen += 1 + LINK_SIZE;
 5147:               *bracode = OP_BRAPOS;
 5148:               *code++ = OP_KETRPOS;
 5149:               PUTINC(code, 0, nlen);
 5150:               PUT(bracode, 1, nlen);
 5151:               }
 5152: 
 5153:             /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
 5154: 
 5155:             else
 5156:               {
 5157:               *bracode += 1;              /* Switch to xxxPOS opcodes */
 5158:               *ketcode = OP_KETRPOS;
 5159:               }
 5160: 
 5161:             /* If the minimum is zero, mark it as possessive, then unset the
 5162:             possessive flag when the minimum is 0 or 1. */
 5163: 
 5164:             if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
 5165:             if (repeat_min < 2) possessive_quantifier = FALSE;
 5166:             }
 5167: 
 5168:           /* Non-possessive quantifier */
 5169: 
 5170:           else *ketcode = OP_KETRMAX + repeat_type;
 5171:           }
 5172:         }
 5173:       }
 5174: 
 5175:     /* If previous is OP_FAIL, it was generated by an empty class [] in
 5176:     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
 5177:     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
 5178:     error above. We can just ignore the repeat in the JS case. */
 5179: 
 5180:     else if (*previous == OP_FAIL) goto END_REPEAT;
 5181: 
 5182:     /* Else there's some kind of shambles */
 5183: 
 5184:     else
 5185:       {
 5186:       *errorcodeptr = ERR11;
 5187:       goto FAILED;
 5188:       }
 5189: 
 5190:     /* If the character following a repeat is '+', or if certain optimization
 5191:     tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
 5192:     there are special alternative opcodes for this case. For anything else, we
 5193:     wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
 5194:     notation is just syntactic sugar, taken from Sun's Java package, but the
 5195:     special opcodes can optimize it.
 5196: 
 5197:     Some (but not all) possessively repeated subpatterns have already been
 5198:     completely handled in the code just above. For them, possessive_quantifier
 5199:     is always FALSE at this stage.
 5200: 
 5201:     Note that the repeated item starts at tempcode, not at previous, which
 5202:     might be the first part of a string whose (former) last char we repeated.
 5203: 
 5204:     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
 5205:     an 'upto' may follow. We skip over an 'exact' item, and then test the
 5206:     length of what remains before proceeding. */
 5207: 
 5208:     if (possessive_quantifier)
 5209:       {
 5210:       int len;
 5211: 
 5212:       if (*tempcode == OP_TYPEEXACT)
 5213:         tempcode += _pcre_OP_lengths[*tempcode] +
 5214:           ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
 5215: 
 5216:       else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
 5217:         {
 5218:         tempcode += _pcre_OP_lengths[*tempcode];
 5219: #ifdef SUPPORT_UTF8
 5220:         if (utf8 && tempcode[-1] >= 0xc0)
 5221:           tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
 5222: #endif
 5223:         }
 5224: 
 5225:       len = (int)(code - tempcode);
 5226:       if (len > 0) switch (*tempcode)
 5227:         {
 5228:         case OP_STAR:  *tempcode = OP_POSSTAR; break;
 5229:         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
 5230:         case OP_QUERY: *tempcode = OP_POSQUERY; break;
 5231:         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
 5232: 
 5233:         case OP_STARI:  *tempcode = OP_POSSTARI; break;
 5234:         case OP_PLUSI:  *tempcode = OP_POSPLUSI; break;
 5235:         case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
 5236:         case OP_UPTOI:  *tempcode = OP_POSUPTOI; break;
 5237: 
 5238:         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
 5239:         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
 5240:         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
 5241:         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
 5242: 
 5243:         case OP_NOTSTARI:  *tempcode = OP_NOTPOSSTARI; break;
 5244:         case OP_NOTPLUSI:  *tempcode = OP_NOTPOSPLUSI; break;
 5245:         case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
 5246:         case OP_NOTUPTOI:  *tempcode = OP_NOTPOSUPTOI; break;
 5247: 
 5248:         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
 5249:         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
 5250:         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
 5251:         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
 5252: 
 5253:         /* Because we are moving code along, we must ensure that any
 5254:         pending recursive references are updated. */
 5255: 
 5256:         default:
 5257:         *code = OP_END;
 5258:         adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);
 5259:         memmove(tempcode + 1+LINK_SIZE, tempcode, len);
 5260:         code += 1 + LINK_SIZE;
 5261:         len += 1 + LINK_SIZE;
 5262:         tempcode[0] = OP_ONCE;
 5263:         *code++ = OP_KET;
 5264:         PUTINC(code, 0, len);
 5265:         PUT(tempcode, 1, len);
 5266:         break;
 5267:         }
 5268:       }
 5269: 
 5270:     /* In all cases we no longer have a previous item. We also set the
 5271:     "follows varying string" flag for subsequently encountered reqbytes if
 5272:     it isn't already set and we have just passed a varying length item. */
 5273: 
 5274:     END_REPEAT:
 5275:     previous = NULL;
 5276:     cd->req_varyopt |= reqvary;
 5277:     break;
 5278: 
 5279: 
 5280:     /* ===================================================================*/
 5281:     /* Start of nested parenthesized sub-expression, or comment or lookahead or
 5282:     lookbehind or option setting or condition or all the other extended
 5283:     parenthesis forms.  */
 5284: 
 5285:     case CHAR_LEFT_PARENTHESIS:
 5286:     newoptions = options;
 5287:     skipbytes = 0;
 5288:     bravalue = OP_CBRA;
 5289:     save_hwm = cd->hwm;
 5290:     reset_bracount = FALSE;
 5291: 
 5292:     /* First deal with various "verbs" that can be introduced by '*'. */
 5293: 
 5294:     if (*(++ptr) == CHAR_ASTERISK &&
 5295:          ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':'))
 5296:       {
 5297:       int i, namelen;
 5298:       int arglen = 0;
 5299:       const char *vn = verbnames;
 5300:       const uschar *name = ptr + 1;
 5301:       const uschar *arg = NULL;
 5302:       previous = NULL;
 5303:       while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
 5304:       namelen = (int)(ptr - name);
 5305: 
 5306:       /* It appears that Perl allows any characters whatsoever, other than
 5307:       a closing parenthesis, to appear in arguments, so we no longer insist on
 5308:       letters, digits, and underscores. */
 5309: 
 5310:       if (*ptr == CHAR_COLON)
 5311:         {
 5312:         arg = ++ptr;
 5313:         while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
 5314:         arglen = (int)(ptr - arg);
 5315:         }
 5316: 
 5317:       if (*ptr != CHAR_RIGHT_PARENTHESIS)
 5318:         {
 5319:         *errorcodeptr = ERR60;
 5320:         goto FAILED;
 5321:         }
 5322: 
 5323:       /* Scan the table of verb names */
 5324: 
 5325:       for (i = 0; i < verbcount; i++)
 5326:         {
 5327:         if (namelen == verbs[i].len &&
 5328:             strncmp((char *)name, vn, namelen) == 0)
 5329:           {
 5330:           /* Check for open captures before ACCEPT and convert it to
 5331:           ASSERT_ACCEPT if in an assertion. */
 5332: 
 5333:           if (verbs[i].op == OP_ACCEPT)
 5334:             {
 5335:             open_capitem *oc;
 5336:             if (arglen != 0)
 5337:               {
 5338:               *errorcodeptr = ERR59;
 5339:               goto FAILED;
 5340:               }
 5341:             cd->had_accept = TRUE;
 5342:             for (oc = cd->open_caps; oc != NULL; oc = oc->next)
 5343:               {
 5344:               *code++ = OP_CLOSE;
 5345:               PUT2INC(code, 0, oc->number);
 5346:               }
 5347:             *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
 5348: 
 5349:             /* Do not set firstbyte after *ACCEPT */
 5350:             if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
 5351:             }
 5352: 
 5353:           /* Handle other cases with/without an argument */
 5354: 
 5355:           else if (arglen == 0)
 5356:             {
 5357:             if (verbs[i].op < 0)   /* Argument is mandatory */
 5358:               {
 5359:               *errorcodeptr = ERR66;
 5360:               goto FAILED;
 5361:               }
 5362:             *code = verbs[i].op;
 5363:             if (*code++ == OP_THEN) cd->external_flags |= PCRE_HASTHEN;
 5364:             }
 5365: 
 5366:           else
 5367:             {
 5368:             if (verbs[i].op_arg < 0)   /* Argument is forbidden */
 5369:               {
 5370:               *errorcodeptr = ERR59;
 5371:               goto FAILED;
 5372:               }
 5373:             *code = verbs[i].op_arg;
 5374:             if (*code++ == OP_THEN_ARG) cd->external_flags |= PCRE_HASTHEN;
 5375:             *code++ = arglen;
 5376:             memcpy(code, arg, arglen);
 5377:             code += arglen;
 5378:             *code++ = 0;
 5379:             }
 5380: 
 5381:           break;  /* Found verb, exit loop */
 5382:           }
 5383: 
 5384:         vn += verbs[i].len + 1;
 5385:         }
 5386: 
 5387:       if (i < verbcount) continue;    /* Successfully handled a verb */
 5388:       *errorcodeptr = ERR60;          /* Verb not recognized */
 5389:       goto FAILED;
 5390:       }
 5391: 
 5392:     /* Deal with the extended parentheses; all are introduced by '?', and the
 5393:     appearance of any of them means that this is not a capturing group. */
 5394: 
 5395:     else if (*ptr == CHAR_QUESTION_MARK)
 5396:       {
 5397:       int i, set, unset, namelen;
 5398:       int *optset;
 5399:       const uschar *name;
 5400:       uschar *slot;
 5401: 
 5402:       switch (*(++ptr))
 5403:         {
 5404:         case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
 5405:         ptr++;
 5406:         while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
 5407:         if (*ptr == 0)
 5408:           {
 5409:           *errorcodeptr = ERR18;
 5410:           goto FAILED;
 5411:           }
 5412:         continue;
 5413: 
 5414: 
 5415:         /* ------------------------------------------------------------ */
 5416:         case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
 5417:         reset_bracount = TRUE;
 5418:         /* Fall through */
 5419: 
 5420:         /* ------------------------------------------------------------ */
 5421:         case CHAR_COLON:          /* Non-capturing bracket */
 5422:         bravalue = OP_BRA;
 5423:         ptr++;
 5424:         break;
 5425: 
 5426: 
 5427:         /* ------------------------------------------------------------ */
 5428:         case CHAR_LEFT_PARENTHESIS:
 5429:         bravalue = OP_COND;       /* Conditional group */
 5430: 
 5431:         /* A condition can be an assertion, a number (referring to a numbered
 5432:         group), a name (referring to a named group), or 'R', referring to
 5433:         recursion. R<digits> and R&name are also permitted for recursion tests.
 5434: 
 5435:         There are several syntaxes for testing a named group: (?(name)) is used
 5436:         by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
 5437: 
 5438:         There are two unfortunate ambiguities, caused by history. (a) 'R' can
 5439:         be the recursive thing or the name 'R' (and similarly for 'R' followed
 5440:         by digits), and (b) a number could be a name that consists of digits.
 5441:         In both cases, we look for a name first; if not found, we try the other
 5442:         cases. */
 5443: 
 5444:         /* For conditions that are assertions, check the syntax, and then exit
 5445:         the switch. This will take control down to where bracketed groups,
 5446:         including assertions, are processed. */
 5447: 
 5448:         if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
 5449:             ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
 5450:           break;
 5451: 
 5452:         /* Most other conditions use OP_CREF (a couple change to OP_RREF
 5453:         below), and all need to skip 3 bytes at the start of the group. */
 5454: 
 5455:         code[1+LINK_SIZE] = OP_CREF;
 5456:         skipbytes = 3;
 5457:         refsign = -1;
 5458: 
 5459:         /* Check for a test for recursion in a named group. */
 5460: 
 5461:         if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
 5462:           {
 5463:           terminator = -1;
 5464:           ptr += 2;
 5465:           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
 5466:           }
 5467: 
 5468:         /* Check for a test for a named group's having been set, using the Perl
 5469:         syntax (?(<name>) or (?('name') */
 5470: 
 5471:         else if (ptr[1] == CHAR_LESS_THAN_SIGN)
 5472:           {
 5473:           terminator = CHAR_GREATER_THAN_SIGN;
 5474:           ptr++;
 5475:           }
 5476:         else if (ptr[1] == CHAR_APOSTROPHE)
 5477:           {
 5478:           terminator = CHAR_APOSTROPHE;
 5479:           ptr++;
 5480:           }
 5481:         else
 5482:           {
 5483:           terminator = 0;
 5484:           if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
 5485:           }
 5486: 
 5487:         /* We now expect to read a name; anything else is an error */
 5488: 
 5489:         if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
 5490:           {
 5491:           ptr += 1;  /* To get the right offset */
 5492:           *errorcodeptr = ERR28;
 5493:           goto FAILED;
 5494:           }
 5495: 
 5496:         /* Read the name, but also get it as a number if it's all digits */
 5497: 
 5498:         recno = 0;
 5499:         name = ++ptr;
 5500:         while ((cd->ctypes[*ptr] & ctype_word) != 0)
 5501:           {
 5502:           if (recno >= 0)
 5503:             recno = ((digitab[*ptr] & ctype_digit) != 0)?
 5504:               recno * 10 + *ptr - CHAR_0 : -1;
 5505:           ptr++;
 5506:           }
 5507:         namelen = (int)(ptr - name);
 5508: 
 5509:         if ((terminator > 0 && *ptr++ != terminator) ||
 5510:             *ptr++ != CHAR_RIGHT_PARENTHESIS)
 5511:           {
 5512:           ptr--;      /* Error offset */
 5513:           *errorcodeptr = ERR26;
 5514:           goto FAILED;
 5515:           }
 5516: 
 5517:         /* Do no further checking in the pre-compile phase. */
 5518: 
 5519:         if (lengthptr != NULL) break;
 5520: 
 5521:         /* In the real compile we do the work of looking for the actual
 5522:         reference. If the string started with "+" or "-" we require the rest to
 5523:         be digits, in which case recno will be set. */
 5524: 
 5525:         if (refsign > 0)
 5526:           {
 5527:           if (recno <= 0)
 5528:             {
 5529:             *errorcodeptr = ERR58;
 5530:             goto FAILED;
 5531:             }
 5532:           recno = (refsign == CHAR_MINUS)?
 5533:             cd->bracount - recno + 1 : recno +cd->bracount;
 5534:           if (recno <= 0 || recno > cd->final_bracount)
 5535:             {
 5536:             *errorcodeptr = ERR15;
 5537:             goto FAILED;
 5538:             }
 5539:           PUT2(code, 2+LINK_SIZE, recno);
 5540:           break;
 5541:           }
 5542: 
 5543:         /* Otherwise (did not start with "+" or "-"), start by looking for the
 5544:         name. If we find a name, add one to the opcode to change OP_CREF or
 5545:         OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
 5546:         except they record that the reference was originally to a name. The
 5547:         information is used to check duplicate names. */
 5548: 
 5549:         slot = cd->name_table;
 5550:         for (i = 0; i < cd->names_found; i++)
 5551:           {
 5552:           if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
 5553:           slot += cd->name_entry_size;
 5554:           }
 5555: 
 5556:         /* Found a previous named subpattern */
 5557: 
 5558:         if (i < cd->names_found)
 5559:           {
 5560:           recno = GET2(slot, 0);
 5561:           PUT2(code, 2+LINK_SIZE, recno);
 5562:           code[1+LINK_SIZE]++;
 5563:           }
 5564: 
 5565:         /* Search the pattern for a forward reference */
 5566: 
 5567:         else if ((i = find_parens(cd, name, namelen,
 5568:                         (options & PCRE_EXTENDED) != 0, utf8)) > 0)
 5569:           {
 5570:           PUT2(code, 2+LINK_SIZE, i);
 5571:           code[1+LINK_SIZE]++;
 5572:           }
 5573: 
 5574:         /* If terminator == 0 it means that the name followed directly after
 5575:         the opening parenthesis [e.g. (?(abc)...] and in this case there are
 5576:         some further alternatives to try. For the cases where terminator != 0
 5577:         [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
 5578:         now checked all the possibilities, so give an error. */
 5579: 
 5580:         else if (terminator != 0)
 5581:           {
 5582:           *errorcodeptr = ERR15;
 5583:           goto FAILED;
 5584:           }
 5585: 
 5586:         /* Check for (?(R) for recursion. Allow digits after R to specify a
 5587:         specific group number. */
 5588: 
 5589:         else if (*name == CHAR_R)
 5590:           {
 5591:           recno = 0;
 5592:           for (i = 1; i < namelen; i++)
 5593:             {
 5594:             if ((digitab[name[i]] & ctype_digit) == 0)
 5595:               {
 5596:               *errorcodeptr = ERR15;
 5597:               goto FAILED;
 5598:               }
 5599:             recno = recno * 10 + name[i] - CHAR_0;
 5600:             }
 5601:           if (recno == 0) recno = RREF_ANY;
 5602:           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
 5603:           PUT2(code, 2+LINK_SIZE, recno);
 5604:           }
 5605: 
 5606:         /* Similarly, check for the (?(DEFINE) "condition", which is always
 5607:         false. */
 5608: 
 5609:         else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
 5610:           {
 5611:           code[1+LINK_SIZE] = OP_DEF;
 5612:           skipbytes = 1;
 5613:           }
 5614: 
 5615:         /* Check for the "name" actually being a subpattern number. We are
 5616:         in the second pass here, so final_bracount is set. */
 5617: 
 5618:         else if (recno > 0 && recno <= cd->final_bracount)
 5619:           {
 5620:           PUT2(code, 2+LINK_SIZE, recno);
 5621:           }
 5622: 
 5623:         /* Either an unidentified subpattern, or a reference to (?(0) */
 5624: 
 5625:         else
 5626:           {
 5627:           *errorcodeptr = (recno == 0)? ERR35: ERR15;
 5628:           goto FAILED;
 5629:           }
 5630:         break;
 5631: 
 5632: 
 5633:         /* ------------------------------------------------------------ */
 5634:         case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
 5635:         bravalue = OP_ASSERT;
 5636:         cd->assert_depth += 1;
 5637:         ptr++;
 5638:         break;
 5639: 
 5640: 
 5641:         /* ------------------------------------------------------------ */
 5642:         case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
 5643:         ptr++;
 5644:         if (*ptr == CHAR_RIGHT_PARENTHESIS)    /* Optimize (?!) */
 5645:           {
 5646:           *code++ = OP_FAIL;
 5647:           previous = NULL;
 5648:           continue;
 5649:           }
 5650:         bravalue = OP_ASSERT_NOT;
 5651:         cd->assert_depth += 1;
 5652:         break;
 5653: 
 5654: 
 5655:         /* ------------------------------------------------------------ */
 5656:         case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
 5657:         switch (ptr[1])
 5658:           {
 5659:           case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
 5660:           bravalue = OP_ASSERTBACK;
 5661:           cd->assert_depth += 1;
 5662:           ptr += 2;
 5663:           break;
 5664: 
 5665:           case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
 5666:           bravalue = OP_ASSERTBACK_NOT;
 5667:           cd->assert_depth += 1;
 5668:           ptr += 2;
 5669:           break;
 5670: 
 5671:           default:                /* Could be name define, else bad */
 5672:           if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
 5673:           ptr++;                  /* Correct offset for error */
 5674:           *errorcodeptr = ERR24;
 5675:           goto FAILED;
 5676:           }
 5677:         break;
 5678: 
 5679: 
 5680:         /* ------------------------------------------------------------ */
 5681:         case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
 5682:         bravalue = OP_ONCE;
 5683:         ptr++;
 5684:         break;
 5685: 
 5686: 
 5687:         /* ------------------------------------------------------------ */
 5688:         case CHAR_C:                 /* Callout - may be followed by digits; */
 5689:         previous_callout = code;     /* Save for later completion */
 5690:         after_manual_callout = 1;    /* Skip one item before completing */
 5691:         *code++ = OP_CALLOUT;
 5692:           {
 5693:           int n = 0;
 5694:           while ((digitab[*(++ptr)] & ctype_digit) != 0)
 5695:             n = n * 10 + *ptr - CHAR_0;
 5696:           if (*ptr != CHAR_RIGHT_PARENTHESIS)
 5697:             {
 5698:             *errorcodeptr = ERR39;
 5699:             goto FAILED;
 5700:             }
 5701:           if (n > 255)
 5702:             {
 5703:             *errorcodeptr = ERR38;
 5704:             goto FAILED;
 5705:             }
 5706:           *code++ = n;
 5707:           PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
 5708:           PUT(code, LINK_SIZE, 0);                          /* Default length */
 5709:           code += 2 * LINK_SIZE;
 5710:           }
 5711:         previous = NULL;
 5712:         continue;
 5713: 
 5714: 
 5715:         /* ------------------------------------------------------------ */
 5716:         case CHAR_P:              /* Python-style named subpattern handling */
 5717:         if (*(++ptr) == CHAR_EQUALS_SIGN ||
 5718:             *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
 5719:           {
 5720:           is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
 5721:           terminator = CHAR_RIGHT_PARENTHESIS;
 5722:           goto NAMED_REF_OR_RECURSE;
 5723:           }
 5724:         else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
 5725:           {
 5726:           *errorcodeptr = ERR41;
 5727:           goto FAILED;
 5728:           }
 5729:         /* Fall through to handle (?P< as (?< is handled */
 5730: 
 5731: 
 5732:         /* ------------------------------------------------------------ */
 5733:         DEFINE_NAME:    /* Come here from (?< handling */
 5734:         case CHAR_APOSTROPHE:
 5735:           {
 5736:           terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
 5737:             CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
 5738:           name = ++ptr;
 5739: 
 5740:           while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
 5741:           namelen = (int)(ptr - name);
 5742: 
 5743:           /* In the pre-compile phase, just do a syntax check. */
 5744: 
 5745:           if (lengthptr != NULL)
 5746:             {
 5747:             if (*ptr != terminator)
 5748:               {
 5749:               *errorcodeptr = ERR42;
 5750:               goto FAILED;
 5751:               }
 5752:             if (cd->names_found >= MAX_NAME_COUNT)
 5753:               {
 5754:               *errorcodeptr = ERR49;
 5755:               goto FAILED;
 5756:               }
 5757:             if (namelen + 3 > cd->name_entry_size)
 5758:               {
 5759:               cd->name_entry_size = namelen + 3;
 5760:               if (namelen > MAX_NAME_SIZE)
 5761:                 {
 5762:                 *errorcodeptr = ERR48;
 5763:                 goto FAILED;
 5764:                 }
 5765:               }
 5766:             }
 5767: 
 5768:           /* In the real compile, create the entry in the table, maintaining
 5769:           alphabetical order. Duplicate names for different numbers are
 5770:           permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
 5771:           number are always OK. (An existing number can be re-used if (?|
 5772:           appears in the pattern.) In either event, a duplicate name results in
 5773:           a duplicate entry in the table, even if the number is the same. This
 5774:           is because the number of names, and hence the table size, is computed
 5775:           in the pre-compile, and it affects various numbers and pointers which
 5776:           would all have to be modified, and the compiled code moved down, if
 5777:           duplicates with the same number were omitted from the table. This
 5778:           doesn't seem worth the hassle. However, *different* names for the
 5779:           same number are not permitted. */
 5780: 
 5781:           else
 5782:             {
 5783:             BOOL dupname = FALSE;
 5784:             slot = cd->name_table;
 5785: 
 5786:             for (i = 0; i < cd->names_found; i++)
 5787:               {
 5788:               int crc = memcmp(name, slot+2, namelen);
 5789:               if (crc == 0)
 5790:                 {
 5791:                 if (slot[2+namelen] == 0)
 5792:                   {
 5793:                   if (GET2(slot, 0) != cd->bracount + 1 &&
 5794:                       (options & PCRE_DUPNAMES) == 0)
 5795:                     {
 5796:                     *errorcodeptr = ERR43;
 5797:                     goto FAILED;
 5798:                     }
 5799:                   else dupname = TRUE;
 5800:                   }
 5801:                 else crc = -1;      /* Current name is a substring */
 5802:                 }
 5803: 
 5804:               /* Make space in the table and break the loop for an earlier
 5805:               name. For a duplicate or later name, carry on. We do this for
 5806:               duplicates so that in the simple case (when (?| is not used) they
 5807:               are in order of their numbers. */
 5808: 
 5809:               if (crc < 0)
 5810:                 {
 5811:                 memmove(slot + cd->name_entry_size, slot,
 5812:                   (cd->names_found - i) * cd->name_entry_size);
 5813:                 break;
 5814:                 }
 5815: 
 5816:               /* Continue the loop for a later or duplicate name */
 5817: 
 5818:               slot += cd->name_entry_size;
 5819:               }
 5820: 
 5821:             /* For non-duplicate names, check for a duplicate number before
 5822:             adding the new name. */
 5823: 
 5824:             if (!dupname)
 5825:               {
 5826:               uschar *cslot = cd->name_table;
 5827:               for (i = 0; i < cd->names_found; i++)
 5828:                 {
 5829:                 if (cslot != slot)
 5830:                   {
 5831:                   if (GET2(cslot, 0) == cd->bracount + 1)
 5832:                     {
 5833:                     *errorcodeptr = ERR65;
 5834:                     goto FAILED;
 5835:                     }
 5836:                   }
 5837:                 else i--;
 5838:                 cslot += cd->name_entry_size;
 5839:                 }
 5840:               }
 5841: 
 5842:             PUT2(slot, 0, cd->bracount + 1);
 5843:             memcpy(slot + 2, name, namelen);
 5844:             slot[2+namelen] = 0;
 5845:             }
 5846:           }
 5847: 
 5848:         /* In both pre-compile and compile, count the number of names we've
 5849:         encountered. */
 5850: 
 5851:         cd->names_found++;
 5852:         ptr++;                    /* Move past > or ' */
 5853:         goto NUMBERED_GROUP;
 5854: 
 5855: 
 5856:         /* ------------------------------------------------------------ */
 5857:         case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
 5858:         terminator = CHAR_RIGHT_PARENTHESIS;
 5859:         is_recurse = TRUE;
 5860:         /* Fall through */
 5861: 
 5862:         /* We come here from the Python syntax above that handles both
 5863:         references (?P=name) and recursion (?P>name), as well as falling
 5864:         through from the Perl recursion syntax (?&name). We also come here from
 5865:         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
 5866:         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
 5867: 
 5868:         NAMED_REF_OR_RECURSE:
 5869:         name = ++ptr;
 5870:         while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
 5871:         namelen = (int)(ptr - name);
 5872: 
 5873:         /* In the pre-compile phase, do a syntax check. We used to just set
 5874:         a dummy reference number, because it was not used in the first pass.
 5875:         However, with the change of recursive back references to be atomic,
 5876:         we have to look for the number so that this state can be identified, as
 5877:         otherwise the incorrect length is computed. If it's not a backwards
 5878:         reference, the dummy number will do. */
 5879: 
 5880:         if (lengthptr != NULL)
 5881:           {
 5882:           const uschar *temp;
 5883: 
 5884:           if (namelen == 0)
 5885:             {
 5886:             *errorcodeptr = ERR62;
 5887:             goto FAILED;
 5888:             }
 5889:           if (*ptr != terminator)
 5890:             {
 5891:             *errorcodeptr = ERR42;
 5892:             goto FAILED;
 5893:             }
 5894:           if (namelen > MAX_NAME_SIZE)
 5895:             {
 5896:             *errorcodeptr = ERR48;
 5897:             goto FAILED;
 5898:             }
 5899: 
 5900:           /* The name table does not exist in the first pass, so we cannot
 5901:           do a simple search as in the code below. Instead, we have to scan the
 5902:           pattern to find the number. It is important that we scan it only as
 5903:           far as we have got because the syntax of named subpatterns has not
 5904:           been checked for the rest of the pattern, and find_parens() assumes
 5905:           correct syntax. In any case, it's a waste of resources to scan
 5906:           further. We stop the scan at the current point by temporarily
 5907:           adjusting the value of cd->end_pattern. */
 5908: 
 5909:           temp = cd->end_pattern;
 5910:           cd->end_pattern = ptr;
 5911:           recno = find_parens(cd, name, namelen,
 5912:             (options & PCRE_EXTENDED) != 0, utf8);
 5913:           cd->end_pattern = temp;
 5914:           if (recno < 0) recno = 0;    /* Forward ref; set dummy number */
 5915:           }
 5916: 
 5917:         /* In the real compile, seek the name in the table. We check the name
 5918:         first, and then check that we have reached the end of the name in the
 5919:         table. That way, if the name is longer than any in the table,
 5920:         the comparison will fail without reading beyond the table entry. */
 5921: 
 5922:         else
 5923:           {
 5924:           slot = cd->name_table;
 5925:           for (i = 0; i < cd->names_found; i++)
 5926:             {
 5927:             if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
 5928:                 slot[2+namelen] == 0)
 5929:               break;
 5930:             slot += cd->name_entry_size;
 5931:             }
 5932: 
 5933:           if (i < cd->names_found)         /* Back reference */
 5934:             {
 5935:             recno = GET2(slot, 0);
 5936:             }
 5937:           else if ((recno =                /* Forward back reference */
 5938:                     find_parens(cd, name, namelen,
 5939:                       (options & PCRE_EXTENDED) != 0, utf8)) <= 0)
 5940:             {
 5941:             *errorcodeptr = ERR15;
 5942:             goto FAILED;
 5943:             }
 5944:           }
 5945: 
 5946:         /* In both phases, we can now go to the code that handles numerical
 5947:         recursion or backreferences. */
 5948: 
 5949:         if (is_recurse) goto HANDLE_RECURSION;
 5950:           else goto HANDLE_REFERENCE;
 5951: 
 5952: 
 5953:         /* ------------------------------------------------------------ */
 5954:         case CHAR_R:              /* Recursion */
 5955:         ptr++;                    /* Same as (?0)      */
 5956:         /* Fall through */
 5957: 
 5958: 
 5959:         /* ------------------------------------------------------------ */
 5960:         case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
 5961:         case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
 5962:         case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
 5963:           {
 5964:           const uschar *called;
 5965:           terminator = CHAR_RIGHT_PARENTHESIS;
 5966: 
 5967:           /* Come here from the \g<...> and \g'...' code (Oniguruma
 5968:           compatibility). However, the syntax has been checked to ensure that
 5969:           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
 5970:           be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
 5971:           ever be taken. */
 5972: 
 5973:           HANDLE_NUMERICAL_RECURSION:
 5974: 
 5975:           if ((refsign = *ptr) == CHAR_PLUS)
 5976:             {
 5977:             ptr++;
 5978:             if ((digitab[*ptr] & ctype_digit) == 0)
 5979:               {
 5980:               *errorcodeptr = ERR63;
 5981:               goto FAILED;
 5982:               }
 5983:             }
 5984:           else if (refsign == CHAR_MINUS)
 5985:             {
 5986:             if ((digitab[ptr[1]] & ctype_digit) == 0)
 5987:               goto OTHER_CHAR_AFTER_QUERY;
 5988:             ptr++;
 5989:             }
 5990: 
 5991:           recno = 0;
 5992:           while((digitab[*ptr] & ctype_digit) != 0)
 5993:             recno = recno * 10 + *ptr++ - CHAR_0;
 5994: 
 5995:           if (*ptr != terminator)
 5996:             {
 5997:             *errorcodeptr = ERR29;
 5998:             goto FAILED;
 5999:             }
 6000: 
 6001:           if (refsign == CHAR_MINUS)
 6002:             {
 6003:             if (recno == 0)
 6004:               {
 6005:               *errorcodeptr = ERR58;
 6006:               goto FAILED;
 6007:               }
 6008:             recno = cd->bracount - recno + 1;
 6009:             if (recno <= 0)
 6010:               {
 6011:               *errorcodeptr = ERR15;
 6012:               goto FAILED;
 6013:               }
 6014:             }
 6015:           else if (refsign == CHAR_PLUS)
 6016:             {
 6017:             if (recno == 0)
 6018:               {
 6019:               *errorcodeptr = ERR58;
 6020:               goto FAILED;
 6021:               }
 6022:             recno += cd->bracount;
 6023:             }
 6024: 
 6025:           /* Come here from code above that handles a named recursion */
 6026: 
 6027:           HANDLE_RECURSION:
 6028: 
 6029:           previous = code;
 6030:           called = cd->start_code;
 6031: 
 6032:           /* When we are actually compiling, find the bracket that is being
 6033:           referenced. Temporarily end the regex in case it doesn't exist before
 6034:           this point. If we end up with a forward reference, first check that
 6035:           the bracket does occur later so we can give the error (and position)
 6036:           now. Then remember this forward reference in the workspace so it can
 6037:           be filled in at the end. */
 6038: 
 6039:           if (lengthptr == NULL)
 6040:             {
 6041:             *code = OP_END;
 6042:             if (recno != 0)
 6043:               called = _pcre_find_bracket(cd->start_code, utf8, recno);
 6044: 
 6045:             /* Forward reference */
 6046: 
 6047:             if (called == NULL)
 6048:               {
 6049:               if (find_parens(cd, NULL, recno,
 6050:                     (options & PCRE_EXTENDED) != 0, utf8) < 0)
 6051:                 {
 6052:                 *errorcodeptr = ERR15;
 6053:                 goto FAILED;
 6054:                 }
 6055: 
 6056:               /* Fudge the value of "called" so that when it is inserted as an
 6057:               offset below, what is actually inserted is the reference number
 6058:               of the group. Then remember the forward reference. */
 6059: 
 6060:               called = cd->start_code + recno;
 6061:               if (cd->hwm >= cd->start_workspace + cd->workspace_size -
 6062:                   WORK_SIZE_SAFETY_MARGIN)
 6063:                 {
 6064:                 *errorcodeptr = expand_workspace(cd);
 6065:                 if (*errorcodeptr != 0) goto FAILED;
 6066:                 }
 6067:               PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
 6068:               }
 6069: 
 6070:             /* If not a forward reference, and the subpattern is still open,
 6071:             this is a recursive call. We check to see if this is a left
 6072:             recursion that could loop for ever, and diagnose that case. We
 6073:             must not, however, do this check if we are in a conditional
 6074:             subpattern because the condition might be testing for recursion in
 6075:             a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
 6076:             Forever loops are also detected at runtime, so those that occur in
 6077:             conditional subpatterns will be picked up then. */
 6078: 
 6079:             else if (GET(called, 1) == 0 && cond_depth <= 0 &&
 6080:                      could_be_empty(called, code, bcptr, utf8, cd))
 6081:               {
 6082:               *errorcodeptr = ERR40;
 6083:               goto FAILED;
 6084:               }
 6085:             }
 6086: 
 6087:           /* Insert the recursion/subroutine item. It does not have a set first
 6088:           byte (relevant if it is repeated, because it will then be wrapped
 6089:           with ONCE brackets). */
 6090: 
 6091:           *code = OP_RECURSE;
 6092:           PUT(code, 1, (int)(called - cd->start_code));
 6093:           code += 1 + LINK_SIZE;
 6094:           groupsetfirstbyte = FALSE;
 6095:           }
 6096: 
 6097:         /* Can't determine a first byte now */
 6098: 
 6099:         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
 6100:         continue;
 6101: 
 6102: 
 6103:         /* ------------------------------------------------------------ */
 6104:         default:              /* Other characters: check option setting */
 6105:         OTHER_CHAR_AFTER_QUERY:
 6106:         set = unset = 0;
 6107:         optset = &set;
 6108: 
 6109:         while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
 6110:           {
 6111:           switch (*ptr++)
 6112:             {
 6113:             case CHAR_MINUS: optset = &unset; break;
 6114: 
 6115:             case CHAR_J:    /* Record that it changed in the external options */
 6116:             *optset |= PCRE_DUPNAMES;
 6117:             cd->external_flags |= PCRE_JCHANGED;
 6118:             break;
 6119: 
 6120:             case CHAR_i: *optset |= PCRE_CASELESS; break;
 6121:             case CHAR_m: *optset |= PCRE_MULTILINE; break;
 6122:             case CHAR_s: *optset |= PCRE_DOTALL; break;
 6123:             case CHAR_x: *optset |= PCRE_EXTENDED; break;
 6124:             case CHAR_U: *optset |= PCRE_UNGREEDY; break;
 6125:             case CHAR_X: *optset |= PCRE_EXTRA; break;
 6126: 
 6127:             default:  *errorcodeptr = ERR12;
 6128:                       ptr--;    /* Correct the offset */
 6129:                       goto FAILED;
 6130:             }
 6131:           }
 6132: 
 6133:         /* Set up the changed option bits, but don't change anything yet. */
 6134: 
 6135:         newoptions = (options | set) & (~unset);
 6136: 
 6137:         /* If the options ended with ')' this is not the start of a nested
 6138:         group with option changes, so the options change at this level. If this
 6139:         item is right at the start of the pattern, the options can be
 6140:         abstracted and made external in the pre-compile phase, and ignored in
 6141:         the compile phase. This can be helpful when matching -- for instance in
 6142:         caseless checking of required bytes.
 6143: 
 6144:         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
 6145:         definitely *not* at the start of the pattern because something has been
 6146:         compiled. In the pre-compile phase, however, the code pointer can have
 6147:         that value after the start, because it gets reset as code is discarded
 6148:         during the pre-compile. However, this can happen only at top level - if
 6149:         we are within parentheses, the starting BRA will still be present. At
 6150:         any parenthesis level, the length value can be used to test if anything
 6151:         has been compiled at that level. Thus, a test for both these conditions
 6152:         is necessary to ensure we correctly detect the start of the pattern in
 6153:         both phases.
 6154: 
 6155:         If we are not at the pattern start, reset the greedy defaults and the
 6156:         case value for firstbyte and reqbyte. */
 6157: 
 6158:         if (*ptr == CHAR_RIGHT_PARENTHESIS)
 6159:           {
 6160:           if (code == cd->start_code + 1 + LINK_SIZE &&
 6161:                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
 6162:             {
 6163:             cd->external_options = newoptions;
 6164:             }
 6165:           else
 6166:             {
 6167:             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
 6168:             greedy_non_default = greedy_default ^ 1;
 6169:             req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
 6170:             }
 6171: 
 6172:           /* Change options at this level, and pass them back for use
 6173:           in subsequent branches. */
 6174: 
 6175:           *optionsptr = options = newoptions;
 6176:           previous = NULL;       /* This item can't be repeated */
 6177:           continue;              /* It is complete */
 6178:           }
 6179: 
 6180:         /* If the options ended with ':' we are heading into a nested group
 6181:         with possible change of options. Such groups are non-capturing and are
 6182:         not assertions of any kind. All we need to do is skip over the ':';
 6183:         the newoptions value is handled below. */
 6184: 
 6185:         bravalue = OP_BRA;
 6186:         ptr++;
 6187:         }     /* End of switch for character following (? */
 6188:       }       /* End of (? handling */
 6189: 
 6190:     /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
 6191:     is set, all unadorned brackets become non-capturing and behave like (?:...)
 6192:     brackets. */
 6193: 
 6194:     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
 6195:       {
 6196:       bravalue = OP_BRA;
 6197:       }
 6198: 
 6199:     /* Else we have a capturing group. */
 6200: 
 6201:     else
 6202:       {
 6203:       NUMBERED_GROUP:
 6204:       cd->bracount += 1;
 6205:       PUT2(code, 1+LINK_SIZE, cd->bracount);
 6206:       skipbytes = 2;
 6207:       }
 6208: 
 6209:     /* Process nested bracketed regex. Assertions used not to be repeatable,
 6210:     but this was changed for Perl compatibility, so all kinds can now be
 6211:     repeated. We copy code into a non-register variable (tempcode) in order to
 6212:     be able to pass its address because some compilers complain otherwise. */
 6213: 
 6214:     previous = code;                      /* For handling repetition */
 6215:     *code = bravalue;
 6216:     tempcode = code;
 6217:     tempreqvary = cd->req_varyopt;        /* Save value before bracket */
 6218:     tempbracount = cd->bracount;          /* Save value before bracket */
 6219:     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
 6220: 
 6221:     if (!compile_regex(
 6222:          newoptions,                      /* The complete new option state */
 6223:          &tempcode,                       /* Where to put code (updated) */
 6224:          &ptr,                            /* Input pointer (updated) */
 6225:          errorcodeptr,                    /* Where to put an error message */
 6226:          (bravalue == OP_ASSERTBACK ||
 6227:           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
 6228:          reset_bracount,                  /* True if (?| group */
 6229:          skipbytes,                       /* Skip over bracket number */
 6230:          cond_depth +
 6231:            ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
 6232:          &subfirstbyte,                   /* For possible first char */
 6233:          &subreqbyte,                     /* For possible last char */
 6234:          bcptr,                           /* Current branch chain */
 6235:          cd,                              /* Tables block */
 6236:          (lengthptr == NULL)? NULL :      /* Actual compile phase */
 6237:            &length_prevgroup              /* Pre-compile phase */
 6238:          ))
 6239:       goto FAILED;
 6240: 
 6241:     /* If this was an atomic group and there are no capturing groups within it,
 6242:     generate OP_ONCE_NC instead of OP_ONCE. */
 6243: 
 6244:     if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
 6245:       *code = OP_ONCE_NC;
 6246: 
 6247:     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
 6248:       cd->assert_depth -= 1;
 6249: 
 6250:     /* At the end of compiling, code is still pointing to the start of the
 6251:     group, while tempcode has been updated to point past the end of the group.
 6252:     The pattern pointer (ptr) is on the bracket.
 6253: 
 6254:     If this is a conditional bracket, check that there are no more than
 6255:     two branches in the group, or just one if it's a DEFINE group. We do this
 6256:     in the real compile phase, not in the pre-pass, where the whole group may
 6257:     not be available. */
 6258: 
 6259:     if (bravalue == OP_COND && lengthptr == NULL)
 6260:       {
 6261:       uschar *tc = code;
 6262:       int condcount = 0;
 6263: 
 6264:       do {
 6265:          condcount++;
 6266:          tc += GET(tc,1);
 6267:          }
 6268:       while (*tc != OP_KET);
 6269: 
 6270:       /* A DEFINE group is never obeyed inline (the "condition" is always
 6271:       false). It must have only one branch. */
 6272: 
 6273:       if (code[LINK_SIZE+1] == OP_DEF)
 6274:         {
 6275:         if (condcount > 1)
 6276:           {
 6277:           *errorcodeptr = ERR54;
 6278:           goto FAILED;
 6279:           }
 6280:         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
 6281:         }
 6282: 
 6283:       /* A "normal" conditional group. If there is just one branch, we must not
 6284:       make use of its firstbyte or reqbyte, because this is equivalent to an
 6285:       empty second branch. */
 6286: 
 6287:       else
 6288:         {
 6289:         if (condcount > 2)
 6290:           {
 6291:           *errorcodeptr = ERR27;
 6292:           goto FAILED;
 6293:           }
 6294:         if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
 6295:         }
 6296:       }
 6297: 
 6298:     /* Error if hit end of pattern */
 6299: 
 6300:     if (*ptr != CHAR_RIGHT_PARENTHESIS)
 6301:       {
 6302:       *errorcodeptr = ERR14;
 6303:       goto FAILED;
 6304:       }
 6305: 
 6306:     /* In the pre-compile phase, update the length by the length of the group,
 6307:     less the brackets at either end. Then reduce the compiled code to just a
 6308:     set of non-capturing brackets so that it doesn't use much memory if it is
 6309:     duplicated by a quantifier.*/
 6310: 
 6311:     if (lengthptr != NULL)
 6312:       {
 6313:       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
 6314:         {
 6315:         *errorcodeptr = ERR20;
 6316:         goto FAILED;
 6317:         }
 6318:       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
 6319:       code++;   /* This already contains bravalue */
 6320:       PUTINC(code, 0, 1 + LINK_SIZE);
 6321:       *code++ = OP_KET;
 6322:       PUTINC(code, 0, 1 + LINK_SIZE);
 6323:       break;    /* No need to waste time with special character handling */
 6324:       }
 6325: 
 6326:     /* Otherwise update the main code pointer to the end of the group. */
 6327: 
 6328:     code = tempcode;
 6329: 
 6330:     /* For a DEFINE group, required and first character settings are not
 6331:     relevant. */
 6332: 
 6333:     if (bravalue == OP_DEF) break;
 6334: 
 6335:     /* Handle updating of the required and first characters for other types of
 6336:     group. Update for normal brackets of all kinds, and conditions with two
 6337:     branches (see code above). If the bracket is followed by a quantifier with
 6338:     zero repeat, we have to back off. Hence the definition of zeroreqbyte and
 6339:     zerofirstbyte outside the main loop so that they can be accessed for the
 6340:     back off. */
 6341: 
 6342:     zeroreqbyte = reqbyte;
 6343:     zerofirstbyte = firstbyte;
 6344:     groupsetfirstbyte = FALSE;
 6345: 
 6346:     if (bravalue >= OP_ONCE)
 6347:       {
 6348:       /* If we have not yet set a firstbyte in this branch, take it from the
 6349:       subpattern, remembering that it was set here so that a repeat of more
 6350:       than one can replicate it as reqbyte if necessary. If the subpattern has
 6351:       no firstbyte, set "none" for the whole branch. In both cases, a zero
 6352:       repeat forces firstbyte to "none". */
 6353: 
 6354:       if (firstbyte == REQ_UNSET)
 6355:         {
 6356:         if (subfirstbyte >= 0)
 6357:           {
 6358:           firstbyte = subfirstbyte;
 6359:           groupsetfirstbyte = TRUE;
 6360:           }
 6361:         else firstbyte = REQ_NONE;
 6362:         zerofirstbyte = REQ_NONE;
 6363:         }
 6364: 
 6365:       /* If firstbyte was previously set, convert the subpattern's firstbyte
 6366:       into reqbyte if there wasn't one, using the vary flag that was in
 6367:       existence beforehand. */
 6368: 
 6369:       else if (subfirstbyte >= 0 && subreqbyte < 0)
 6370:         subreqbyte = subfirstbyte | tempreqvary;
 6371: 
 6372:       /* If the subpattern set a required byte (or set a first byte that isn't
 6373:       really the first byte - see above), set it. */
 6374: 
 6375:       if (subreqbyte >= 0) reqbyte = subreqbyte;
 6376:       }
 6377: 
 6378:     /* For a forward assertion, we take the reqbyte, if set. This can be
 6379:     helpful if the pattern that follows the assertion doesn't set a different
 6380:     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
 6381:     for an assertion, however because it leads to incorrect effect for patterns
 6382:     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
 6383:     of a firstbyte. This is overcome by a scan at the end if there's no
 6384:     firstbyte, looking for an asserted first char. */
 6385: 
 6386:     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
 6387:     break;     /* End of processing '(' */
 6388: 
 6389: 
 6390:     /* ===================================================================*/
 6391:     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
 6392:     are arranged to be the negation of the corresponding OP_values in the
 6393:     default case when PCRE_UCP is not set. For the back references, the values
 6394:     are ESC_REF plus the reference number. Only back references and those types
 6395:     that consume a character may be repeated. We can test for values between
 6396:     ESC_b and ESC_Z for the latter; this may have to change if any new ones are
 6397:     ever created. */
 6398: 
 6399:     case CHAR_BACKSLASH:
 6400:     tempptr = ptr;
 6401:     c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
 6402:     if (*errorcodeptr != 0) goto FAILED;
 6403: 
 6404:     if (c < 0)
 6405:       {
 6406:       if (-c == ESC_Q)            /* Handle start of quoted string */
 6407:         {
 6408:         if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
 6409:           ptr += 2;               /* avoid empty string */
 6410:             else inescq = TRUE;
 6411:         continue;
 6412:         }
 6413: 
 6414:       if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
 6415: 
 6416:       /* For metasequences that actually match a character, we disable the
 6417:       setting of a first character if it hasn't already been set. */
 6418: 
 6419:       if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
 6420:         firstbyte = REQ_NONE;
 6421: 
 6422:       /* Set values to reset to if this is followed by a zero repeat. */
 6423: 
 6424:       zerofirstbyte = firstbyte;
 6425:       zeroreqbyte = reqbyte;
 6426: 
 6427:       /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
 6428:       is a subroutine call by number (Oniguruma syntax). In fact, the value
 6429:       -ESC_g is returned only for these cases. So we don't need to check for <
 6430:       or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
 6431:       -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
 6432:       that is a synonym for a named back reference). */
 6433: 
 6434:       if (-c == ESC_g)
 6435:         {
 6436:         const uschar *p;
 6437:         save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
 6438:         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
 6439:           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
 6440: 
 6441:         /* These two statements stop the compiler for warning about possibly
 6442:         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
 6443:         fact, because we actually check for a number below, the paths that
 6444:         would actually be in error are never taken. */
 6445: 
 6446:         skipbytes = 0;
 6447:         reset_bracount = FALSE;
 6448: 
 6449:         /* Test for a name */
 6450: 
 6451:         if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
 6452:           {
 6453:           BOOL isnumber = TRUE;
 6454:           for (p = ptr + 1; *p != 0 && *p != terminator; p++)
 6455:             {
 6456:             if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
 6457:             if ((cd->ctypes[*p] & ctype_word) == 0) break;
 6458:             }
 6459:           if (*p != terminator)
 6460:             {
 6461:             *errorcodeptr = ERR57;
 6462:             break;
 6463:             }
 6464:           if (isnumber)
 6465:             {
 6466:             ptr++;
 6467:             goto HANDLE_NUMERICAL_RECURSION;
 6468:             }
 6469:           is_recurse = TRUE;
 6470:           goto NAMED_REF_OR_RECURSE;
 6471:           }
 6472: 
 6473:         /* Test a signed number in angle brackets or quotes. */
 6474: 
 6475:         p = ptr + 2;
 6476:         while ((digitab[*p] & ctype_digit) != 0) p++;
 6477:         if (*p != terminator)
 6478:           {
 6479:           *errorcodeptr = ERR57;
 6480:           break;
 6481:           }
 6482:         ptr++;
 6483:         goto HANDLE_NUMERICAL_RECURSION;
 6484:         }
 6485: 
 6486:       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
 6487:       We also support \k{name} (.NET syntax).  */
 6488: 
 6489:       if (-c == ESC_k)
 6490:         {
 6491:         if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
 6492:           ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
 6493:           {
 6494:           *errorcodeptr = ERR69;
 6495:           break;
 6496:           }
 6497:         is_recurse = FALSE;
 6498:         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
 6499:           CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
 6500:           CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
 6501:         goto NAMED_REF_OR_RECURSE;
 6502:         }
 6503: 
 6504:       /* Back references are handled specially; must disable firstbyte if
 6505:       not set to cope with cases like (?=(\w+))\1: which would otherwise set
 6506:       ':' later. */
 6507: 
 6508:       if (-c >= ESC_REF)
 6509:         {
 6510:         open_capitem *oc;
 6511:         recno = -c - ESC_REF;
 6512: 
 6513:         HANDLE_REFERENCE:    /* Come here from named backref handling */
 6514:         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
 6515:         previous = code;
 6516:         *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
 6517:         PUT2INC(code, 0, recno);
 6518:         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
 6519:         if (recno > cd->top_backref) cd->top_backref = recno;
 6520: 
 6521:         /* Check to see if this back reference is recursive, that it, it
 6522:         is inside the group that it references. A flag is set so that the
 6523:         group can be made atomic. */
 6524: 
 6525:         for (oc = cd->open_caps; oc != NULL; oc = oc->next)
 6526:           {
 6527:           if (oc->number == recno)
 6528:             {
 6529:             oc->flag = TRUE;
 6530:             break;
 6531:             }
 6532:           }
 6533:         }
 6534: 
 6535:       /* So are Unicode property matches, if supported. */
 6536: 
 6537: #ifdef SUPPORT_UCP
 6538:       else if (-c == ESC_P || -c == ESC_p)
 6539:         {
 6540:         BOOL negated;
 6541:         int pdata;
 6542:         int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
 6543:         if (ptype < 0) goto FAILED;
 6544:         previous = code;
 6545:         *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
 6546:         *code++ = ptype;
 6547:         *code++ = pdata;
 6548:         }
 6549: #else
 6550: 
 6551:       /* If Unicode properties are not supported, \X, \P, and \p are not
 6552:       allowed. */
 6553: 
 6554:       else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
 6555:         {
 6556:         *errorcodeptr = ERR45;
 6557:         goto FAILED;
 6558:         }
 6559: #endif
 6560: 
 6561:       /* For the rest (including \X when Unicode properties are supported), we
 6562:       can obtain the OP value by negating the escape value in the default
 6563:       situation when PCRE_UCP is not set. When it *is* set, we substitute
 6564:       Unicode property tests. */
 6565: 
 6566:       else
 6567:         {
 6568: #ifdef SUPPORT_UCP
 6569:         if (-c >= ESC_DU && -c <= ESC_wu)
 6570:           {
 6571:           nestptr = ptr + 1;                   /* Where to resume */
 6572:           ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
 6573:           }
 6574:         else
 6575: #endif
 6576:         /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
 6577:         so that it works in DFA mode and in lookbehinds. */
 6578: 
 6579:           {
 6580:           previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
 6581:           *code++ = (!utf8 && c == -ESC_C)? OP_ALLANY : -c;
 6582:           }
 6583:         }
 6584:       continue;
 6585:       }
 6586: 
 6587:     /* We have a data character whose value is in c. In UTF-8 mode it may have
 6588:     a value > 127. We set its representation in the length/buffer, and then
 6589:     handle it as a data character. */
 6590: 
 6591: #ifdef SUPPORT_UTF8
 6592:     if (utf8 && c > 127)
 6593:       mclength = _pcre_ord2utf8(c, mcbuffer);
 6594:     else
 6595: #endif
 6596: 
 6597:      {
 6598:      mcbuffer[0] = c;
 6599:      mclength = 1;
 6600:      }
 6601:     goto ONE_CHAR;
 6602: 
 6603: 
 6604:     /* ===================================================================*/
 6605:     /* Handle a literal character. It is guaranteed not to be whitespace or #
 6606:     when the extended flag is set. If we are in UTF-8 mode, it may be a
 6607:     multi-byte literal character. */
 6608: 
 6609:     default:
 6610:     NORMAL_CHAR:
 6611:     mclength = 1;
 6612:     mcbuffer[0] = c;
 6613: 
 6614: #ifdef SUPPORT_UTF8
 6615:     if (utf8 && c >= 0xc0)
 6616:       {
 6617:       while ((ptr[1] & 0xc0) == 0x80)
 6618:         mcbuffer[mclength++] = *(++ptr);
 6619:       }
 6620: #endif
 6621: 
 6622:     /* At this point we have the character's bytes in mcbuffer, and the length
 6623:     in mclength. When not in UTF-8 mode, the length is always 1. */
 6624: 
 6625:     ONE_CHAR:
 6626:     previous = code;
 6627:     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
 6628:     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
 6629: 
 6630:     /* Remember if \r or \n were seen */
 6631: 
 6632:     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
 6633:       cd->external_flags |= PCRE_HASCRORLF;
 6634: 
 6635:     /* Set the first and required bytes appropriately. If no previous first
 6636:     byte, set it from this character, but revert to none on a zero repeat.
 6637:     Otherwise, leave the firstbyte value alone, and don't change it on a zero
 6638:     repeat. */
 6639: 
 6640:     if (firstbyte == REQ_UNSET)
 6641:       {
 6642:       zerofirstbyte = REQ_NONE;
 6643:       zeroreqbyte = reqbyte;
 6644: 
 6645:       /* If the character is more than one byte long, we can set firstbyte
 6646:       only if it is not to be matched caselessly. */
 6647: 
 6648:       if (mclength == 1 || req_caseopt == 0)
 6649:         {
 6650:         firstbyte = mcbuffer[0] | req_caseopt;
 6651:         if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
 6652:         }
 6653:       else firstbyte = reqbyte = REQ_NONE;
 6654:       }
 6655: 
 6656:     /* firstbyte was previously set; we can set reqbyte only if the length is
 6657:     1 or the matching is caseful. */
 6658: 
 6659:     else
 6660:       {
 6661:       zerofirstbyte = firstbyte;
 6662:       zeroreqbyte = reqbyte;
 6663:       if (mclength == 1 || req_caseopt == 0)
 6664:         reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
 6665:       }
 6666: 
 6667:     break;            /* End of literal character handling */
 6668:     }
 6669:   }                   /* end of big loop */
 6670: 
 6671: 
 6672: /* Control never reaches here by falling through, only by a goto for all the
 6673: error states. Pass back the position in the pattern so that it can be displayed
 6674: to the user for diagnosing the error. */
 6675: 
 6676: FAILED:
 6677: *ptrptr = ptr;
 6678: return FALSE;
 6679: }
 6680: 
 6681: 
 6682: 
 6683: 
 6684: /*************************************************
 6685: *     Compile sequence of alternatives           *
 6686: *************************************************/
 6687: 
 6688: /* On entry, ptr is pointing past the bracket character, but on return it
 6689: points to the closing bracket, or vertical bar, or end of string. The code
 6690: variable is pointing at the byte into which the BRA operator has been stored.
 6691: This function is used during the pre-compile phase when we are trying to find
 6692: out the amount of memory needed, as well as during the real compile phase. The
 6693: value of lengthptr distinguishes the two phases.
 6694: 
 6695: Arguments:
 6696:   options        option bits, including any changes for this subpattern
 6697:   codeptr        -> the address of the current code pointer
 6698:   ptrptr         -> the address of the current pattern pointer
 6699:   errorcodeptr   -> pointer to error code variable
 6700:   lookbehind     TRUE if this is a lookbehind assertion
 6701:   reset_bracount TRUE to reset the count for each branch
 6702:   skipbytes      skip this many bytes at start (for brackets and OP_COND)
 6703:   cond_depth     depth of nesting for conditional subpatterns
 6704:   firstbyteptr   place to put the first required character, or a negative number
 6705:   reqbyteptr     place to put the last required character, or a negative number
 6706:   bcptr          pointer to the chain of currently open branches
 6707:   cd             points to the data block with tables pointers etc.
 6708:   lengthptr      NULL during the real compile phase
 6709:                  points to length accumulator during pre-compile phase
 6710: 
 6711: Returns:         TRUE on success
 6712: */
 6713: 
 6714: static BOOL
 6715: compile_regex(int options, uschar **codeptr, const uschar **ptrptr,
 6716:   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
 6717:   int cond_depth, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
 6718:   compile_data *cd, int *lengthptr)
 6719: {
 6720: const uschar *ptr = *ptrptr;
 6721: uschar *code = *codeptr;
 6722: uschar *last_branch = code;
 6723: uschar *start_bracket = code;
 6724: uschar *reverse_count = NULL;
 6725: open_capitem capitem;
 6726: int capnumber = 0;
 6727: int firstbyte, reqbyte;
 6728: int branchfirstbyte, branchreqbyte;
 6729: int length;
 6730: int orig_bracount;
 6731: int max_bracount;
 6732: branch_chain bc;
 6733: 
 6734: bc.outer = bcptr;
 6735: bc.current_branch = code;
 6736: 
 6737: firstbyte = reqbyte = REQ_UNSET;
 6738: 
 6739: /* Accumulate the length for use in the pre-compile phase. Start with the
 6740: length of the BRA and KET and any extra bytes that are required at the
 6741: beginning. We accumulate in a local variable to save frequent testing of
 6742: lenthptr for NULL. We cannot do this by looking at the value of code at the
 6743: start and end of each alternative, because compiled items are discarded during
 6744: the pre-compile phase so that the work space is not exceeded. */
 6745: 
 6746: length = 2 + 2*LINK_SIZE + skipbytes;
 6747: 
 6748: /* WARNING: If the above line is changed for any reason, you must also change
 6749: the code that abstracts option settings at the start of the pattern and makes
 6750: them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
 6751: pre-compile phase to find out whether anything has yet been compiled or not. */
 6752: 
 6753: /* If this is a capturing subpattern, add to the chain of open capturing items
 6754: so that we can detect them if (*ACCEPT) is encountered. This is also used to
 6755: detect groups that contain recursive back references to themselves. Note that
 6756: only OP_CBRA need be tested here; changing this opcode to one of its variants,
 6757: e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
 6758: 
 6759: if (*code == OP_CBRA)
 6760:   {
 6761:   capnumber = GET2(code, 1 + LINK_SIZE);
 6762:   capitem.number = capnumber;
 6763:   capitem.next = cd->open_caps;
 6764:   capitem.flag = FALSE;
 6765:   cd->open_caps = &capitem;
 6766:   }
 6767: 
 6768: /* Offset is set zero to mark that this bracket is still open */
 6769: 
 6770: PUT(code, 1, 0);
 6771: code += 1 + LINK_SIZE + skipbytes;
 6772: 
 6773: /* Loop for each alternative branch */
 6774: 
 6775: orig_bracount = max_bracount = cd->bracount;
 6776: for (;;)
 6777:   {
 6778:   /* For a (?| group, reset the capturing bracket count so that each branch
 6779:   uses the same numbers. */
 6780: 
 6781:   if (reset_bracount) cd->bracount = orig_bracount;
 6782: 
 6783:   /* Set up dummy OP_REVERSE if lookbehind assertion */
 6784: 
 6785:   if (lookbehind)
 6786:     {
 6787:     *code++ = OP_REVERSE;
 6788:     reverse_count = code;
 6789:     PUTINC(code, 0, 0);
 6790:     length += 1 + LINK_SIZE;
 6791:     }
 6792: 
 6793:   /* Now compile the branch; in the pre-compile phase its length gets added
 6794:   into the length. */
 6795: 
 6796:   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
 6797:         &branchreqbyte, &bc, cond_depth, cd,
 6798:         (lengthptr == NULL)? NULL : &length))
 6799:     {
 6800:     *ptrptr = ptr;
 6801:     return FALSE;
 6802:     }
 6803: 
 6804:   /* Keep the highest bracket count in case (?| was used and some branch
 6805:   has fewer than the rest. */
 6806: 
 6807:   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
 6808: 
 6809:   /* In the real compile phase, there is some post-processing to be done. */
 6810: 
 6811:   if (lengthptr == NULL)
 6812:     {
 6813:     /* If this is the first branch, the firstbyte and reqbyte values for the
 6814:     branch become the values for the regex. */
 6815: 
 6816:     if (*last_branch != OP_ALT)
 6817:       {
 6818:       firstbyte = branchfirstbyte;
 6819:       reqbyte = branchreqbyte;
 6820:       }
 6821: 
 6822:     /* If this is not the first branch, the first char and reqbyte have to
 6823:     match the values from all the previous branches, except that if the
 6824:     previous value for reqbyte didn't have REQ_VARY set, it can still match,
 6825:     and we set REQ_VARY for the regex. */
 6826: 
 6827:     else
 6828:       {
 6829:       /* If we previously had a firstbyte, but it doesn't match the new branch,
 6830:       we have to abandon the firstbyte for the regex, but if there was
 6831:       previously no reqbyte, it takes on the value of the old firstbyte. */
 6832: 
 6833:       if (firstbyte >= 0 && firstbyte != branchfirstbyte)
 6834:         {
 6835:         if (reqbyte < 0) reqbyte = firstbyte;
 6836:         firstbyte = REQ_NONE;
 6837:         }
 6838: 
 6839:       /* If we (now or from before) have no firstbyte, a firstbyte from the
 6840:       branch becomes a reqbyte if there isn't a branch reqbyte. */
 6841: 
 6842:       if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
 6843:           branchreqbyte = branchfirstbyte;
 6844: 
 6845:       /* Now ensure that the reqbytes match */
 6846: 
 6847:       if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
 6848:         reqbyte = REQ_NONE;
 6849:       else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
 6850:       }
 6851: 
 6852:     /* If lookbehind, check that this branch matches a fixed-length string, and
 6853:     put the length into the OP_REVERSE item. Temporarily mark the end of the
 6854:     branch with OP_END. If the branch contains OP_RECURSE, the result is -3
 6855:     because there may be forward references that we can't check here. Set a
 6856:     flag to cause another lookbehind check at the end. Why not do it all at the
 6857:     end? Because common, erroneous checks are picked up here and the offset of
 6858:     the problem can be shown. */
 6859: 
 6860:     if (lookbehind)
 6861:       {
 6862:       int fixed_length;
 6863:       *code = OP_END;
 6864:       fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
 6865:         FALSE, cd);
 6866:       DPRINTF(("fixed length = %d\n", fixed_length));
 6867:       if (fixed_length == -3)
 6868:         {
 6869:         cd->check_lookbehind = TRUE;
 6870:         }
 6871:       else if (fixed_length < 0)
 6872:         {
 6873:         *errorcodeptr = (fixed_length == -2)? ERR36 :
 6874:                         (fixed_length == -4)? ERR70: ERR25;
 6875:         *ptrptr = ptr;
 6876:         return FALSE;
 6877:         }
 6878:       else { PUT(reverse_count, 0, fixed_length); }
 6879:       }
 6880:     }
 6881: 
 6882:   /* Reached end of expression, either ')' or end of pattern. In the real
 6883:   compile phase, go back through the alternative branches and reverse the chain
 6884:   of offsets, with the field in the BRA item now becoming an offset to the
 6885:   first alternative. If there are no alternatives, it points to the end of the
 6886:   group. The length in the terminating ket is always the length of the whole
 6887:   bracketed item. Return leaving the pointer at the terminating char. */
 6888: 
 6889:   if (*ptr != CHAR_VERTICAL_LINE)
 6890:     {
 6891:     if (lengthptr == NULL)
 6892:       {
 6893:       int branch_length = (int)(code - last_branch);
 6894:       do
 6895:         {
 6896:         int prev_length = GET(last_branch, 1);
 6897:         PUT(last_branch, 1, branch_length);
 6898:         branch_length = prev_length;
 6899:         last_branch -= branch_length;
 6900:         }
 6901:       while (branch_length > 0);
 6902:       }
 6903: 
 6904:     /* Fill in the ket */
 6905: 
 6906:     *code = OP_KET;
 6907:     PUT(code, 1, (int)(code - start_bracket));
 6908:     code += 1 + LINK_SIZE;
 6909: 
 6910:     /* If it was a capturing subpattern, check to see if it contained any
 6911:     recursive back references. If so, we must wrap it in atomic brackets.
 6912:     In any event, remove the block from the chain. */
 6913: 
 6914:     if (capnumber > 0)
 6915:       {
 6916:       if (cd->open_caps->flag)
 6917:         {
 6918:         memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
 6919:           code - start_bracket);
 6920:         *start_bracket = OP_ONCE;
 6921:         code += 1 + LINK_SIZE;
 6922:         PUT(start_bracket, 1, (int)(code - start_bracket));
 6923:         *code = OP_KET;
 6924:         PUT(code, 1, (int)(code - start_bracket));
 6925:         code += 1 + LINK_SIZE;
 6926:         length += 2 + 2*LINK_SIZE;
 6927:         }
 6928:       cd->open_caps = cd->open_caps->next;
 6929:       }
 6930: 
 6931:     /* Retain the highest bracket number, in case resetting was used. */
 6932: 
 6933:     cd->bracount = max_bracount;
 6934: 
 6935:     /* Set values to pass back */
 6936: 
 6937:     *codeptr = code;
 6938:     *ptrptr = ptr;
 6939:     *firstbyteptr = firstbyte;
 6940:     *reqbyteptr = reqbyte;
 6941:     if (lengthptr != NULL)
 6942:       {
 6943:       if (OFLOW_MAX - *lengthptr < length)
 6944:         {
 6945:         *errorcodeptr = ERR20;
 6946:         return FALSE;
 6947:         }
 6948:       *lengthptr += length;
 6949:       }
 6950:     return TRUE;
 6951:     }
 6952: 
 6953:   /* Another branch follows. In the pre-compile phase, we can move the code
 6954:   pointer back to where it was for the start of the first branch. (That is,
 6955:   pretend that each branch is the only one.)
 6956: 
 6957:   In the real compile phase, insert an ALT node. Its length field points back
 6958:   to the previous branch while the bracket remains open. At the end the chain
 6959:   is reversed. It's done like this so that the start of the bracket has a
 6960:   zero offset until it is closed, making it possible to detect recursion. */
 6961: 
 6962:   if (lengthptr != NULL)
 6963:     {
 6964:     code = *codeptr + 1 + LINK_SIZE + skipbytes;
 6965:     length += 1 + LINK_SIZE;
 6966:     }
 6967:   else
 6968:     {
 6969:     *code = OP_ALT;
 6970:     PUT(code, 1, (int)(code - last_branch));
 6971:     bc.current_branch = last_branch = code;
 6972:     code += 1 + LINK_SIZE;
 6973:     }
 6974: 
 6975:   ptr++;
 6976:   }
 6977: /* Control never reaches here */
 6978: }
 6979: 
 6980: 
 6981: 
 6982: 
 6983: /*************************************************
 6984: *          Check for anchored expression         *
 6985: *************************************************/
 6986: 
 6987: /* Try to find out if this is an anchored regular expression. Consider each
 6988: alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
 6989: all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
 6990: it's anchored. However, if this is a multiline pattern, then only OP_SOD will
 6991: be found, because ^ generates OP_CIRCM in that mode.
 6992: 
 6993: We can also consider a regex to be anchored if OP_SOM starts all its branches.
 6994: This is the code for \G, which means "match at start of match position, taking
 6995: into account the match offset".
 6996: 
 6997: A branch is also implicitly anchored if it starts with .* and DOTALL is set,
 6998: because that will try the rest of the pattern at all possible matching points,
 6999: so there is no point trying again.... er ....
 7000: 
 7001: .... except when the .* appears inside capturing parentheses, and there is a
 7002: subsequent back reference to those parentheses. We haven't enough information
 7003: to catch that case precisely.
 7004: 
 7005: At first, the best we could do was to detect when .* was in capturing brackets
 7006: and the highest back reference was greater than or equal to that level.
 7007: However, by keeping a bitmap of the first 31 back references, we can catch some
 7008: of the more common cases more precisely.
 7009: 
 7010: Arguments:
 7011:   code           points to start of expression (the bracket)
 7012:   bracket_map    a bitmap of which brackets we are inside while testing; this
 7013:                   handles up to substring 31; after that we just have to take
 7014:                   the less precise approach
 7015:   backref_map    the back reference bitmap
 7016: 
 7017: Returns:     TRUE or FALSE
 7018: */
 7019: 
 7020: static BOOL
 7021: is_anchored(register const uschar *code, unsigned int bracket_map,
 7022:   unsigned int backref_map)
 7023: {
 7024: do {
 7025:    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
 7026:      FALSE);
 7027:    register int op = *scode;
 7028: 
 7029:    /* Non-capturing brackets */
 7030: 
 7031:    if (op == OP_BRA  || op == OP_BRAPOS ||
 7032:        op == OP_SBRA || op == OP_SBRAPOS)
 7033:      {
 7034:      if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;
 7035:      }
 7036: 
 7037:    /* Capturing brackets */
 7038: 
 7039:    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
 7040:             op == OP_SCBRA || op == OP_SCBRAPOS)
 7041:      {
 7042:      int n = GET2(scode, 1+LINK_SIZE);
 7043:      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
 7044:      if (!is_anchored(scode, new_map, backref_map)) return FALSE;
 7045:      }
 7046: 
 7047:    /* Other brackets */
 7048: 
 7049:    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC ||
 7050:             op == OP_COND)
 7051:      {
 7052:      if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;
 7053:      }
 7054: 
 7055:    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
 7056:    it isn't in brackets that are or may be referenced. */
 7057: 
 7058:    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
 7059:              op == OP_TYPEPOSSTAR))
 7060:      {
 7061:      if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
 7062:        return FALSE;
 7063:      }
 7064: 
 7065:    /* Check for explicit anchoring */
 7066: 
 7067:    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
 7068:    code += GET(code, 1);
 7069:    }
 7070: while (*code == OP_ALT);   /* Loop for each alternative */
 7071: return TRUE;
 7072: }
 7073: 
 7074: 
 7075: 
 7076: /*************************************************
 7077: *         Check for starting with ^ or .*        *
 7078: *************************************************/
 7079: 
 7080: /* This is called to find out if every branch starts with ^ or .* so that
 7081: "first char" processing can be done to speed things up in multiline
 7082: matching and for non-DOTALL patterns that start with .* (which must start at
 7083: the beginning or after \n). As in the case of is_anchored() (see above), we
 7084: have to take account of back references to capturing brackets that contain .*
 7085: because in that case we can't make the assumption.
 7086: 
 7087: Arguments:
 7088:   code           points to start of expression (the bracket)
 7089:   bracket_map    a bitmap of which brackets we are inside while testing; this
 7090:                   handles up to substring 31; after that we just have to take
 7091:                   the less precise approach
 7092:   backref_map    the back reference bitmap
 7093: 
 7094: Returns:         TRUE or FALSE
 7095: */
 7096: 
 7097: static BOOL
 7098: is_startline(const uschar *code, unsigned int bracket_map,
 7099:   unsigned int backref_map)
 7100: {
 7101: do {
 7102:    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
 7103:      FALSE);
 7104:    register int op = *scode;
 7105: 
 7106:    /* If we are at the start of a conditional assertion group, *both* the
 7107:    conditional assertion *and* what follows the condition must satisfy the test
 7108:    for start of line. Other kinds of condition fail. Note that there may be an
 7109:    auto-callout at the start of a condition. */
 7110: 
 7111:    if (op == OP_COND)
 7112:      {
 7113:      scode += 1 + LINK_SIZE;
 7114:      if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
 7115:      switch (*scode)
 7116:        {
 7117:        case OP_CREF:
 7118:        case OP_NCREF:
 7119:        case OP_RREF:
 7120:        case OP_NRREF:
 7121:        case OP_DEF:
 7122:        return FALSE;
 7123: 
 7124:        default:     /* Assertion */
 7125:        if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
 7126:        do scode += GET(scode, 1); while (*scode == OP_ALT);
 7127:        scode += 1 + LINK_SIZE;
 7128:        break;
 7129:        }
 7130:      scode = first_significant_code(scode, FALSE);
 7131:      op = *scode;
 7132:      }
 7133: 
 7134:    /* Non-capturing brackets */
 7135: 
 7136:    if (op == OP_BRA  || op == OP_BRAPOS ||
 7137:        op == OP_SBRA || op == OP_SBRAPOS)
 7138:      {
 7139:      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
 7140:      }
 7141: 
 7142:    /* Capturing brackets */
 7143: 
 7144:    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
 7145:             op == OP_SCBRA || op == OP_SCBRAPOS)
 7146:      {
 7147:      int n = GET2(scode, 1+LINK_SIZE);
 7148:      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
 7149:      if (!is_startline(scode, new_map, backref_map)) return FALSE;
 7150:      }
 7151: 
 7152:    /* Other brackets */
 7153: 
 7154:    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC)
 7155:      {
 7156:      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
 7157:      }
 7158: 
 7159:    /* .* means "start at start or after \n" if it isn't in brackets that
 7160:    may be referenced. */
 7161: 
 7162:    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
 7163:      {
 7164:      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
 7165:      }
 7166: 
 7167:    /* Check for explicit circumflex */
 7168: 
 7169:    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
 7170: 
 7171:    /* Move on to the next alternative */
 7172: 
 7173:    code += GET(code, 1);
 7174:    }
 7175: while (*code == OP_ALT);  /* Loop for each alternative */
 7176: return TRUE;
 7177: }
 7178: 
 7179: 
 7180: 
 7181: /*************************************************
 7182: *       Check for asserted fixed first char      *
 7183: *************************************************/
 7184: 
 7185: /* During compilation, the "first char" settings from forward assertions are
 7186: discarded, because they can cause conflicts with actual literals that follow.
 7187: However, if we end up without a first char setting for an unanchored pattern,
 7188: it is worth scanning the regex to see if there is an initial asserted first
 7189: char. If all branches start with the same asserted char, or with a bracket all
 7190: of whose alternatives start with the same asserted char (recurse ad lib), then
 7191: we return that char, otherwise -1.
 7192: 
 7193: Arguments:
 7194:   code       points to start of expression (the bracket)
 7195:   inassert   TRUE if in an assertion
 7196: 
 7197: Returns:     -1 or the fixed first char
 7198: */
 7199: 
 7200: static int
 7201: find_firstassertedchar(const uschar *code, BOOL inassert)
 7202: {
 7203: register int c = -1;
 7204: do {
 7205:    int d;
 7206:    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
 7207:              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? 2:0;
 7208:    const uschar *scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
 7209:    register int op = *scode;
 7210: 
 7211:    switch(op)
 7212:      {
 7213:      default:
 7214:      return -1;
 7215: 
 7216:      case OP_BRA:
 7217:      case OP_BRAPOS:
 7218:      case OP_CBRA:
 7219:      case OP_SCBRA:
 7220:      case OP_CBRAPOS:
 7221:      case OP_SCBRAPOS:
 7222:      case OP_ASSERT:
 7223:      case OP_ONCE:
 7224:      case OP_ONCE_NC:
 7225:      case OP_COND:
 7226:      if ((d = find_firstassertedchar(scode, op == OP_ASSERT)) < 0)
 7227:        return -1;
 7228:      if (c < 0) c = d; else if (c != d) return -1;
 7229:      break;
 7230: 
 7231:      case OP_EXACT:
 7232:      scode += 2;
 7233:      /* Fall through */
 7234: 
 7235:      case OP_CHAR:
 7236:      case OP_PLUS:
 7237:      case OP_MINPLUS:
 7238:      case OP_POSPLUS:
 7239:      if (!inassert) return -1;
 7240:      if (c < 0) c = scode[1];
 7241:        else if (c != scode[1]) return -1;
 7242:      break;
 7243: 
 7244:      case OP_EXACTI:
 7245:      scode += 2;
 7246:      /* Fall through */
 7247: 
 7248:      case OP_CHARI:
 7249:      case OP_PLUSI:
 7250:      case OP_MINPLUSI:
 7251:      case OP_POSPLUSI:
 7252:      if (!inassert) return -1;
 7253:      if (c < 0) c = scode[1] | REQ_CASELESS;
 7254:        else if (c != scode[1]) return -1;
 7255:      break;
 7256:      }
 7257: 
 7258:    code += GET(code, 1);
 7259:    }
 7260: while (*code == OP_ALT);
 7261: return c;
 7262: }
 7263: 
 7264: 
 7265: 
 7266: /*************************************************
 7267: *        Compile a Regular Expression            *
 7268: *************************************************/
 7269: 
 7270: /* This function takes a string and returns a pointer to a block of store
 7271: holding a compiled version of the expression. The original API for this
 7272: function had no error code return variable; it is retained for backwards
 7273: compatibility. The new function is given a new name.
 7274: 
 7275: Arguments:
 7276:   pattern       the regular expression
 7277:   options       various option bits
 7278:   errorcodeptr  pointer to error code variable (pcre_compile2() only)
 7279:                   can be NULL if you don't want a code value
 7280:   errorptr      pointer to pointer to error text
 7281:   erroroffset   ptr offset in pattern where error was detected
 7282:   tables        pointer to character tables or NULL
 7283: 
 7284: Returns:        pointer to compiled data block, or NULL on error,
 7285:                 with errorptr and erroroffset set
 7286: */
 7287: 
 7288: PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
 7289: pcre_compile(const char *pattern, int options, const char **errorptr,
 7290:   int *erroroffset, const unsigned char *tables)
 7291: {
 7292: return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
 7293: }
 7294: 
 7295: 
 7296: PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
 7297: pcre_compile2(const char *pattern, int options, int *errorcodeptr,
 7298:   const char **errorptr, int *erroroffset, const unsigned char *tables)
 7299: {
 7300: real_pcre *re;
 7301: int length = 1;  /* For final END opcode */
 7302: int firstbyte, reqbyte, newline;
 7303: int errorcode = 0;
 7304: int skipatstart = 0;
 7305: BOOL utf8;
 7306: size_t size;
 7307: uschar *code;
 7308: const uschar *codestart;
 7309: const uschar *ptr;
 7310: compile_data compile_block;
 7311: compile_data *cd = &compile_block;
 7312: 
 7313: /* This space is used for "compiling" into during the first phase, when we are
 7314: computing the amount of memory that is needed. Compiled items are thrown away
 7315: as soon as possible, so that a fairly large buffer should be sufficient for
 7316: this purpose. The same space is used in the second phase for remembering where
 7317: to fill in forward references to subpatterns. That may overflow, in which case
 7318: new memory is obtained from malloc(). */
 7319: 
 7320: uschar cworkspace[COMPILE_WORK_SIZE];
 7321: 
 7322: /* Set this early so that early errors get offset 0. */
 7323: 
 7324: ptr = (const uschar *)pattern;
 7325: 
 7326: /* We can't pass back an error message if errorptr is NULL; I guess the best we
 7327: can do is just return NULL, but we can set a code value if there is a code
 7328: pointer. */
 7329: 
 7330: if (errorptr == NULL)
 7331:   {
 7332:   if (errorcodeptr != NULL) *errorcodeptr = 99;
 7333:   return NULL;
 7334:   }
 7335: 
 7336: *errorptr = NULL;
 7337: if (errorcodeptr != NULL) *errorcodeptr = ERR0;
 7338: 
 7339: /* However, we can give a message for this error */
 7340: 
 7341: if (erroroffset == NULL)
 7342:   {
 7343:   errorcode = ERR16;
 7344:   goto PCRE_EARLY_ERROR_RETURN2;
 7345:   }
 7346: 
 7347: *erroroffset = 0;
 7348: 
 7349: /* Set up pointers to the individual character tables */
 7350: 
 7351: if (tables == NULL) tables = _pcre_default_tables;
 7352: cd->lcc = tables + lcc_offset;
 7353: cd->fcc = tables + fcc_offset;
 7354: cd->cbits = tables + cbits_offset;
 7355: cd->ctypes = tables + ctypes_offset;
 7356: 
 7357: /* Check that all undefined public option bits are zero */
 7358: 
 7359: if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
 7360:   {
 7361:   errorcode = ERR17;
 7362:   goto PCRE_EARLY_ERROR_RETURN;
 7363:   }
 7364: 
 7365: /* Check for global one-time settings at the start of the pattern, and remember
 7366: the offset for later. */
 7367: 
 7368: while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
 7369:        ptr[skipatstart+1] == CHAR_ASTERISK)
 7370:   {
 7371:   int newnl = 0;
 7372:   int newbsr = 0;
 7373: 
 7374:   if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
 7375:     { skipatstart += 7; options |= PCRE_UTF8; continue; }
 7376:   else if (strncmp((char *)(ptr+skipatstart+2), STRING_UCP_RIGHTPAR, 4) == 0)
 7377:     { skipatstart += 6; options |= PCRE_UCP; continue; }
 7378:   else if (strncmp((char *)(ptr+skipatstart+2), STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
 7379:     { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
 7380: 
 7381:   if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
 7382:     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
 7383:   else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3)  == 0)
 7384:     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
 7385:   else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5)  == 0)
 7386:     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
 7387:   else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0)
 7388:     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
 7389:   else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0)
 7390:     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
 7391: 
 7392:   else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
 7393:     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
 7394:   else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
 7395:     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
 7396: 
 7397:   if (newnl != 0)
 7398:     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
 7399:   else if (newbsr != 0)
 7400:     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
 7401:   else break;
 7402:   }
 7403: 
 7404: utf8 = (options & PCRE_UTF8) != 0;
 7405: 
 7406: /* Can't support UTF8 unless PCRE has been compiled to include the code. The
 7407: return of an error code from _pcre_valid_utf8() is a new feature, introduced in
 7408: release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
 7409: not used here. */
 7410: 
 7411: #ifdef SUPPORT_UTF8
 7412: if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
 7413:      (errorcode = _pcre_valid_utf8((USPTR)pattern, -1, erroroffset)) != 0)
 7414:   {
 7415:   errorcode = ERR44;
 7416:   goto PCRE_EARLY_ERROR_RETURN2;
 7417:   }
 7418: #else
 7419: if (utf8)
 7420:   {
 7421:   errorcode = ERR32;
 7422:   goto PCRE_EARLY_ERROR_RETURN;
 7423:   }
 7424: #endif
 7425: 
 7426: /* Can't support UCP unless PCRE has been compiled to include the code. */
 7427: 
 7428: #ifndef SUPPORT_UCP
 7429: if ((options & PCRE_UCP) != 0)
 7430:   {
 7431:   errorcode = ERR67;
 7432:   goto PCRE_EARLY_ERROR_RETURN;
 7433:   }
 7434: #endif
 7435: 
 7436: /* Check validity of \R options. */
 7437: 
 7438: if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
 7439:      (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
 7440:   {
 7441:   errorcode = ERR56;
 7442:   goto PCRE_EARLY_ERROR_RETURN;
 7443:   }
 7444: 
 7445: /* Handle different types of newline. The three bits give seven cases. The
 7446: current code allows for fixed one- or two-byte sequences, plus "any" and
 7447: "anycrlf". */
 7448: 
 7449: switch (options & PCRE_NEWLINE_BITS)
 7450:   {
 7451:   case 0: newline = NEWLINE; break;   /* Build-time default */
 7452:   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
 7453:   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
 7454:   case PCRE_NEWLINE_CR+
 7455:        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
 7456:   case PCRE_NEWLINE_ANY: newline = -1; break;
 7457:   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
 7458:   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
 7459:   }
 7460: 
 7461: if (newline == -2)
 7462:   {
 7463:   cd->nltype = NLTYPE_ANYCRLF;
 7464:   }
 7465: else if (newline < 0)
 7466:   {
 7467:   cd->nltype = NLTYPE_ANY;
 7468:   }
 7469: else
 7470:   {
 7471:   cd->nltype = NLTYPE_FIXED;
 7472:   if (newline > 255)
 7473:     {
 7474:     cd->nllen = 2;
 7475:     cd->nl[0] = (newline >> 8) & 255;
 7476:     cd->nl[1] = newline & 255;
 7477:     }
 7478:   else
 7479:     {
 7480:     cd->nllen = 1;
 7481:     cd->nl[0] = newline;
 7482:     }
 7483:   }
 7484: 
 7485: /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
 7486: references to help in deciding whether (.*) can be treated as anchored or not.
 7487: */
 7488: 
 7489: cd->top_backref = 0;
 7490: cd->backref_map = 0;
 7491: 
 7492: /* Reflect pattern for debugging output */
 7493: 
 7494: DPRINTF(("------------------------------------------------------------------\n"));
 7495: DPRINTF(("%s\n", pattern));
 7496: 
 7497: /* Pretend to compile the pattern while actually just accumulating the length
 7498: of memory required. This behaviour is triggered by passing a non-NULL final
 7499: argument to compile_regex(). We pass a block of workspace (cworkspace) for it
 7500: to compile parts of the pattern into; the compiled code is discarded when it is
 7501: no longer needed, so hopefully this workspace will never overflow, though there
 7502: is a test for its doing so. */
 7503: 
 7504: cd->bracount = cd->final_bracount = 0;
 7505: cd->names_found = 0;
 7506: cd->name_entry_size = 0;
 7507: cd->name_table = NULL;
 7508: cd->start_code = cworkspace;
 7509: cd->hwm = cworkspace;
 7510: cd->start_workspace = cworkspace;
 7511: cd->workspace_size = COMPILE_WORK_SIZE;
 7512: cd->start_pattern = (const uschar *)pattern;
 7513: cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
 7514: cd->req_varyopt = 0;
 7515: cd->external_options = options;
 7516: cd->external_flags = 0;
 7517: cd->open_caps = NULL;
 7518: 
 7519: /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
 7520: don't need to look at the result of the function here. The initial options have
 7521: been put into the cd block so that they can be changed if an option setting is
 7522: found within the regex right at the beginning. Bringing initial option settings
 7523: outside can help speed up starting point checks. */
 7524: 
 7525: ptr += skipatstart;
 7526: code = cworkspace;
 7527: *code = OP_BRA;
 7528: (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
 7529:   FALSE, 0, 0, &firstbyte, &reqbyte, NULL, cd, &length);
 7530: if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
 7531: 
 7532: DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
 7533:   cd->hwm - cworkspace));
 7534: 
 7535: if (length > MAX_PATTERN_SIZE)
 7536:   {
 7537:   errorcode = ERR20;
 7538:   goto PCRE_EARLY_ERROR_RETURN;
 7539:   }
 7540: 
 7541: /* Compute the size of data block needed and get it, either from malloc or
 7542: externally provided function. Integer overflow should no longer be possible
 7543: because nowadays we limit the maximum value of cd->names_found and
 7544: cd->name_entry_size. */
 7545: 
 7546: size = length + sizeof(real_pcre) + cd->names_found * cd->name_entry_size;
 7547: re = (real_pcre *)(pcre_malloc)(size);
 7548: 
 7549: if (re == NULL)
 7550:   {
 7551:   errorcode = ERR21;
 7552:   goto PCRE_EARLY_ERROR_RETURN;
 7553:   }
 7554: 
 7555: /* Put in the magic number, and save the sizes, initial options, internal
 7556: flags, and character table pointer. NULL is used for the default character
 7557: tables. The nullpad field is at the end; it's there to help in the case when a
 7558: regex compiled on a system with 4-byte pointers is run on another with 8-byte
 7559: pointers. */
 7560: 
 7561: re->magic_number = MAGIC_NUMBER;
 7562: re->size = (int)size;
 7563: re->options = cd->external_options;
 7564: re->flags = cd->external_flags;
 7565: re->dummy1 = 0;
 7566: re->first_byte = 0;
 7567: re->req_byte = 0;
 7568: re->name_table_offset = sizeof(real_pcre);
 7569: re->name_entry_size = cd->name_entry_size;
 7570: re->name_count = cd->names_found;
 7571: re->ref_count = 0;
 7572: re->tables = (tables == _pcre_default_tables)? NULL : tables;
 7573: re->nullpad = NULL;
 7574: 
 7575: /* The starting points of the name/number translation table and of the code are
 7576: passed around in the compile data block. The start/end pattern and initial
 7577: options are already set from the pre-compile phase, as is the name_entry_size
 7578: field. Reset the bracket count and the names_found field. Also reset the hwm
 7579: field; this time it's used for remembering forward references to subpatterns.
 7580: */
 7581: 
 7582: cd->final_bracount = cd->bracount;  /* Save for checking forward references */
 7583: cd->assert_depth = 0;
 7584: cd->bracount = 0;
 7585: cd->names_found = 0;
 7586: cd->name_table = (uschar *)re + re->name_table_offset;
 7587: codestart = cd->name_table + re->name_entry_size * re->name_count;
 7588: cd->start_code = codestart;
 7589: cd->hwm = (uschar *)(cd->start_workspace);
 7590: cd->req_varyopt = 0;
 7591: cd->had_accept = FALSE;
 7592: cd->check_lookbehind = FALSE;
 7593: cd->open_caps = NULL;
 7594: 
 7595: /* Set up a starting, non-extracting bracket, then compile the expression. On
 7596: error, errorcode will be set non-zero, so we don't need to look at the result
 7597: of the function here. */
 7598: 
 7599: ptr = (const uschar *)pattern + skipatstart;
 7600: code = (uschar *)codestart;
 7601: *code = OP_BRA;
 7602: (void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
 7603:   &firstbyte, &reqbyte, NULL, cd, NULL);
 7604: re->top_bracket = cd->bracount;
 7605: re->top_backref = cd->top_backref;
 7606: re->flags = cd->external_flags;
 7607: 
 7608: if (cd->had_accept) reqbyte = REQ_NONE;   /* Must disable after (*ACCEPT) */
 7609: 
 7610: /* If not reached end of pattern on success, there's an excess bracket. */
 7611: 
 7612: if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
 7613: 
 7614: /* Fill in the terminating state and check for disastrous overflow, but
 7615: if debugging, leave the test till after things are printed out. */
 7616: 
 7617: *code++ = OP_END;
 7618: 
 7619: #ifndef PCRE_DEBUG
 7620: if (code - codestart > length) errorcode = ERR23;
 7621: #endif
 7622: 
 7623: /* Fill in any forward references that are required. There may be repeated
 7624: references; optimize for them, as searching a large regex takes time. */
 7625: 
 7626: if (cd->hwm > cd->start_workspace)
 7627:   {
 7628:   int prev_recno = -1;
 7629:   const uschar *groupptr = NULL;
 7630:   while (errorcode == 0 && cd->hwm > cd->start_workspace)
 7631:     {
 7632:     int offset, recno;
 7633:     cd->hwm -= LINK_SIZE;
 7634:     offset = GET(cd->hwm, 0);
 7635:     recno = GET(codestart, offset);
 7636:     if (recno != prev_recno)
 7637:       {
 7638:       groupptr = _pcre_find_bracket(codestart, utf8, recno);
 7639:       prev_recno = recno;
 7640:       }
 7641:     if (groupptr == NULL) errorcode = ERR53;
 7642:       else PUT(((uschar *)codestart), offset, (int)(groupptr - codestart));
 7643:     }
 7644:   }
 7645: 
 7646: /* If the workspace had to be expanded, free the new memory. */
 7647: 
 7648: if (cd->workspace_size > COMPILE_WORK_SIZE)
 7649:   (pcre_free)((void *)cd->start_workspace);
 7650: 
 7651: /* Give an error if there's back reference to a non-existent capturing
 7652: subpattern. */
 7653: 
 7654: if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
 7655: 
 7656: /* If there were any lookbehind assertions that contained OP_RECURSE
 7657: (recursions or subroutine calls), a flag is set for them to be checked here,
 7658: because they may contain forward references. Actual recursions can't be fixed
 7659: length, but subroutine calls can. It is done like this so that those without
 7660: OP_RECURSE that are not fixed length get a diagnosic with a useful offset. The
 7661: exceptional ones forgo this. We scan the pattern to check that they are fixed
 7662: length, and set their lengths. */
 7663: 
 7664: if (cd->check_lookbehind)
 7665:   {
 7666:   uschar *cc = (uschar *)codestart;
 7667: 
 7668:   /* Loop, searching for OP_REVERSE items, and process those that do not have
 7669:   their length set. (Actually, it will also re-process any that have a length
 7670:   of zero, but that is a pathological case, and it does no harm.) When we find
 7671:   one, we temporarily terminate the branch it is in while we scan it. */
 7672: 
 7673:   for (cc = (uschar *)_pcre_find_bracket(codestart, utf8, -1);
 7674:        cc != NULL;
 7675:        cc = (uschar *)_pcre_find_bracket(cc, utf8, -1))
 7676:     {
 7677:     if (GET(cc, 1) == 0)
 7678:       {
 7679:       int fixed_length;
 7680:       uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
 7681:       int end_op = *be;
 7682:       *be = OP_END;
 7683:       fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
 7684:         cd);
 7685:       *be = end_op;
 7686:       DPRINTF(("fixed length = %d\n", fixed_length));
 7687:       if (fixed_length < 0)
 7688:         {
 7689:         errorcode = (fixed_length == -2)? ERR36 :
 7690:                     (fixed_length == -4)? ERR70 : ERR25;
 7691:         break;
 7692:         }
 7693:       PUT(cc, 1, fixed_length);
 7694:       }
 7695:     cc += 1 + LINK_SIZE;
 7696:     }
 7697:   }
 7698: 
 7699: /* Failed to compile, or error while post-processing */
 7700: 
 7701: if (errorcode != 0)
 7702:   {
 7703:   (pcre_free)(re);
 7704:   PCRE_EARLY_ERROR_RETURN:
 7705:   *erroroffset = (int)(ptr - (const uschar *)pattern);
 7706:   PCRE_EARLY_ERROR_RETURN2:
 7707:   *errorptr = find_error_text(errorcode);
 7708:   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
 7709:   return NULL;
 7710:   }
 7711: 
 7712: /* If the anchored option was not passed, set the flag if we can determine that
 7713: the pattern is anchored by virtue of ^ characters or \A or anything else (such
 7714: as starting with .* when DOTALL is set).
 7715: 
 7716: Otherwise, if we know what the first byte has to be, save it, because that
 7717: speeds up unanchored matches no end. If not, see if we can set the
 7718: PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
 7719: start with ^. and also when all branches start with .* for non-DOTALL matches.
 7720: */
 7721: 
 7722: if ((re->options & PCRE_ANCHORED) == 0)
 7723:   {
 7724:   if (is_anchored(codestart, 0, cd->backref_map))
 7725:     re->options |= PCRE_ANCHORED;
 7726:   else
 7727:     {
 7728:     if (firstbyte < 0)
 7729:       firstbyte = find_firstassertedchar(codestart, FALSE);
 7730:     if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
 7731:       {
 7732:       int ch = firstbyte & 255;
 7733:       re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
 7734:          cd->fcc[ch] == ch)? ch : firstbyte;
 7735:       re->flags |= PCRE_FIRSTSET;
 7736:       }
 7737:     else if (is_startline(codestart, 0, cd->backref_map))
 7738:       re->flags |= PCRE_STARTLINE;
 7739:     }
 7740:   }
 7741: 
 7742: /* For an anchored pattern, we use the "required byte" only if it follows a
 7743: variable length item in the regex. Remove the caseless flag for non-caseable
 7744: bytes. */
 7745: 
 7746: if (reqbyte >= 0 &&
 7747:      ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
 7748:   {
 7749:   int ch = reqbyte & 255;
 7750:   re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
 7751:     cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
 7752:   re->flags |= PCRE_REQCHSET;
 7753:   }
 7754: 
 7755: /* Print out the compiled data if debugging is enabled. This is never the
 7756: case when building a production library. */
 7757: 
 7758: #ifdef PCRE_DEBUG
 7759: printf("Length = %d top_bracket = %d top_backref = %d\n",
 7760:   length, re->top_bracket, re->top_backref);
 7761: 
 7762: printf("Options=%08x\n", re->options);
 7763: 
 7764: if ((re->flags & PCRE_FIRSTSET) != 0)
 7765:   {
 7766:   int ch = re->first_byte & 255;
 7767:   const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
 7768:     "" : " (caseless)";
 7769:   if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
 7770:     else printf("First char = \\x%02x%s\n", ch, caseless);
 7771:   }
 7772: 
 7773: if ((re->flags & PCRE_REQCHSET) != 0)
 7774:   {
 7775:   int ch = re->req_byte & 255;
 7776:   const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
 7777:     "" : " (caseless)";
 7778:   if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
 7779:     else printf("Req char = \\x%02x%s\n", ch, caseless);
 7780:   }
 7781: 
 7782: pcre_printint(re, stdout, TRUE);
 7783: 
 7784: /* This check is done here in the debugging case so that the code that
 7785: was compiled can be seen. */
 7786: 
 7787: if (code - codestart > length)
 7788:   {
 7789:   (pcre_free)(re);
 7790:   *errorptr = find_error_text(ERR23);
 7791:   *erroroffset = ptr - (uschar *)pattern;
 7792:   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
 7793:   return NULL;
 7794:   }
 7795: #endif   /* PCRE_DEBUG */
 7796: 
 7797: return (pcre *)re;
 7798: }
 7799: 
 7800: /* End of pcre_compile.c */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>