Annotation of embedaddon/php/ext/pcre/pcrelib/pcre_study.c, revision 1.1.1.1

1.1       misho       1: /*************************************************
                      2: *      Perl-Compatible Regular Expressions       *
                      3: *************************************************/
                      4: 
                      5: /* PCRE is a library of functions to support regular expressions whose syntax
                      6: and semantics are as close as possible to those of the Perl 5 language.
                      7: 
                      8:                        Written by Philip Hazel
                      9:            Copyright (c) 1997-2010 University of Cambridge
                     10: 
                     11: -----------------------------------------------------------------------------
                     12: Redistribution and use in source and binary forms, with or without
                     13: modification, are permitted provided that the following conditions are met:
                     14: 
                     15:     * Redistributions of source code must retain the above copyright notice,
                     16:       this list of conditions and the following disclaimer.
                     17: 
                     18:     * Redistributions in binary form must reproduce the above copyright
                     19:       notice, this list of conditions and the following disclaimer in the
                     20:       documentation and/or other materials provided with the distribution.
                     21: 
                     22:     * Neither the name of the University of Cambridge nor the names of its
                     23:       contributors may be used to endorse or promote products derived from
                     24:       this software without specific prior written permission.
                     25: 
                     26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
                     27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
                     28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
                     29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
                     30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
                     31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
                     32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
                     33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
                     34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
                     35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
                     36: POSSIBILITY OF SUCH DAMAGE.
                     37: -----------------------------------------------------------------------------
                     38: */
                     39: 
                     40: 
                     41: /* This module contains the external function pcre_study(), along with local
                     42: supporting functions. */
                     43: 
                     44: 
                     45: #include "config.h"
                     46: 
                     47: #include "pcre_internal.h"
                     48: 
                     49: #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
                     50: 
                     51: /* Returns from set_start_bits() */
                     52: 
                     53: enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
                     54: 
                     55: 
                     56: 
                     57: /*************************************************
                     58: *   Find the minimum subject length for a group  *
                     59: *************************************************/
                     60: 
                     61: /* Scan a parenthesized group and compute the minimum length of subject that
                     62: is needed to match it. This is a lower bound; it does not mean there is a
                     63: string of that length that matches. In UTF8 mode, the result is in characters
                     64: rather than bytes.
                     65: 
                     66: Arguments:
                     67:   code       pointer to start of group (the bracket)
                     68:   startcode  pointer to start of the whole pattern
                     69:   options    the compiling options
                     70: 
                     71: Returns:   the minimum length
                     72:            -1 if \C was encountered
                     73:            -2 internal error (missing capturing bracket)
                     74: */
                     75: 
                     76: static int
                     77: find_minlength(const uschar *code, const uschar *startcode, int options)
                     78: {
                     79: int length = -1;
                     80: BOOL utf8 = (options & PCRE_UTF8) != 0;
                     81: BOOL had_recurse = FALSE;
                     82: register int branchlength = 0;
                     83: register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
                     84: 
                     85: if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
                     86: 
                     87: /* Scan along the opcodes for this branch. If we get to the end of the
                     88: branch, check the length against that of the other branches. */
                     89: 
                     90: for (;;)
                     91:   {
                     92:   int d, min;
                     93:   uschar *cs, *ce;
                     94:   register int op = *cc;
                     95: 
                     96:   switch (op)
                     97:     {
                     98:     case OP_COND:
                     99:     case OP_SCOND:
                    100: 
                    101:     /* If there is only one branch in a condition, the implied branch has zero
                    102:     length, so we don't add anything. This covers the DEFINE "condition"
                    103:     automatically. */
                    104: 
                    105:     cs = cc + GET(cc, 1);
                    106:     if (*cs != OP_ALT)
                    107:       {
                    108:       cc = cs + 1 + LINK_SIZE;
                    109:       break;
                    110:       }
                    111: 
                    112:     /* Otherwise we can fall through and treat it the same as any other
                    113:     subpattern. */
                    114: 
                    115:     case OP_CBRA:
                    116:     case OP_SCBRA:
                    117:     case OP_BRA:
                    118:     case OP_SBRA:
                    119:     case OP_ONCE:
                    120:     d = find_minlength(cc, startcode, options);
                    121:     if (d < 0) return d;
                    122:     branchlength += d;
                    123:     do cc += GET(cc, 1); while (*cc == OP_ALT);
                    124:     cc += 1 + LINK_SIZE;
                    125:     break;
                    126: 
                    127:     /* Reached end of a branch; if it's a ket it is the end of a nested
                    128:     call. If it's ALT it is an alternation in a nested call. If it is
                    129:     END it's the end of the outer call. All can be handled by the same code. */
                    130: 
                    131:     case OP_ALT:
                    132:     case OP_KET:
                    133:     case OP_KETRMAX:
                    134:     case OP_KETRMIN:
                    135:     case OP_END:
                    136:     if (length < 0 || (!had_recurse && branchlength < length))
                    137:       length = branchlength;
                    138:     if (*cc != OP_ALT) return length;
                    139:     cc += 1 + LINK_SIZE;
                    140:     branchlength = 0;
                    141:     had_recurse = FALSE;
                    142:     break;
                    143: 
                    144:     /* Skip over assertive subpatterns */
                    145: 
                    146:     case OP_ASSERT:
                    147:     case OP_ASSERT_NOT:
                    148:     case OP_ASSERTBACK:
                    149:     case OP_ASSERTBACK_NOT:
                    150:     do cc += GET(cc, 1); while (*cc == OP_ALT);
                    151:     /* Fall through */
                    152: 
                    153:     /* Skip over things that don't match chars */
                    154: 
                    155:     case OP_REVERSE:
                    156:     case OP_CREF:
                    157:     case OP_NCREF:
                    158:     case OP_RREF:
                    159:     case OP_NRREF:
                    160:     case OP_DEF:
                    161:     case OP_OPT:
                    162:     case OP_CALLOUT:
                    163:     case OP_SOD:
                    164:     case OP_SOM:
                    165:     case OP_EOD:
                    166:     case OP_EODN:
                    167:     case OP_CIRC:
                    168:     case OP_DOLL:
                    169:     case OP_NOT_WORD_BOUNDARY:
                    170:     case OP_WORD_BOUNDARY:
                    171:     cc += _pcre_OP_lengths[*cc];
                    172:     break;
                    173: 
                    174:     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
                    175: 
                    176:     case OP_BRAZERO:
                    177:     case OP_BRAMINZERO:
                    178:     case OP_SKIPZERO:
                    179:     cc += _pcre_OP_lengths[*cc];
                    180:     do cc += GET(cc, 1); while (*cc == OP_ALT);
                    181:     cc += 1 + LINK_SIZE;
                    182:     break;
                    183: 
                    184:     /* Handle literal characters and + repetitions */
                    185: 
                    186:     case OP_CHAR:
                    187:     case OP_CHARNC:
                    188:     case OP_NOT:
                    189:     case OP_PLUS:
                    190:     case OP_MINPLUS:
                    191:     case OP_POSPLUS:
                    192:     case OP_NOTPLUS:
                    193:     case OP_NOTMINPLUS:
                    194:     case OP_NOTPOSPLUS:
                    195:     branchlength++;
                    196:     cc += 2;
                    197: #ifdef SUPPORT_UTF8
                    198:     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
                    199: #endif
                    200:     break;
                    201: 
                    202:     case OP_TYPEPLUS:
                    203:     case OP_TYPEMINPLUS:
                    204:     case OP_TYPEPOSPLUS:
                    205:     branchlength++;
                    206:     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
                    207:     break;
                    208: 
                    209:     /* Handle exact repetitions. The count is already in characters, but we
                    210:     need to skip over a multibyte character in UTF8 mode.  */
                    211: 
                    212:     case OP_EXACT:
                    213:     case OP_NOTEXACT:
                    214:     branchlength += GET2(cc,1);
                    215:     cc += 4;
                    216: #ifdef SUPPORT_UTF8
                    217:     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
                    218: #endif
                    219:     break;
                    220: 
                    221:     case OP_TYPEEXACT:
                    222:     branchlength += GET2(cc,1);
                    223:     cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
                    224:     break;
                    225: 
                    226:     /* Handle single-char non-literal matchers */
                    227: 
                    228:     case OP_PROP:
                    229:     case OP_NOTPROP:
                    230:     cc += 2;
                    231:     /* Fall through */
                    232: 
                    233:     case OP_NOT_DIGIT:
                    234:     case OP_DIGIT:
                    235:     case OP_NOT_WHITESPACE:
                    236:     case OP_WHITESPACE:
                    237:     case OP_NOT_WORDCHAR:
                    238:     case OP_WORDCHAR:
                    239:     case OP_ANY:
                    240:     case OP_ALLANY:
                    241:     case OP_EXTUNI:
                    242:     case OP_HSPACE:
                    243:     case OP_NOT_HSPACE:
                    244:     case OP_VSPACE:
                    245:     case OP_NOT_VSPACE:
                    246:     branchlength++;
                    247:     cc++;
                    248:     break;
                    249: 
                    250:     /* "Any newline" might match two characters */
                    251: 
                    252:     case OP_ANYNL:
                    253:     branchlength += 2;
                    254:     cc++;
                    255:     break;
                    256: 
                    257:     /* The single-byte matcher means we can't proceed in UTF-8 mode */
                    258: 
                    259:     case OP_ANYBYTE:
                    260: #ifdef SUPPORT_UTF8
                    261:     if (utf8) return -1;
                    262: #endif
                    263:     branchlength++;
                    264:     cc++;
                    265:     break;
                    266: 
                    267:     /* For repeated character types, we have to test for \p and \P, which have
                    268:     an extra two bytes of parameters. */
                    269: 
                    270:     case OP_TYPESTAR:
                    271:     case OP_TYPEMINSTAR:
                    272:     case OP_TYPEQUERY:
                    273:     case OP_TYPEMINQUERY:
                    274:     case OP_TYPEPOSSTAR:
                    275:     case OP_TYPEPOSQUERY:
                    276:     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
                    277:     cc += _pcre_OP_lengths[op];
                    278:     break;
                    279: 
                    280:     case OP_TYPEUPTO:
                    281:     case OP_TYPEMINUPTO:
                    282:     case OP_TYPEPOSUPTO:
                    283:     if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
                    284:     cc += _pcre_OP_lengths[op];
                    285:     break;
                    286: 
                    287:     /* Check a class for variable quantification */
                    288: 
                    289: #ifdef SUPPORT_UTF8
                    290:     case OP_XCLASS:
                    291:     cc += GET(cc, 1) - 33;
                    292:     /* Fall through */
                    293: #endif
                    294: 
                    295:     case OP_CLASS:
                    296:     case OP_NCLASS:
                    297:     cc += 33;
                    298: 
                    299:     switch (*cc)
                    300:       {
                    301:       case OP_CRPLUS:
                    302:       case OP_CRMINPLUS:
                    303:       branchlength++;
                    304:       /* Fall through */
                    305: 
                    306:       case OP_CRSTAR:
                    307:       case OP_CRMINSTAR:
                    308:       case OP_CRQUERY:
                    309:       case OP_CRMINQUERY:
                    310:       cc++;
                    311:       break;
                    312: 
                    313:       case OP_CRRANGE:
                    314:       case OP_CRMINRANGE:
                    315:       branchlength += GET2(cc,1);
                    316:       cc += 5;
                    317:       break;
                    318: 
                    319:       default:
                    320:       branchlength++;
                    321:       break;
                    322:       }
                    323:     break;
                    324: 
                    325:     /* Backreferences and subroutine calls are treated in the same way: we find
                    326:     the minimum length for the subpattern. A recursion, however, causes an
                    327:     a flag to be set that causes the length of this branch to be ignored. The
                    328:     logic is that a recursion can only make sense if there is another
                    329:     alternation that stops the recursing. That will provide the minimum length
                    330:     (when no recursion happens). A backreference within the group that it is
                    331:     referencing behaves in the same way.
                    332: 
                    333:     If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
                    334:     matches an empty string (by default it causes a matching failure), so in
                    335:     that case we must set the minimum length to zero. */
                    336: 
                    337:     case OP_REF:
                    338:     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
                    339:       {
                    340:       ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
                    341:       if (cs == NULL) return -2;
                    342:       do ce += GET(ce, 1); while (*ce == OP_ALT);
                    343:       if (cc > cs && cc < ce)
                    344:         {
                    345:         d = 0;
                    346:         had_recurse = TRUE;
                    347:         }
                    348:       else d = find_minlength(cs, startcode, options);
                    349:       }
                    350:     else d = 0;
                    351:     cc += 3;
                    352: 
                    353:     /* Handle repeated back references */
                    354: 
                    355:     switch (*cc)
                    356:       {
                    357:       case OP_CRSTAR:
                    358:       case OP_CRMINSTAR:
                    359:       case OP_CRQUERY:
                    360:       case OP_CRMINQUERY:
                    361:       min = 0;
                    362:       cc++;
                    363:       break;
                    364: 
                    365:       case OP_CRRANGE:
                    366:       case OP_CRMINRANGE:
                    367:       min = GET2(cc, 1);
                    368:       cc += 5;
                    369:       break;
                    370: 
                    371:       default:
                    372:       min = 1;
                    373:       break;
                    374:       }
                    375: 
                    376:     branchlength += min * d;
                    377:     break;
                    378: 
                    379:     case OP_RECURSE:
                    380:     cs = ce = (uschar *)startcode + GET(cc, 1);
                    381:     if (cs == NULL) return -2;
                    382:     do ce += GET(ce, 1); while (*ce == OP_ALT);
                    383:     if (cc > cs && cc < ce)
                    384:       had_recurse = TRUE;
                    385:     else
                    386:       branchlength += find_minlength(cs, startcode, options);
                    387:     cc += 1 + LINK_SIZE;
                    388:     break;
                    389: 
                    390:     /* Anything else does not or need not match a character. We can get the
                    391:     item's length from the table, but for those that can match zero occurrences
                    392:     of a character, we must take special action for UTF-8 characters. */
                    393: 
                    394:     case OP_UPTO:
                    395:     case OP_NOTUPTO:
                    396:     case OP_MINUPTO:
                    397:     case OP_NOTMINUPTO:
                    398:     case OP_POSUPTO:
                    399:     case OP_STAR:
                    400:     case OP_MINSTAR:
                    401:     case OP_NOTMINSTAR:
                    402:     case OP_POSSTAR:
                    403:     case OP_NOTPOSSTAR:
                    404:     case OP_QUERY:
                    405:     case OP_MINQUERY:
                    406:     case OP_NOTMINQUERY:
                    407:     case OP_POSQUERY:
                    408:     case OP_NOTPOSQUERY:
                    409:     cc += _pcre_OP_lengths[op];
                    410: #ifdef SUPPORT_UTF8
                    411:     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
                    412: #endif
                    413:     break;
                    414: 
                    415:     /* Skip these, but we need to add in the name length. */
                    416: 
                    417:     case OP_MARK:
                    418:     case OP_PRUNE_ARG:
                    419:     case OP_SKIP_ARG:
                    420:     cc += _pcre_OP_lengths[op] + cc[1];
                    421:     break;
                    422: 
                    423:     case OP_THEN_ARG:
                    424:     cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
                    425:     break;
                    426: 
                    427:     /* For the record, these are the opcodes that are matched by "default":
                    428:     OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
                    429:     OP_THEN. */
                    430: 
                    431:     default:
                    432:     cc += _pcre_OP_lengths[op];
                    433:     break;
                    434:     }
                    435:   }
                    436: /* Control never gets here */
                    437: }
                    438: 
                    439: 
                    440: 
                    441: /*************************************************
                    442: *      Set a bit and maybe its alternate case    *
                    443: *************************************************/
                    444: 
                    445: /* Given a character, set its first byte's bit in the table, and also the
                    446: corresponding bit for the other version of a letter if we are caseless. In
                    447: UTF-8 mode, for characters greater than 127, we can only do the caseless thing
                    448: when Unicode property support is available.
                    449: 
                    450: Arguments:
                    451:   start_bits    points to the bit map
                    452:   p             points to the character
                    453:   caseless      the caseless flag
                    454:   cd            the block with char table pointers
                    455:   utf8          TRUE for UTF-8 mode
                    456: 
                    457: Returns:        pointer after the character
                    458: */
                    459: 
                    460: static const uschar *
                    461: set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
                    462:   compile_data *cd, BOOL utf8)
                    463: {
                    464: unsigned int c = *p;
                    465: 
                    466: SET_BIT(c);
                    467: 
                    468: #ifdef SUPPORT_UTF8
                    469: if (utf8 && c > 127)
                    470:   {
                    471:   GETCHARINC(c, p);
                    472: #ifdef SUPPORT_UCP
                    473:   if (caseless)
                    474:     {
                    475:     uschar buff[8];
                    476:     c = UCD_OTHERCASE(c);
                    477:     (void)_pcre_ord2utf8(c, buff);
                    478:     SET_BIT(buff[0]);
                    479:     }
                    480: #endif
                    481:   return p;
                    482:   }
                    483: #endif
                    484: 
                    485: /* Not UTF-8 mode, or character is less than 127. */
                    486: 
                    487: if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
                    488: return p + 1;
                    489: }
                    490: 
                    491: 
                    492: 
                    493: /*************************************************
                    494: *     Set bits for a positive character type     *
                    495: *************************************************/
                    496: 
                    497: /* This function sets starting bits for a character type. In UTF-8 mode, we can
                    498: only do a direct setting for bytes less than 128, as otherwise there can be
                    499: confusion with bytes in the middle of UTF-8 characters. In a "traditional"
                    500: environment, the tables will only recognize ASCII characters anyway, but in at
                    501: least one Windows environment, some higher bytes bits were set in the tables.
                    502: So we deal with that case by considering the UTF-8 encoding.
                    503: 
                    504: Arguments:
                    505:   start_bits     the starting bitmap
                    506:   cbit type      the type of character wanted
                    507:   table_limit    32 for non-UTF-8; 16 for UTF-8
                    508:   cd             the block with char table pointers
                    509: 
                    510: Returns:         nothing
                    511: */
                    512: 
                    513: static void
                    514: set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
                    515:   compile_data *cd)
                    516: {
                    517: register int c;
                    518: for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
                    519: if (table_limit == 32) return;
                    520: for (c = 128; c < 256; c++)
                    521:   {
                    522:   if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
                    523:     {
                    524:     uschar buff[8];
                    525:     (void)_pcre_ord2utf8(c, buff);
                    526:     SET_BIT(buff[0]);
                    527:     }
                    528:   }
                    529: }
                    530: 
                    531: 
                    532: /*************************************************
                    533: *     Set bits for a negative character type     *
                    534: *************************************************/
                    535: 
                    536: /* This function sets starting bits for a negative character type such as \D.
                    537: In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
                    538: otherwise there can be confusion with bytes in the middle of UTF-8 characters.
                    539: Unlike in the positive case, where we can set appropriate starting bits for
                    540: specific high-valued UTF-8 characters, in this case we have to set the bits for
                    541: all high-valued characters. The lowest is 0xc2, but we overkill by starting at
                    542: 0xc0 (192) for simplicity.
                    543: 
                    544: Arguments:
                    545:   start_bits     the starting bitmap
                    546:   cbit type      the type of character wanted
                    547:   table_limit    32 for non-UTF-8; 16 for UTF-8
                    548:   cd             the block with char table pointers
                    549: 
                    550: Returns:         nothing
                    551: */
                    552: 
                    553: static void
                    554: set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
                    555:   compile_data *cd)
                    556: {
                    557: register int c;
                    558: for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
                    559: if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
                    560: }
                    561: 
                    562: 
                    563: 
                    564: /*************************************************
                    565: *          Create bitmap of starting bytes       *
                    566: *************************************************/
                    567: 
                    568: /* This function scans a compiled unanchored expression recursively and
                    569: attempts to build a bitmap of the set of possible starting bytes. As time goes
                    570: by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
                    571: useful for parenthesized groups in patterns such as (a*)b where the group
                    572: provides some optional starting bytes but scanning must continue at the outer
                    573: level to find at least one mandatory byte. At the outermost level, this
                    574: function fails unless the result is SSB_DONE.
                    575: 
                    576: Arguments:
                    577:   code         points to an expression
                    578:   start_bits   points to a 32-byte table, initialized to 0
                    579:   caseless     the current state of the caseless flag
                    580:   utf8         TRUE if in UTF-8 mode
                    581:   cd           the block with char table pointers
                    582: 
                    583: Returns:       SSB_FAIL     => Failed to find any starting bytes
                    584:                SSB_DONE     => Found mandatory starting bytes
                    585:                SSB_CONTINUE => Found optional starting bytes
                    586: */
                    587: 
                    588: static int
                    589: set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
                    590:   BOOL utf8, compile_data *cd)
                    591: {
                    592: register int c;
                    593: int yield = SSB_DONE;
                    594: int table_limit = utf8? 16:32;
                    595: 
                    596: #if 0
                    597: /* ========================================================================= */
                    598: /* The following comment and code was inserted in January 1999. In May 2006,
                    599: when it was observed to cause compiler warnings about unused values, I took it
                    600: out again. If anybody is still using OS/2, they will have to put it back
                    601: manually. */
                    602: 
                    603: /* This next statement and the later reference to dummy are here in order to
                    604: trick the optimizer of the IBM C compiler for OS/2 into generating correct
                    605: code. Apparently IBM isn't going to fix the problem, and we would rather not
                    606: disable optimization (in this module it actually makes a big difference, and
                    607: the pcre module can use all the optimization it can get). */
                    608: 
                    609: volatile int dummy;
                    610: /* ========================================================================= */
                    611: #endif
                    612: 
                    613: do
                    614:   {
                    615:   const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
                    616:   BOOL try_next = TRUE;
                    617: 
                    618:   while (try_next)    /* Loop for items in this branch */
                    619:     {
                    620:     int rc;
                    621:     switch(*tcode)
                    622:       {
                    623:       /* Fail if we reach something we don't understand */
                    624: 
                    625:       default:
                    626:       return SSB_FAIL;
                    627: 
                    628:       /* If we hit a bracket or a positive lookahead assertion, recurse to set
                    629:       bits from within the subpattern. If it can't find anything, we have to
                    630:       give up. If it finds some mandatory character(s), we are done for this
                    631:       branch. Otherwise, carry on scanning after the subpattern. */
                    632: 
                    633:       case OP_BRA:
                    634:       case OP_SBRA:
                    635:       case OP_CBRA:
                    636:       case OP_SCBRA:
                    637:       case OP_ONCE:
                    638:       case OP_ASSERT:
                    639:       rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
                    640:       if (rc == SSB_FAIL) return SSB_FAIL;
                    641:       if (rc == SSB_DONE) try_next = FALSE; else
                    642:         {
                    643:         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
                    644:         tcode += 1 + LINK_SIZE;
                    645:         }
                    646:       break;
                    647: 
                    648:       /* If we hit ALT or KET, it means we haven't found anything mandatory in
                    649:       this branch, though we might have found something optional. For ALT, we
                    650:       continue with the next alternative, but we have to arrange that the final
                    651:       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
                    652:       return SSB_CONTINUE: if this is the top level, that indicates failure,
                    653:       but after a nested subpattern, it causes scanning to continue. */
                    654: 
                    655:       case OP_ALT:
                    656:       yield = SSB_CONTINUE;
                    657:       try_next = FALSE;
                    658:       break;
                    659: 
                    660:       case OP_KET:
                    661:       case OP_KETRMAX:
                    662:       case OP_KETRMIN:
                    663:       return SSB_CONTINUE;
                    664: 
                    665:       /* Skip over callout */
                    666: 
                    667:       case OP_CALLOUT:
                    668:       tcode += 2 + 2*LINK_SIZE;
                    669:       break;
                    670: 
                    671:       /* Skip over lookbehind and negative lookahead assertions */
                    672: 
                    673:       case OP_ASSERT_NOT:
                    674:       case OP_ASSERTBACK:
                    675:       case OP_ASSERTBACK_NOT:
                    676:       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
                    677:       tcode += 1 + LINK_SIZE;
                    678:       break;
                    679: 
                    680:       /* Skip over an option setting, changing the caseless flag */
                    681: 
                    682:       case OP_OPT:
                    683:       caseless = (tcode[1] & PCRE_CASELESS) != 0;
                    684:       tcode += 2;
                    685:       break;
                    686: 
                    687:       /* BRAZERO does the bracket, but carries on. */
                    688: 
                    689:       case OP_BRAZERO:
                    690:       case OP_BRAMINZERO:
                    691:       if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
                    692:         return SSB_FAIL;
                    693: /* =========================================================================
                    694:       See the comment at the head of this function concerning the next line,
                    695:       which was an old fudge for the benefit of OS/2.
                    696:       dummy = 1;
                    697:   ========================================================================= */
                    698:       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
                    699:       tcode += 1 + LINK_SIZE;
                    700:       break;
                    701: 
                    702:       /* SKIPZERO skips the bracket. */
                    703: 
                    704:       case OP_SKIPZERO:
                    705:       tcode++;
                    706:       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
                    707:       tcode += 1 + LINK_SIZE;
                    708:       break;
                    709: 
                    710:       /* Single-char * or ? sets the bit and tries the next item */
                    711: 
                    712:       case OP_STAR:
                    713:       case OP_MINSTAR:
                    714:       case OP_POSSTAR:
                    715:       case OP_QUERY:
                    716:       case OP_MINQUERY:
                    717:       case OP_POSQUERY:
                    718:       tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
                    719:       break;
                    720: 
                    721:       /* Single-char upto sets the bit and tries the next */
                    722: 
                    723:       case OP_UPTO:
                    724:       case OP_MINUPTO:
                    725:       case OP_POSUPTO:
                    726:       tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
                    727:       break;
                    728: 
                    729:       /* At least one single char sets the bit and stops */
                    730: 
                    731:       case OP_EXACT:       /* Fall through */
                    732:       tcode += 2;
                    733: 
                    734:       case OP_CHAR:
                    735:       case OP_CHARNC:
                    736:       case OP_PLUS:
                    737:       case OP_MINPLUS:
                    738:       case OP_POSPLUS:
                    739:       (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
                    740:       try_next = FALSE;
                    741:       break;
                    742: 
                    743:       /* Special spacing and line-terminating items. These recognize specific
                    744:       lists of characters. The difference between VSPACE and ANYNL is that the
                    745:       latter can match the two-character CRLF sequence, but that is not
                    746:       relevant for finding the first character, so their code here is
                    747:       identical. */
                    748: 
                    749:       case OP_HSPACE:
                    750:       SET_BIT(0x09);
                    751:       SET_BIT(0x20);
                    752:       if (utf8)
                    753:         {
                    754:         SET_BIT(0xC2);  /* For U+00A0 */
                    755:         SET_BIT(0xE1);  /* For U+1680, U+180E */
                    756:         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
                    757:         SET_BIT(0xE3);  /* For U+3000 */
                    758:         }
                    759:       else SET_BIT(0xA0);
                    760:       try_next = FALSE;
                    761:       break;
                    762: 
                    763:       case OP_ANYNL:
                    764:       case OP_VSPACE:
                    765:       SET_BIT(0x0A);
                    766:       SET_BIT(0x0B);
                    767:       SET_BIT(0x0C);
                    768:       SET_BIT(0x0D);
                    769:       if (utf8)
                    770:         {
                    771:         SET_BIT(0xC2);  /* For U+0085 */
                    772:         SET_BIT(0xE2);  /* For U+2028, U+2029 */
                    773:         }
                    774:       else SET_BIT(0x85);
                    775:       try_next = FALSE;
                    776:       break;
                    777: 
                    778:       /* Single character types set the bits and stop. Note that if PCRE_UCP
                    779:       is set, we do not see these op codes because \d etc are converted to
                    780:       properties. Therefore, these apply in the case when only characters less
                    781:       than 256 are recognized to match the types. */
                    782: 
                    783:       case OP_NOT_DIGIT:
                    784:       set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
                    785:       try_next = FALSE;
                    786:       break;
                    787: 
                    788:       case OP_DIGIT:
                    789:       set_type_bits(start_bits, cbit_digit, table_limit, cd);
                    790:       try_next = FALSE;
                    791:       break;
                    792: 
                    793:       /* The cbit_space table has vertical tab as whitespace; we have to
                    794:       ensure it is set as not whitespace. */
                    795: 
                    796:       case OP_NOT_WHITESPACE:
                    797:       set_nottype_bits(start_bits, cbit_space, table_limit, cd);
                    798:       start_bits[1] |= 0x08;
                    799:       try_next = FALSE;
                    800:       break;
                    801: 
                    802:       /* The cbit_space table has vertical tab as whitespace; we have to
                    803:       not set it from the table. */
                    804: 
                    805:       case OP_WHITESPACE:
                    806:       c = start_bits[1];    /* Save in case it was already set */
                    807:       set_type_bits(start_bits, cbit_space, table_limit, cd);
                    808:       start_bits[1] = (start_bits[1] & ~0x08) | c;
                    809:       try_next = FALSE;
                    810:       break;
                    811: 
                    812:       case OP_NOT_WORDCHAR:
                    813:       set_nottype_bits(start_bits, cbit_word, table_limit, cd);
                    814:       try_next = FALSE;
                    815:       break;
                    816: 
                    817:       case OP_WORDCHAR:
                    818:       set_type_bits(start_bits, cbit_word, table_limit, cd);
                    819:       try_next = FALSE;
                    820:       break;
                    821: 
                    822:       /* One or more character type fudges the pointer and restarts, knowing
                    823:       it will hit a single character type and stop there. */
                    824: 
                    825:       case OP_TYPEPLUS:
                    826:       case OP_TYPEMINPLUS:
                    827:       case OP_TYPEPOSPLUS:
                    828:       tcode++;
                    829:       break;
                    830: 
                    831:       case OP_TYPEEXACT:
                    832:       tcode += 3;
                    833:       break;
                    834: 
                    835:       /* Zero or more repeats of character types set the bits and then
                    836:       try again. */
                    837: 
                    838:       case OP_TYPEUPTO:
                    839:       case OP_TYPEMINUPTO:
                    840:       case OP_TYPEPOSUPTO:
                    841:       tcode += 2;               /* Fall through */
                    842: 
                    843:       case OP_TYPESTAR:
                    844:       case OP_TYPEMINSTAR:
                    845:       case OP_TYPEPOSSTAR:
                    846:       case OP_TYPEQUERY:
                    847:       case OP_TYPEMINQUERY:
                    848:       case OP_TYPEPOSQUERY:
                    849:       switch(tcode[1])
                    850:         {
                    851:         default:
                    852:         case OP_ANY:
                    853:         case OP_ALLANY:
                    854:         return SSB_FAIL;
                    855: 
                    856:         case OP_HSPACE:
                    857:         SET_BIT(0x09);
                    858:         SET_BIT(0x20);
                    859:         if (utf8)
                    860:           {
                    861:           SET_BIT(0xC2);  /* For U+00A0 */
                    862:           SET_BIT(0xE1);  /* For U+1680, U+180E */
                    863:           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
                    864:           SET_BIT(0xE3);  /* For U+3000 */
                    865:           }
                    866:         else SET_BIT(0xA0);
                    867:         break;
                    868: 
                    869:         case OP_ANYNL:
                    870:         case OP_VSPACE:
                    871:         SET_BIT(0x0A);
                    872:         SET_BIT(0x0B);
                    873:         SET_BIT(0x0C);
                    874:         SET_BIT(0x0D);
                    875:         if (utf8)
                    876:           {
                    877:           SET_BIT(0xC2);  /* For U+0085 */
                    878:           SET_BIT(0xE2);  /* For U+2028, U+2029 */
                    879:           }
                    880:         else SET_BIT(0x85);
                    881:         break;
                    882: 
                    883:         case OP_NOT_DIGIT:
                    884:         set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
                    885:         break;
                    886: 
                    887:         case OP_DIGIT:
                    888:         set_type_bits(start_bits, cbit_digit, table_limit, cd);
                    889:         break;
                    890: 
                    891:         /* The cbit_space table has vertical tab as whitespace; we have to
                    892:         ensure it gets set as not whitespace. */
                    893: 
                    894:         case OP_NOT_WHITESPACE:
                    895:         set_nottype_bits(start_bits, cbit_space, table_limit, cd);
                    896:         start_bits[1] |= 0x08;
                    897:         break;
                    898: 
                    899:         /* The cbit_space table has vertical tab as whitespace; we have to
                    900:         avoid setting it. */
                    901: 
                    902:         case OP_WHITESPACE:
                    903:         c = start_bits[1];    /* Save in case it was already set */
                    904:         set_type_bits(start_bits, cbit_space, table_limit, cd);
                    905:         start_bits[1] = (start_bits[1] & ~0x08) | c;
                    906:         break;
                    907: 
                    908:         case OP_NOT_WORDCHAR:
                    909:         set_nottype_bits(start_bits, cbit_word, table_limit, cd);
                    910:         break;
                    911: 
                    912:         case OP_WORDCHAR:
                    913:         set_type_bits(start_bits, cbit_word, table_limit, cd);
                    914:         break;
                    915:         }
                    916: 
                    917:       tcode += 2;
                    918:       break;
                    919: 
                    920:       /* Character class where all the information is in a bit map: set the
                    921:       bits and either carry on or not, according to the repeat count. If it was
                    922:       a negative class, and we are operating with UTF-8 characters, any byte
                    923:       with a value >= 0xc4 is a potentially valid starter because it starts a
                    924:       character with a value > 255. */
                    925: 
                    926:       case OP_NCLASS:
                    927: #ifdef SUPPORT_UTF8
                    928:       if (utf8)
                    929:         {
                    930:         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
                    931:         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
                    932:         }
                    933: #endif
                    934:       /* Fall through */
                    935: 
                    936:       case OP_CLASS:
                    937:         {
                    938:         tcode++;
                    939: 
                    940:         /* In UTF-8 mode, the bits in a bit map correspond to character
                    941:         values, not to byte values. However, the bit map we are constructing is
                    942:         for byte values. So we have to do a conversion for characters whose
                    943:         value is > 127. In fact, there are only two possible starting bytes for
                    944:         characters in the range 128 - 255. */
                    945: 
                    946: #ifdef SUPPORT_UTF8
                    947:         if (utf8)
                    948:           {
                    949:           for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
                    950:           for (c = 128; c < 256; c++)
                    951:             {
                    952:             if ((tcode[c/8] && (1 << (c&7))) != 0)
                    953:               {
                    954:               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
                    955:               start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
                    956:               c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
                    957:               }
                    958:             }
                    959:           }
                    960: 
                    961:         /* In non-UTF-8 mode, the two bit maps are completely compatible. */
                    962: 
                    963:         else
                    964: #endif
                    965:           {
                    966:           for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
                    967:           }
                    968: 
                    969:         /* Advance past the bit map, and act on what follows */
                    970: 
                    971:         tcode += 32;
                    972:         switch (*tcode)
                    973:           {
                    974:           case OP_CRSTAR:
                    975:           case OP_CRMINSTAR:
                    976:           case OP_CRQUERY:
                    977:           case OP_CRMINQUERY:
                    978:           tcode++;
                    979:           break;
                    980: 
                    981:           case OP_CRRANGE:
                    982:           case OP_CRMINRANGE:
                    983:           if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
                    984:             else try_next = FALSE;
                    985:           break;
                    986: 
                    987:           default:
                    988:           try_next = FALSE;
                    989:           break;
                    990:           }
                    991:         }
                    992:       break; /* End of bitmap class handling */
                    993: 
                    994:       }      /* End of switch */
                    995:     }        /* End of try_next loop */
                    996: 
                    997:   code += GET(code, 1);   /* Advance to next branch */
                    998:   }
                    999: while (*code == OP_ALT);
                   1000: return yield;
                   1001: }
                   1002: 
                   1003: 
                   1004: 
                   1005: /*************************************************
                   1006: *          Study a compiled expression           *
                   1007: *************************************************/
                   1008: 
                   1009: /* This function is handed a compiled expression that it must study to produce
                   1010: information that will speed up the matching. It returns a pcre_extra block
                   1011: which then gets handed back to pcre_exec().
                   1012: 
                   1013: Arguments:
                   1014:   re        points to the compiled expression
                   1015:   options   contains option bits
                   1016:   errorptr  points to where to place error messages;
                   1017:             set NULL unless error
                   1018: 
                   1019: Returns:    pointer to a pcre_extra block, with study_data filled in and the
                   1020:               appropriate flags set;
                   1021:             NULL on error or if no optimization possible
                   1022: */
                   1023: 
                   1024: PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
                   1025: pcre_study(const pcre *external_re, int options, const char **errorptr)
                   1026: {
                   1027: int min;
                   1028: BOOL bits_set = FALSE;
                   1029: uschar start_bits[32];
                   1030: pcre_extra *extra;
                   1031: pcre_study_data *study;
                   1032: const uschar *tables;
                   1033: uschar *code;
                   1034: compile_data compile_block;
                   1035: const real_pcre *re = (const real_pcre *)external_re;
                   1036: 
                   1037: *errorptr = NULL;
                   1038: 
                   1039: if (re == NULL || re->magic_number != MAGIC_NUMBER)
                   1040:   {
                   1041:   *errorptr = "argument is not a compiled regular expression";
                   1042:   return NULL;
                   1043:   }
                   1044: 
                   1045: if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
                   1046:   {
                   1047:   *errorptr = "unknown or incorrect option bit(s) set";
                   1048:   return NULL;
                   1049:   }
                   1050: 
                   1051: code = (uschar *)re + re->name_table_offset +
                   1052:   (re->name_count * re->name_entry_size);
                   1053: 
                   1054: /* For an anchored pattern, or an unanchored pattern that has a first char, or
                   1055: a multiline pattern that matches only at "line starts", there is no point in
                   1056: seeking a list of starting bytes. */
                   1057: 
                   1058: if ((re->options & PCRE_ANCHORED) == 0 &&
                   1059:     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
                   1060:   {
                   1061:   /* Set the character tables in the block that is passed around */
                   1062: 
                   1063:   tables = re->tables;
                   1064:   if (tables == NULL)
                   1065:     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
                   1066:     (void *)(&tables));
                   1067: 
                   1068:   compile_block.lcc = tables + lcc_offset;
                   1069:   compile_block.fcc = tables + fcc_offset;
                   1070:   compile_block.cbits = tables + cbits_offset;
                   1071:   compile_block.ctypes = tables + ctypes_offset;
                   1072: 
                   1073:   /* See if we can find a fixed set of initial characters for the pattern. */
                   1074: 
                   1075:   memset(start_bits, 0, 32 * sizeof(uschar));
                   1076:   bits_set = set_start_bits(code, start_bits,
                   1077:     (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
                   1078:     &compile_block) == SSB_DONE;
                   1079:   }
                   1080: 
                   1081: /* Find the minimum length of subject string. */
                   1082: 
                   1083: min = find_minlength(code, code, re->options);
                   1084: 
                   1085: /* Return NULL if no optimization is possible. */
                   1086: 
                   1087: if (!bits_set && min < 0) return NULL;
                   1088: 
                   1089: /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
                   1090: the latter, which is pointed to by the former, which may also get additional
                   1091: data set later by the calling program. At the moment, the size of
                   1092: pcre_study_data is fixed. We nevertheless save it in a field for returning via
                   1093: the pcre_fullinfo() function so that if it becomes variable in the future, we
                   1094: don't have to change that code. */
                   1095: 
                   1096: extra = (pcre_extra *)(pcre_malloc)
                   1097:   (sizeof(pcre_extra) + sizeof(pcre_study_data));
                   1098: 
                   1099: if (extra == NULL)
                   1100:   {
                   1101:   *errorptr = "failed to get memory";
                   1102:   return NULL;
                   1103:   }
                   1104: 
                   1105: study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
                   1106: extra->flags = PCRE_EXTRA_STUDY_DATA;
                   1107: extra->study_data = study;
                   1108: 
                   1109: study->size = sizeof(pcre_study_data);
                   1110: study->flags = 0;
                   1111: 
                   1112: if (bits_set)
                   1113:   {
                   1114:   study->flags |= PCRE_STUDY_MAPPED;
                   1115:   memcpy(study->start_bits, start_bits, sizeof(start_bits));
                   1116:   }
                   1117: 
                   1118: if (min >= 0)
                   1119:   {
                   1120:   study->flags |= PCRE_STUDY_MINLEN;
                   1121:   study->minlength = min;
                   1122:   }
                   1123: 
                   1124: return extra;
                   1125: }
                   1126: 
                   1127: /* End of pcre_study.c */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>