Annotation of embedaddon/php/ext/pcre/pcrelib/pcre_study.c, revision 1.1

1.1     ! misho       1: /*************************************************
        !             2: *      Perl-Compatible Regular Expressions       *
        !             3: *************************************************/
        !             4: 
        !             5: /* PCRE is a library of functions to support regular expressions whose syntax
        !             6: and semantics are as close as possible to those of the Perl 5 language.
        !             7: 
        !             8:                        Written by Philip Hazel
        !             9:            Copyright (c) 1997-2010 University of Cambridge
        !            10: 
        !            11: -----------------------------------------------------------------------------
        !            12: Redistribution and use in source and binary forms, with or without
        !            13: modification, are permitted provided that the following conditions are met:
        !            14: 
        !            15:     * Redistributions of source code must retain the above copyright notice,
        !            16:       this list of conditions and the following disclaimer.
        !            17: 
        !            18:     * Redistributions in binary form must reproduce the above copyright
        !            19:       notice, this list of conditions and the following disclaimer in the
        !            20:       documentation and/or other materials provided with the distribution.
        !            21: 
        !            22:     * Neither the name of the University of Cambridge nor the names of its
        !            23:       contributors may be used to endorse or promote products derived from
        !            24:       this software without specific prior written permission.
        !            25: 
        !            26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
        !            27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
        !            28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
        !            29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
        !            30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
        !            31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
        !            32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
        !            33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
        !            34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
        !            35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
        !            36: POSSIBILITY OF SUCH DAMAGE.
        !            37: -----------------------------------------------------------------------------
        !            38: */
        !            39: 
        !            40: 
        !            41: /* This module contains the external function pcre_study(), along with local
        !            42: supporting functions. */
        !            43: 
        !            44: 
        !            45: #include "config.h"
        !            46: 
        !            47: #include "pcre_internal.h"
        !            48: 
        !            49: #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
        !            50: 
        !            51: /* Returns from set_start_bits() */
        !            52: 
        !            53: enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
        !            54: 
        !            55: 
        !            56: 
        !            57: /*************************************************
        !            58: *   Find the minimum subject length for a group  *
        !            59: *************************************************/
        !            60: 
        !            61: /* Scan a parenthesized group and compute the minimum length of subject that
        !            62: is needed to match it. This is a lower bound; it does not mean there is a
        !            63: string of that length that matches. In UTF8 mode, the result is in characters
        !            64: rather than bytes.
        !            65: 
        !            66: Arguments:
        !            67:   code       pointer to start of group (the bracket)
        !            68:   startcode  pointer to start of the whole pattern
        !            69:   options    the compiling options
        !            70: 
        !            71: Returns:   the minimum length
        !            72:            -1 if \C was encountered
        !            73:            -2 internal error (missing capturing bracket)
        !            74: */
        !            75: 
        !            76: static int
        !            77: find_minlength(const uschar *code, const uschar *startcode, int options)
        !            78: {
        !            79: int length = -1;
        !            80: BOOL utf8 = (options & PCRE_UTF8) != 0;
        !            81: BOOL had_recurse = FALSE;
        !            82: register int branchlength = 0;
        !            83: register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
        !            84: 
        !            85: if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
        !            86: 
        !            87: /* Scan along the opcodes for this branch. If we get to the end of the
        !            88: branch, check the length against that of the other branches. */
        !            89: 
        !            90: for (;;)
        !            91:   {
        !            92:   int d, min;
        !            93:   uschar *cs, *ce;
        !            94:   register int op = *cc;
        !            95: 
        !            96:   switch (op)
        !            97:     {
        !            98:     case OP_COND:
        !            99:     case OP_SCOND:
        !           100: 
        !           101:     /* If there is only one branch in a condition, the implied branch has zero
        !           102:     length, so we don't add anything. This covers the DEFINE "condition"
        !           103:     automatically. */
        !           104: 
        !           105:     cs = cc + GET(cc, 1);
        !           106:     if (*cs != OP_ALT)
        !           107:       {
        !           108:       cc = cs + 1 + LINK_SIZE;
        !           109:       break;
        !           110:       }
        !           111: 
        !           112:     /* Otherwise we can fall through and treat it the same as any other
        !           113:     subpattern. */
        !           114: 
        !           115:     case OP_CBRA:
        !           116:     case OP_SCBRA:
        !           117:     case OP_BRA:
        !           118:     case OP_SBRA:
        !           119:     case OP_ONCE:
        !           120:     d = find_minlength(cc, startcode, options);
        !           121:     if (d < 0) return d;
        !           122:     branchlength += d;
        !           123:     do cc += GET(cc, 1); while (*cc == OP_ALT);
        !           124:     cc += 1 + LINK_SIZE;
        !           125:     break;
        !           126: 
        !           127:     /* Reached end of a branch; if it's a ket it is the end of a nested
        !           128:     call. If it's ALT it is an alternation in a nested call. If it is
        !           129:     END it's the end of the outer call. All can be handled by the same code. */
        !           130: 
        !           131:     case OP_ALT:
        !           132:     case OP_KET:
        !           133:     case OP_KETRMAX:
        !           134:     case OP_KETRMIN:
        !           135:     case OP_END:
        !           136:     if (length < 0 || (!had_recurse && branchlength < length))
        !           137:       length = branchlength;
        !           138:     if (*cc != OP_ALT) return length;
        !           139:     cc += 1 + LINK_SIZE;
        !           140:     branchlength = 0;
        !           141:     had_recurse = FALSE;
        !           142:     break;
        !           143: 
        !           144:     /* Skip over assertive subpatterns */
        !           145: 
        !           146:     case OP_ASSERT:
        !           147:     case OP_ASSERT_NOT:
        !           148:     case OP_ASSERTBACK:
        !           149:     case OP_ASSERTBACK_NOT:
        !           150:     do cc += GET(cc, 1); while (*cc == OP_ALT);
        !           151:     /* Fall through */
        !           152: 
        !           153:     /* Skip over things that don't match chars */
        !           154: 
        !           155:     case OP_REVERSE:
        !           156:     case OP_CREF:
        !           157:     case OP_NCREF:
        !           158:     case OP_RREF:
        !           159:     case OP_NRREF:
        !           160:     case OP_DEF:
        !           161:     case OP_OPT:
        !           162:     case OP_CALLOUT:
        !           163:     case OP_SOD:
        !           164:     case OP_SOM:
        !           165:     case OP_EOD:
        !           166:     case OP_EODN:
        !           167:     case OP_CIRC:
        !           168:     case OP_DOLL:
        !           169:     case OP_NOT_WORD_BOUNDARY:
        !           170:     case OP_WORD_BOUNDARY:
        !           171:     cc += _pcre_OP_lengths[*cc];
        !           172:     break;
        !           173: 
        !           174:     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
        !           175: 
        !           176:     case OP_BRAZERO:
        !           177:     case OP_BRAMINZERO:
        !           178:     case OP_SKIPZERO:
        !           179:     cc += _pcre_OP_lengths[*cc];
        !           180:     do cc += GET(cc, 1); while (*cc == OP_ALT);
        !           181:     cc += 1 + LINK_SIZE;
        !           182:     break;
        !           183: 
        !           184:     /* Handle literal characters and + repetitions */
        !           185: 
        !           186:     case OP_CHAR:
        !           187:     case OP_CHARNC:
        !           188:     case OP_NOT:
        !           189:     case OP_PLUS:
        !           190:     case OP_MINPLUS:
        !           191:     case OP_POSPLUS:
        !           192:     case OP_NOTPLUS:
        !           193:     case OP_NOTMINPLUS:
        !           194:     case OP_NOTPOSPLUS:
        !           195:     branchlength++;
        !           196:     cc += 2;
        !           197: #ifdef SUPPORT_UTF8
        !           198:     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
        !           199: #endif
        !           200:     break;
        !           201: 
        !           202:     case OP_TYPEPLUS:
        !           203:     case OP_TYPEMINPLUS:
        !           204:     case OP_TYPEPOSPLUS:
        !           205:     branchlength++;
        !           206:     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
        !           207:     break;
        !           208: 
        !           209:     /* Handle exact repetitions. The count is already in characters, but we
        !           210:     need to skip over a multibyte character in UTF8 mode.  */
        !           211: 
        !           212:     case OP_EXACT:
        !           213:     case OP_NOTEXACT:
        !           214:     branchlength += GET2(cc,1);
        !           215:     cc += 4;
        !           216: #ifdef SUPPORT_UTF8
        !           217:     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
        !           218: #endif
        !           219:     break;
        !           220: 
        !           221:     case OP_TYPEEXACT:
        !           222:     branchlength += GET2(cc,1);
        !           223:     cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
        !           224:     break;
        !           225: 
        !           226:     /* Handle single-char non-literal matchers */
        !           227: 
        !           228:     case OP_PROP:
        !           229:     case OP_NOTPROP:
        !           230:     cc += 2;
        !           231:     /* Fall through */
        !           232: 
        !           233:     case OP_NOT_DIGIT:
        !           234:     case OP_DIGIT:
        !           235:     case OP_NOT_WHITESPACE:
        !           236:     case OP_WHITESPACE:
        !           237:     case OP_NOT_WORDCHAR:
        !           238:     case OP_WORDCHAR:
        !           239:     case OP_ANY:
        !           240:     case OP_ALLANY:
        !           241:     case OP_EXTUNI:
        !           242:     case OP_HSPACE:
        !           243:     case OP_NOT_HSPACE:
        !           244:     case OP_VSPACE:
        !           245:     case OP_NOT_VSPACE:
        !           246:     branchlength++;
        !           247:     cc++;
        !           248:     break;
        !           249: 
        !           250:     /* "Any newline" might match two characters */
        !           251: 
        !           252:     case OP_ANYNL:
        !           253:     branchlength += 2;
        !           254:     cc++;
        !           255:     break;
        !           256: 
        !           257:     /* The single-byte matcher means we can't proceed in UTF-8 mode */
        !           258: 
        !           259:     case OP_ANYBYTE:
        !           260: #ifdef SUPPORT_UTF8
        !           261:     if (utf8) return -1;
        !           262: #endif
        !           263:     branchlength++;
        !           264:     cc++;
        !           265:     break;
        !           266: 
        !           267:     /* For repeated character types, we have to test for \p and \P, which have
        !           268:     an extra two bytes of parameters. */
        !           269: 
        !           270:     case OP_TYPESTAR:
        !           271:     case OP_TYPEMINSTAR:
        !           272:     case OP_TYPEQUERY:
        !           273:     case OP_TYPEMINQUERY:
        !           274:     case OP_TYPEPOSSTAR:
        !           275:     case OP_TYPEPOSQUERY:
        !           276:     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
        !           277:     cc += _pcre_OP_lengths[op];
        !           278:     break;
        !           279: 
        !           280:     case OP_TYPEUPTO:
        !           281:     case OP_TYPEMINUPTO:
        !           282:     case OP_TYPEPOSUPTO:
        !           283:     if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
        !           284:     cc += _pcre_OP_lengths[op];
        !           285:     break;
        !           286: 
        !           287:     /* Check a class for variable quantification */
        !           288: 
        !           289: #ifdef SUPPORT_UTF8
        !           290:     case OP_XCLASS:
        !           291:     cc += GET(cc, 1) - 33;
        !           292:     /* Fall through */
        !           293: #endif
        !           294: 
        !           295:     case OP_CLASS:
        !           296:     case OP_NCLASS:
        !           297:     cc += 33;
        !           298: 
        !           299:     switch (*cc)
        !           300:       {
        !           301:       case OP_CRPLUS:
        !           302:       case OP_CRMINPLUS:
        !           303:       branchlength++;
        !           304:       /* Fall through */
        !           305: 
        !           306:       case OP_CRSTAR:
        !           307:       case OP_CRMINSTAR:
        !           308:       case OP_CRQUERY:
        !           309:       case OP_CRMINQUERY:
        !           310:       cc++;
        !           311:       break;
        !           312: 
        !           313:       case OP_CRRANGE:
        !           314:       case OP_CRMINRANGE:
        !           315:       branchlength += GET2(cc,1);
        !           316:       cc += 5;
        !           317:       break;
        !           318: 
        !           319:       default:
        !           320:       branchlength++;
        !           321:       break;
        !           322:       }
        !           323:     break;
        !           324: 
        !           325:     /* Backreferences and subroutine calls are treated in the same way: we find
        !           326:     the minimum length for the subpattern. A recursion, however, causes an
        !           327:     a flag to be set that causes the length of this branch to be ignored. The
        !           328:     logic is that a recursion can only make sense if there is another
        !           329:     alternation that stops the recursing. That will provide the minimum length
        !           330:     (when no recursion happens). A backreference within the group that it is
        !           331:     referencing behaves in the same way.
        !           332: 
        !           333:     If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
        !           334:     matches an empty string (by default it causes a matching failure), so in
        !           335:     that case we must set the minimum length to zero. */
        !           336: 
        !           337:     case OP_REF:
        !           338:     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
        !           339:       {
        !           340:       ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
        !           341:       if (cs == NULL) return -2;
        !           342:       do ce += GET(ce, 1); while (*ce == OP_ALT);
        !           343:       if (cc > cs && cc < ce)
        !           344:         {
        !           345:         d = 0;
        !           346:         had_recurse = TRUE;
        !           347:         }
        !           348:       else d = find_minlength(cs, startcode, options);
        !           349:       }
        !           350:     else d = 0;
        !           351:     cc += 3;
        !           352: 
        !           353:     /* Handle repeated back references */
        !           354: 
        !           355:     switch (*cc)
        !           356:       {
        !           357:       case OP_CRSTAR:
        !           358:       case OP_CRMINSTAR:
        !           359:       case OP_CRQUERY:
        !           360:       case OP_CRMINQUERY:
        !           361:       min = 0;
        !           362:       cc++;
        !           363:       break;
        !           364: 
        !           365:       case OP_CRRANGE:
        !           366:       case OP_CRMINRANGE:
        !           367:       min = GET2(cc, 1);
        !           368:       cc += 5;
        !           369:       break;
        !           370: 
        !           371:       default:
        !           372:       min = 1;
        !           373:       break;
        !           374:       }
        !           375: 
        !           376:     branchlength += min * d;
        !           377:     break;
        !           378: 
        !           379:     case OP_RECURSE:
        !           380:     cs = ce = (uschar *)startcode + GET(cc, 1);
        !           381:     if (cs == NULL) return -2;
        !           382:     do ce += GET(ce, 1); while (*ce == OP_ALT);
        !           383:     if (cc > cs && cc < ce)
        !           384:       had_recurse = TRUE;
        !           385:     else
        !           386:       branchlength += find_minlength(cs, startcode, options);
        !           387:     cc += 1 + LINK_SIZE;
        !           388:     break;
        !           389: 
        !           390:     /* Anything else does not or need not match a character. We can get the
        !           391:     item's length from the table, but for those that can match zero occurrences
        !           392:     of a character, we must take special action for UTF-8 characters. */
        !           393: 
        !           394:     case OP_UPTO:
        !           395:     case OP_NOTUPTO:
        !           396:     case OP_MINUPTO:
        !           397:     case OP_NOTMINUPTO:
        !           398:     case OP_POSUPTO:
        !           399:     case OP_STAR:
        !           400:     case OP_MINSTAR:
        !           401:     case OP_NOTMINSTAR:
        !           402:     case OP_POSSTAR:
        !           403:     case OP_NOTPOSSTAR:
        !           404:     case OP_QUERY:
        !           405:     case OP_MINQUERY:
        !           406:     case OP_NOTMINQUERY:
        !           407:     case OP_POSQUERY:
        !           408:     case OP_NOTPOSQUERY:
        !           409:     cc += _pcre_OP_lengths[op];
        !           410: #ifdef SUPPORT_UTF8
        !           411:     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
        !           412: #endif
        !           413:     break;
        !           414: 
        !           415:     /* Skip these, but we need to add in the name length. */
        !           416: 
        !           417:     case OP_MARK:
        !           418:     case OP_PRUNE_ARG:
        !           419:     case OP_SKIP_ARG:
        !           420:     cc += _pcre_OP_lengths[op] + cc[1];
        !           421:     break;
        !           422: 
        !           423:     case OP_THEN_ARG:
        !           424:     cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
        !           425:     break;
        !           426: 
        !           427:     /* For the record, these are the opcodes that are matched by "default":
        !           428:     OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
        !           429:     OP_THEN. */
        !           430: 
        !           431:     default:
        !           432:     cc += _pcre_OP_lengths[op];
        !           433:     break;
        !           434:     }
        !           435:   }
        !           436: /* Control never gets here */
        !           437: }
        !           438: 
        !           439: 
        !           440: 
        !           441: /*************************************************
        !           442: *      Set a bit and maybe its alternate case    *
        !           443: *************************************************/
        !           444: 
        !           445: /* Given a character, set its first byte's bit in the table, and also the
        !           446: corresponding bit for the other version of a letter if we are caseless. In
        !           447: UTF-8 mode, for characters greater than 127, we can only do the caseless thing
        !           448: when Unicode property support is available.
        !           449: 
        !           450: Arguments:
        !           451:   start_bits    points to the bit map
        !           452:   p             points to the character
        !           453:   caseless      the caseless flag
        !           454:   cd            the block with char table pointers
        !           455:   utf8          TRUE for UTF-8 mode
        !           456: 
        !           457: Returns:        pointer after the character
        !           458: */
        !           459: 
        !           460: static const uschar *
        !           461: set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
        !           462:   compile_data *cd, BOOL utf8)
        !           463: {
        !           464: unsigned int c = *p;
        !           465: 
        !           466: SET_BIT(c);
        !           467: 
        !           468: #ifdef SUPPORT_UTF8
        !           469: if (utf8 && c > 127)
        !           470:   {
        !           471:   GETCHARINC(c, p);
        !           472: #ifdef SUPPORT_UCP
        !           473:   if (caseless)
        !           474:     {
        !           475:     uschar buff[8];
        !           476:     c = UCD_OTHERCASE(c);
        !           477:     (void)_pcre_ord2utf8(c, buff);
        !           478:     SET_BIT(buff[0]);
        !           479:     }
        !           480: #endif
        !           481:   return p;
        !           482:   }
        !           483: #endif
        !           484: 
        !           485: /* Not UTF-8 mode, or character is less than 127. */
        !           486: 
        !           487: if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
        !           488: return p + 1;
        !           489: }
        !           490: 
        !           491: 
        !           492: 
        !           493: /*************************************************
        !           494: *     Set bits for a positive character type     *
        !           495: *************************************************/
        !           496: 
        !           497: /* This function sets starting bits for a character type. In UTF-8 mode, we can
        !           498: only do a direct setting for bytes less than 128, as otherwise there can be
        !           499: confusion with bytes in the middle of UTF-8 characters. In a "traditional"
        !           500: environment, the tables will only recognize ASCII characters anyway, but in at
        !           501: least one Windows environment, some higher bytes bits were set in the tables.
        !           502: So we deal with that case by considering the UTF-8 encoding.
        !           503: 
        !           504: Arguments:
        !           505:   start_bits     the starting bitmap
        !           506:   cbit type      the type of character wanted
        !           507:   table_limit    32 for non-UTF-8; 16 for UTF-8
        !           508:   cd             the block with char table pointers
        !           509: 
        !           510: Returns:         nothing
        !           511: */
        !           512: 
        !           513: static void
        !           514: set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
        !           515:   compile_data *cd)
        !           516: {
        !           517: register int c;
        !           518: for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
        !           519: if (table_limit == 32) return;
        !           520: for (c = 128; c < 256; c++)
        !           521:   {
        !           522:   if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
        !           523:     {
        !           524:     uschar buff[8];
        !           525:     (void)_pcre_ord2utf8(c, buff);
        !           526:     SET_BIT(buff[0]);
        !           527:     }
        !           528:   }
        !           529: }
        !           530: 
        !           531: 
        !           532: /*************************************************
        !           533: *     Set bits for a negative character type     *
        !           534: *************************************************/
        !           535: 
        !           536: /* This function sets starting bits for a negative character type such as \D.
        !           537: In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
        !           538: otherwise there can be confusion with bytes in the middle of UTF-8 characters.
        !           539: Unlike in the positive case, where we can set appropriate starting bits for
        !           540: specific high-valued UTF-8 characters, in this case we have to set the bits for
        !           541: all high-valued characters. The lowest is 0xc2, but we overkill by starting at
        !           542: 0xc0 (192) for simplicity.
        !           543: 
        !           544: Arguments:
        !           545:   start_bits     the starting bitmap
        !           546:   cbit type      the type of character wanted
        !           547:   table_limit    32 for non-UTF-8; 16 for UTF-8
        !           548:   cd             the block with char table pointers
        !           549: 
        !           550: Returns:         nothing
        !           551: */
        !           552: 
        !           553: static void
        !           554: set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
        !           555:   compile_data *cd)
        !           556: {
        !           557: register int c;
        !           558: for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
        !           559: if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
        !           560: }
        !           561: 
        !           562: 
        !           563: 
        !           564: /*************************************************
        !           565: *          Create bitmap of starting bytes       *
        !           566: *************************************************/
        !           567: 
        !           568: /* This function scans a compiled unanchored expression recursively and
        !           569: attempts to build a bitmap of the set of possible starting bytes. As time goes
        !           570: by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
        !           571: useful for parenthesized groups in patterns such as (a*)b where the group
        !           572: provides some optional starting bytes but scanning must continue at the outer
        !           573: level to find at least one mandatory byte. At the outermost level, this
        !           574: function fails unless the result is SSB_DONE.
        !           575: 
        !           576: Arguments:
        !           577:   code         points to an expression
        !           578:   start_bits   points to a 32-byte table, initialized to 0
        !           579:   caseless     the current state of the caseless flag
        !           580:   utf8         TRUE if in UTF-8 mode
        !           581:   cd           the block with char table pointers
        !           582: 
        !           583: Returns:       SSB_FAIL     => Failed to find any starting bytes
        !           584:                SSB_DONE     => Found mandatory starting bytes
        !           585:                SSB_CONTINUE => Found optional starting bytes
        !           586: */
        !           587: 
        !           588: static int
        !           589: set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
        !           590:   BOOL utf8, compile_data *cd)
        !           591: {
        !           592: register int c;
        !           593: int yield = SSB_DONE;
        !           594: int table_limit = utf8? 16:32;
        !           595: 
        !           596: #if 0
        !           597: /* ========================================================================= */
        !           598: /* The following comment and code was inserted in January 1999. In May 2006,
        !           599: when it was observed to cause compiler warnings about unused values, I took it
        !           600: out again. If anybody is still using OS/2, they will have to put it back
        !           601: manually. */
        !           602: 
        !           603: /* This next statement and the later reference to dummy are here in order to
        !           604: trick the optimizer of the IBM C compiler for OS/2 into generating correct
        !           605: code. Apparently IBM isn't going to fix the problem, and we would rather not
        !           606: disable optimization (in this module it actually makes a big difference, and
        !           607: the pcre module can use all the optimization it can get). */
        !           608: 
        !           609: volatile int dummy;
        !           610: /* ========================================================================= */
        !           611: #endif
        !           612: 
        !           613: do
        !           614:   {
        !           615:   const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
        !           616:   BOOL try_next = TRUE;
        !           617: 
        !           618:   while (try_next)    /* Loop for items in this branch */
        !           619:     {
        !           620:     int rc;
        !           621:     switch(*tcode)
        !           622:       {
        !           623:       /* Fail if we reach something we don't understand */
        !           624: 
        !           625:       default:
        !           626:       return SSB_FAIL;
        !           627: 
        !           628:       /* If we hit a bracket or a positive lookahead assertion, recurse to set
        !           629:       bits from within the subpattern. If it can't find anything, we have to
        !           630:       give up. If it finds some mandatory character(s), we are done for this
        !           631:       branch. Otherwise, carry on scanning after the subpattern. */
        !           632: 
        !           633:       case OP_BRA:
        !           634:       case OP_SBRA:
        !           635:       case OP_CBRA:
        !           636:       case OP_SCBRA:
        !           637:       case OP_ONCE:
        !           638:       case OP_ASSERT:
        !           639:       rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
        !           640:       if (rc == SSB_FAIL) return SSB_FAIL;
        !           641:       if (rc == SSB_DONE) try_next = FALSE; else
        !           642:         {
        !           643:         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
        !           644:         tcode += 1 + LINK_SIZE;
        !           645:         }
        !           646:       break;
        !           647: 
        !           648:       /* If we hit ALT or KET, it means we haven't found anything mandatory in
        !           649:       this branch, though we might have found something optional. For ALT, we
        !           650:       continue with the next alternative, but we have to arrange that the final
        !           651:       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
        !           652:       return SSB_CONTINUE: if this is the top level, that indicates failure,
        !           653:       but after a nested subpattern, it causes scanning to continue. */
        !           654: 
        !           655:       case OP_ALT:
        !           656:       yield = SSB_CONTINUE;
        !           657:       try_next = FALSE;
        !           658:       break;
        !           659: 
        !           660:       case OP_KET:
        !           661:       case OP_KETRMAX:
        !           662:       case OP_KETRMIN:
        !           663:       return SSB_CONTINUE;
        !           664: 
        !           665:       /* Skip over callout */
        !           666: 
        !           667:       case OP_CALLOUT:
        !           668:       tcode += 2 + 2*LINK_SIZE;
        !           669:       break;
        !           670: 
        !           671:       /* Skip over lookbehind and negative lookahead assertions */
        !           672: 
        !           673:       case OP_ASSERT_NOT:
        !           674:       case OP_ASSERTBACK:
        !           675:       case OP_ASSERTBACK_NOT:
        !           676:       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
        !           677:       tcode += 1 + LINK_SIZE;
        !           678:       break;
        !           679: 
        !           680:       /* Skip over an option setting, changing the caseless flag */
        !           681: 
        !           682:       case OP_OPT:
        !           683:       caseless = (tcode[1] & PCRE_CASELESS) != 0;
        !           684:       tcode += 2;
        !           685:       break;
        !           686: 
        !           687:       /* BRAZERO does the bracket, but carries on. */
        !           688: 
        !           689:       case OP_BRAZERO:
        !           690:       case OP_BRAMINZERO:
        !           691:       if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
        !           692:         return SSB_FAIL;
        !           693: /* =========================================================================
        !           694:       See the comment at the head of this function concerning the next line,
        !           695:       which was an old fudge for the benefit of OS/2.
        !           696:       dummy = 1;
        !           697:   ========================================================================= */
        !           698:       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
        !           699:       tcode += 1 + LINK_SIZE;
        !           700:       break;
        !           701: 
        !           702:       /* SKIPZERO skips the bracket. */
        !           703: 
        !           704:       case OP_SKIPZERO:
        !           705:       tcode++;
        !           706:       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
        !           707:       tcode += 1 + LINK_SIZE;
        !           708:       break;
        !           709: 
        !           710:       /* Single-char * or ? sets the bit and tries the next item */
        !           711: 
        !           712:       case OP_STAR:
        !           713:       case OP_MINSTAR:
        !           714:       case OP_POSSTAR:
        !           715:       case OP_QUERY:
        !           716:       case OP_MINQUERY:
        !           717:       case OP_POSQUERY:
        !           718:       tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
        !           719:       break;
        !           720: 
        !           721:       /* Single-char upto sets the bit and tries the next */
        !           722: 
        !           723:       case OP_UPTO:
        !           724:       case OP_MINUPTO:
        !           725:       case OP_POSUPTO:
        !           726:       tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
        !           727:       break;
        !           728: 
        !           729:       /* At least one single char sets the bit and stops */
        !           730: 
        !           731:       case OP_EXACT:       /* Fall through */
        !           732:       tcode += 2;
        !           733: 
        !           734:       case OP_CHAR:
        !           735:       case OP_CHARNC:
        !           736:       case OP_PLUS:
        !           737:       case OP_MINPLUS:
        !           738:       case OP_POSPLUS:
        !           739:       (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
        !           740:       try_next = FALSE;
        !           741:       break;
        !           742: 
        !           743:       /* Special spacing and line-terminating items. These recognize specific
        !           744:       lists of characters. The difference between VSPACE and ANYNL is that the
        !           745:       latter can match the two-character CRLF sequence, but that is not
        !           746:       relevant for finding the first character, so their code here is
        !           747:       identical. */
        !           748: 
        !           749:       case OP_HSPACE:
        !           750:       SET_BIT(0x09);
        !           751:       SET_BIT(0x20);
        !           752:       if (utf8)
        !           753:         {
        !           754:         SET_BIT(0xC2);  /* For U+00A0 */
        !           755:         SET_BIT(0xE1);  /* For U+1680, U+180E */
        !           756:         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
        !           757:         SET_BIT(0xE3);  /* For U+3000 */
        !           758:         }
        !           759:       else SET_BIT(0xA0);
        !           760:       try_next = FALSE;
        !           761:       break;
        !           762: 
        !           763:       case OP_ANYNL:
        !           764:       case OP_VSPACE:
        !           765:       SET_BIT(0x0A);
        !           766:       SET_BIT(0x0B);
        !           767:       SET_BIT(0x0C);
        !           768:       SET_BIT(0x0D);
        !           769:       if (utf8)
        !           770:         {
        !           771:         SET_BIT(0xC2);  /* For U+0085 */
        !           772:         SET_BIT(0xE2);  /* For U+2028, U+2029 */
        !           773:         }
        !           774:       else SET_BIT(0x85);
        !           775:       try_next = FALSE;
        !           776:       break;
        !           777: 
        !           778:       /* Single character types set the bits and stop. Note that if PCRE_UCP
        !           779:       is set, we do not see these op codes because \d etc are converted to
        !           780:       properties. Therefore, these apply in the case when only characters less
        !           781:       than 256 are recognized to match the types. */
        !           782: 
        !           783:       case OP_NOT_DIGIT:
        !           784:       set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
        !           785:       try_next = FALSE;
        !           786:       break;
        !           787: 
        !           788:       case OP_DIGIT:
        !           789:       set_type_bits(start_bits, cbit_digit, table_limit, cd);
        !           790:       try_next = FALSE;
        !           791:       break;
        !           792: 
        !           793:       /* The cbit_space table has vertical tab as whitespace; we have to
        !           794:       ensure it is set as not whitespace. */
        !           795: 
        !           796:       case OP_NOT_WHITESPACE:
        !           797:       set_nottype_bits(start_bits, cbit_space, table_limit, cd);
        !           798:       start_bits[1] |= 0x08;
        !           799:       try_next = FALSE;
        !           800:       break;
        !           801: 
        !           802:       /* The cbit_space table has vertical tab as whitespace; we have to
        !           803:       not set it from the table. */
        !           804: 
        !           805:       case OP_WHITESPACE:
        !           806:       c = start_bits[1];    /* Save in case it was already set */
        !           807:       set_type_bits(start_bits, cbit_space, table_limit, cd);
        !           808:       start_bits[1] = (start_bits[1] & ~0x08) | c;
        !           809:       try_next = FALSE;
        !           810:       break;
        !           811: 
        !           812:       case OP_NOT_WORDCHAR:
        !           813:       set_nottype_bits(start_bits, cbit_word, table_limit, cd);
        !           814:       try_next = FALSE;
        !           815:       break;
        !           816: 
        !           817:       case OP_WORDCHAR:
        !           818:       set_type_bits(start_bits, cbit_word, table_limit, cd);
        !           819:       try_next = FALSE;
        !           820:       break;
        !           821: 
        !           822:       /* One or more character type fudges the pointer and restarts, knowing
        !           823:       it will hit a single character type and stop there. */
        !           824: 
        !           825:       case OP_TYPEPLUS:
        !           826:       case OP_TYPEMINPLUS:
        !           827:       case OP_TYPEPOSPLUS:
        !           828:       tcode++;
        !           829:       break;
        !           830: 
        !           831:       case OP_TYPEEXACT:
        !           832:       tcode += 3;
        !           833:       break;
        !           834: 
        !           835:       /* Zero or more repeats of character types set the bits and then
        !           836:       try again. */
        !           837: 
        !           838:       case OP_TYPEUPTO:
        !           839:       case OP_TYPEMINUPTO:
        !           840:       case OP_TYPEPOSUPTO:
        !           841:       tcode += 2;               /* Fall through */
        !           842: 
        !           843:       case OP_TYPESTAR:
        !           844:       case OP_TYPEMINSTAR:
        !           845:       case OP_TYPEPOSSTAR:
        !           846:       case OP_TYPEQUERY:
        !           847:       case OP_TYPEMINQUERY:
        !           848:       case OP_TYPEPOSQUERY:
        !           849:       switch(tcode[1])
        !           850:         {
        !           851:         default:
        !           852:         case OP_ANY:
        !           853:         case OP_ALLANY:
        !           854:         return SSB_FAIL;
        !           855: 
        !           856:         case OP_HSPACE:
        !           857:         SET_BIT(0x09);
        !           858:         SET_BIT(0x20);
        !           859:         if (utf8)
        !           860:           {
        !           861:           SET_BIT(0xC2);  /* For U+00A0 */
        !           862:           SET_BIT(0xE1);  /* For U+1680, U+180E */
        !           863:           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
        !           864:           SET_BIT(0xE3);  /* For U+3000 */
        !           865:           }
        !           866:         else SET_BIT(0xA0);
        !           867:         break;
        !           868: 
        !           869:         case OP_ANYNL:
        !           870:         case OP_VSPACE:
        !           871:         SET_BIT(0x0A);
        !           872:         SET_BIT(0x0B);
        !           873:         SET_BIT(0x0C);
        !           874:         SET_BIT(0x0D);
        !           875:         if (utf8)
        !           876:           {
        !           877:           SET_BIT(0xC2);  /* For U+0085 */
        !           878:           SET_BIT(0xE2);  /* For U+2028, U+2029 */
        !           879:           }
        !           880:         else SET_BIT(0x85);
        !           881:         break;
        !           882: 
        !           883:         case OP_NOT_DIGIT:
        !           884:         set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
        !           885:         break;
        !           886: 
        !           887:         case OP_DIGIT:
        !           888:         set_type_bits(start_bits, cbit_digit, table_limit, cd);
        !           889:         break;
        !           890: 
        !           891:         /* The cbit_space table has vertical tab as whitespace; we have to
        !           892:         ensure it gets set as not whitespace. */
        !           893: 
        !           894:         case OP_NOT_WHITESPACE:
        !           895:         set_nottype_bits(start_bits, cbit_space, table_limit, cd);
        !           896:         start_bits[1] |= 0x08;
        !           897:         break;
        !           898: 
        !           899:         /* The cbit_space table has vertical tab as whitespace; we have to
        !           900:         avoid setting it. */
        !           901: 
        !           902:         case OP_WHITESPACE:
        !           903:         c = start_bits[1];    /* Save in case it was already set */
        !           904:         set_type_bits(start_bits, cbit_space, table_limit, cd);
        !           905:         start_bits[1] = (start_bits[1] & ~0x08) | c;
        !           906:         break;
        !           907: 
        !           908:         case OP_NOT_WORDCHAR:
        !           909:         set_nottype_bits(start_bits, cbit_word, table_limit, cd);
        !           910:         break;
        !           911: 
        !           912:         case OP_WORDCHAR:
        !           913:         set_type_bits(start_bits, cbit_word, table_limit, cd);
        !           914:         break;
        !           915:         }
        !           916: 
        !           917:       tcode += 2;
        !           918:       break;
        !           919: 
        !           920:       /* Character class where all the information is in a bit map: set the
        !           921:       bits and either carry on or not, according to the repeat count. If it was
        !           922:       a negative class, and we are operating with UTF-8 characters, any byte
        !           923:       with a value >= 0xc4 is a potentially valid starter because it starts a
        !           924:       character with a value > 255. */
        !           925: 
        !           926:       case OP_NCLASS:
        !           927: #ifdef SUPPORT_UTF8
        !           928:       if (utf8)
        !           929:         {
        !           930:         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
        !           931:         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
        !           932:         }
        !           933: #endif
        !           934:       /* Fall through */
        !           935: 
        !           936:       case OP_CLASS:
        !           937:         {
        !           938:         tcode++;
        !           939: 
        !           940:         /* In UTF-8 mode, the bits in a bit map correspond to character
        !           941:         values, not to byte values. However, the bit map we are constructing is
        !           942:         for byte values. So we have to do a conversion for characters whose
        !           943:         value is > 127. In fact, there are only two possible starting bytes for
        !           944:         characters in the range 128 - 255. */
        !           945: 
        !           946: #ifdef SUPPORT_UTF8
        !           947:         if (utf8)
        !           948:           {
        !           949:           for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
        !           950:           for (c = 128; c < 256; c++)
        !           951:             {
        !           952:             if ((tcode[c/8] && (1 << (c&7))) != 0)
        !           953:               {
        !           954:               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
        !           955:               start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
        !           956:               c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
        !           957:               }
        !           958:             }
        !           959:           }
        !           960: 
        !           961:         /* In non-UTF-8 mode, the two bit maps are completely compatible. */
        !           962: 
        !           963:         else
        !           964: #endif
        !           965:           {
        !           966:           for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
        !           967:           }
        !           968: 
        !           969:         /* Advance past the bit map, and act on what follows */
        !           970: 
        !           971:         tcode += 32;
        !           972:         switch (*tcode)
        !           973:           {
        !           974:           case OP_CRSTAR:
        !           975:           case OP_CRMINSTAR:
        !           976:           case OP_CRQUERY:
        !           977:           case OP_CRMINQUERY:
        !           978:           tcode++;
        !           979:           break;
        !           980: 
        !           981:           case OP_CRRANGE:
        !           982:           case OP_CRMINRANGE:
        !           983:           if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
        !           984:             else try_next = FALSE;
        !           985:           break;
        !           986: 
        !           987:           default:
        !           988:           try_next = FALSE;
        !           989:           break;
        !           990:           }
        !           991:         }
        !           992:       break; /* End of bitmap class handling */
        !           993: 
        !           994:       }      /* End of switch */
        !           995:     }        /* End of try_next loop */
        !           996: 
        !           997:   code += GET(code, 1);   /* Advance to next branch */
        !           998:   }
        !           999: while (*code == OP_ALT);
        !          1000: return yield;
        !          1001: }
        !          1002: 
        !          1003: 
        !          1004: 
        !          1005: /*************************************************
        !          1006: *          Study a compiled expression           *
        !          1007: *************************************************/
        !          1008: 
        !          1009: /* This function is handed a compiled expression that it must study to produce
        !          1010: information that will speed up the matching. It returns a pcre_extra block
        !          1011: which then gets handed back to pcre_exec().
        !          1012: 
        !          1013: Arguments:
        !          1014:   re        points to the compiled expression
        !          1015:   options   contains option bits
        !          1016:   errorptr  points to where to place error messages;
        !          1017:             set NULL unless error
        !          1018: 
        !          1019: Returns:    pointer to a pcre_extra block, with study_data filled in and the
        !          1020:               appropriate flags set;
        !          1021:             NULL on error or if no optimization possible
        !          1022: */
        !          1023: 
        !          1024: PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
        !          1025: pcre_study(const pcre *external_re, int options, const char **errorptr)
        !          1026: {
        !          1027: int min;
        !          1028: BOOL bits_set = FALSE;
        !          1029: uschar start_bits[32];
        !          1030: pcre_extra *extra;
        !          1031: pcre_study_data *study;
        !          1032: const uschar *tables;
        !          1033: uschar *code;
        !          1034: compile_data compile_block;
        !          1035: const real_pcre *re = (const real_pcre *)external_re;
        !          1036: 
        !          1037: *errorptr = NULL;
        !          1038: 
        !          1039: if (re == NULL || re->magic_number != MAGIC_NUMBER)
        !          1040:   {
        !          1041:   *errorptr = "argument is not a compiled regular expression";
        !          1042:   return NULL;
        !          1043:   }
        !          1044: 
        !          1045: if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
        !          1046:   {
        !          1047:   *errorptr = "unknown or incorrect option bit(s) set";
        !          1048:   return NULL;
        !          1049:   }
        !          1050: 
        !          1051: code = (uschar *)re + re->name_table_offset +
        !          1052:   (re->name_count * re->name_entry_size);
        !          1053: 
        !          1054: /* For an anchored pattern, or an unanchored pattern that has a first char, or
        !          1055: a multiline pattern that matches only at "line starts", there is no point in
        !          1056: seeking a list of starting bytes. */
        !          1057: 
        !          1058: if ((re->options & PCRE_ANCHORED) == 0 &&
        !          1059:     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
        !          1060:   {
        !          1061:   /* Set the character tables in the block that is passed around */
        !          1062: 
        !          1063:   tables = re->tables;
        !          1064:   if (tables == NULL)
        !          1065:     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
        !          1066:     (void *)(&tables));
        !          1067: 
        !          1068:   compile_block.lcc = tables + lcc_offset;
        !          1069:   compile_block.fcc = tables + fcc_offset;
        !          1070:   compile_block.cbits = tables + cbits_offset;
        !          1071:   compile_block.ctypes = tables + ctypes_offset;
        !          1072: 
        !          1073:   /* See if we can find a fixed set of initial characters for the pattern. */
        !          1074: 
        !          1075:   memset(start_bits, 0, 32 * sizeof(uschar));
        !          1076:   bits_set = set_start_bits(code, start_bits,
        !          1077:     (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
        !          1078:     &compile_block) == SSB_DONE;
        !          1079:   }
        !          1080: 
        !          1081: /* Find the minimum length of subject string. */
        !          1082: 
        !          1083: min = find_minlength(code, code, re->options);
        !          1084: 
        !          1085: /* Return NULL if no optimization is possible. */
        !          1086: 
        !          1087: if (!bits_set && min < 0) return NULL;
        !          1088: 
        !          1089: /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
        !          1090: the latter, which is pointed to by the former, which may also get additional
        !          1091: data set later by the calling program. At the moment, the size of
        !          1092: pcre_study_data is fixed. We nevertheless save it in a field for returning via
        !          1093: the pcre_fullinfo() function so that if it becomes variable in the future, we
        !          1094: don't have to change that code. */
        !          1095: 
        !          1096: extra = (pcre_extra *)(pcre_malloc)
        !          1097:   (sizeof(pcre_extra) + sizeof(pcre_study_data));
        !          1098: 
        !          1099: if (extra == NULL)
        !          1100:   {
        !          1101:   *errorptr = "failed to get memory";
        !          1102:   return NULL;
        !          1103:   }
        !          1104: 
        !          1105: study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
        !          1106: extra->flags = PCRE_EXTRA_STUDY_DATA;
        !          1107: extra->study_data = study;
        !          1108: 
        !          1109: study->size = sizeof(pcre_study_data);
        !          1110: study->flags = 0;
        !          1111: 
        !          1112: if (bits_set)
        !          1113:   {
        !          1114:   study->flags |= PCRE_STUDY_MAPPED;
        !          1115:   memcpy(study->start_bits, start_bits, sizeof(start_bits));
        !          1116:   }
        !          1117: 
        !          1118: if (min >= 0)
        !          1119:   {
        !          1120:   study->flags |= PCRE_STUDY_MINLEN;
        !          1121:   study->minlength = min;
        !          1122:   }
        !          1123: 
        !          1124: return extra;
        !          1125: }
        !          1126: 
        !          1127: /* End of pcre_study.c */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>