Annotation of embedaddon/pcre/pcre_study.c, revision 1.1

1.1     ! misho       1: /*************************************************
        !             2: *      Perl-Compatible Regular Expressions       *
        !             3: *************************************************/
        !             4: 
        !             5: /* PCRE is a library of functions to support regular expressions whose syntax
        !             6: and semantics are as close as possible to those of the Perl 5 language.
        !             7: 
        !             8:                        Written by Philip Hazel
        !             9:            Copyright (c) 1997-2010 University of Cambridge
        !            10: 
        !            11: -----------------------------------------------------------------------------
        !            12: Redistribution and use in source and binary forms, with or without
        !            13: modification, are permitted provided that the following conditions are met:
        !            14: 
        !            15:     * Redistributions of source code must retain the above copyright notice,
        !            16:       this list of conditions and the following disclaimer.
        !            17: 
        !            18:     * Redistributions in binary form must reproduce the above copyright
        !            19:       notice, this list of conditions and the following disclaimer in the
        !            20:       documentation and/or other materials provided with the distribution.
        !            21: 
        !            22:     * Neither the name of the University of Cambridge nor the names of its
        !            23:       contributors may be used to endorse or promote products derived from
        !            24:       this software without specific prior written permission.
        !            25: 
        !            26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
        !            27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
        !            28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
        !            29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
        !            30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
        !            31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
        !            32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
        !            33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
        !            34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
        !            35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
        !            36: POSSIBILITY OF SUCH DAMAGE.
        !            37: -----------------------------------------------------------------------------
        !            38: */
        !            39: 
        !            40: 
        !            41: /* This module contains the external function pcre_study(), along with local
        !            42: supporting functions. */
        !            43: 
        !            44: 
        !            45: #ifdef HAVE_CONFIG_H
        !            46: #include "config.h"
        !            47: #endif
        !            48: 
        !            49: #include "pcre_internal.h"
        !            50: 
        !            51: #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
        !            52: 
        !            53: /* Returns from set_start_bits() */
        !            54: 
        !            55: enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
        !            56: 
        !            57: 
        !            58: 
        !            59: /*************************************************
        !            60: *   Find the minimum subject length for a group  *
        !            61: *************************************************/
        !            62: 
        !            63: /* Scan a parenthesized group and compute the minimum length of subject that
        !            64: is needed to match it. This is a lower bound; it does not mean there is a
        !            65: string of that length that matches. In UTF8 mode, the result is in characters
        !            66: rather than bytes.
        !            67: 
        !            68: Arguments:
        !            69:   code            pointer to start of group (the bracket)
        !            70:   startcode       pointer to start of the whole pattern
        !            71:   options         the compiling options
        !            72:   int             RECURSE depth
        !            73: 
        !            74: Returns:   the minimum length
        !            75:            -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
        !            76:            -2 internal error (missing capturing bracket)
        !            77:            -3 internal error (opcode not listed)
        !            78: */
        !            79: 
        !            80: static int
        !            81: find_minlength(const uschar *code, const uschar *startcode, int options,
        !            82:   int recurse_depth)
        !            83: {
        !            84: int length = -1;
        !            85: BOOL utf8 = (options & PCRE_UTF8) != 0;
        !            86: BOOL had_recurse = FALSE;
        !            87: register int branchlength = 0;
        !            88: register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
        !            89: 
        !            90: if (*code == OP_CBRA || *code == OP_SCBRA ||
        !            91:     *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += 2;
        !            92: 
        !            93: /* Scan along the opcodes for this branch. If we get to the end of the
        !            94: branch, check the length against that of the other branches. */
        !            95: 
        !            96: for (;;)
        !            97:   {
        !            98:   int d, min;
        !            99:   uschar *cs, *ce;
        !           100:   register int op = *cc;
        !           101: 
        !           102:   switch (op)
        !           103:     {
        !           104:     case OP_COND:
        !           105:     case OP_SCOND:
        !           106: 
        !           107:     /* If there is only one branch in a condition, the implied branch has zero
        !           108:     length, so we don't add anything. This covers the DEFINE "condition"
        !           109:     automatically. */
        !           110: 
        !           111:     cs = cc + GET(cc, 1);
        !           112:     if (*cs != OP_ALT)
        !           113:       {
        !           114:       cc = cs + 1 + LINK_SIZE;
        !           115:       break;
        !           116:       }
        !           117: 
        !           118:     /* Otherwise we can fall through and treat it the same as any other
        !           119:     subpattern. */
        !           120: 
        !           121:     case OP_CBRA:
        !           122:     case OP_SCBRA:
        !           123:     case OP_BRA:
        !           124:     case OP_SBRA:
        !           125:     case OP_CBRAPOS:
        !           126:     case OP_SCBRAPOS:
        !           127:     case OP_BRAPOS:
        !           128:     case OP_SBRAPOS:
        !           129:     case OP_ONCE:
        !           130:     case OP_ONCE_NC:
        !           131:     d = find_minlength(cc, startcode, options, recurse_depth);
        !           132:     if (d < 0) return d;
        !           133:     branchlength += d;
        !           134:     do cc += GET(cc, 1); while (*cc == OP_ALT);
        !           135:     cc += 1 + LINK_SIZE;
        !           136:     break;
        !           137: 
        !           138:     /* ACCEPT makes things far too complicated; we have to give up. */
        !           139: 
        !           140:     case OP_ACCEPT:
        !           141:     case OP_ASSERT_ACCEPT:
        !           142:     return -1;
        !           143: 
        !           144:     /* Reached end of a branch; if it's a ket it is the end of a nested
        !           145:     call. If it's ALT it is an alternation in a nested call. If it is END it's
        !           146:     the end of the outer call. All can be handled by the same code. If an
        !           147:     ACCEPT was previously encountered, use the length that was in force at that
        !           148:     time, and pass back the shortest ACCEPT length. */
        !           149: 
        !           150:     case OP_ALT:
        !           151:     case OP_KET:
        !           152:     case OP_KETRMAX:
        !           153:     case OP_KETRMIN:
        !           154:     case OP_KETRPOS:
        !           155:     case OP_END:
        !           156:     if (length < 0 || (!had_recurse && branchlength < length))
        !           157:       length = branchlength;
        !           158:     if (op != OP_ALT) return length;
        !           159:     cc += 1 + LINK_SIZE;
        !           160:     branchlength = 0;
        !           161:     had_recurse = FALSE;
        !           162:     break;
        !           163: 
        !           164:     /* Skip over assertive subpatterns */
        !           165: 
        !           166:     case OP_ASSERT:
        !           167:     case OP_ASSERT_NOT:
        !           168:     case OP_ASSERTBACK:
        !           169:     case OP_ASSERTBACK_NOT:
        !           170:     do cc += GET(cc, 1); while (*cc == OP_ALT);
        !           171:     /* Fall through */
        !           172: 
        !           173:     /* Skip over things that don't match chars */
        !           174: 
        !           175:     case OP_REVERSE:
        !           176:     case OP_CREF:
        !           177:     case OP_NCREF:
        !           178:     case OP_RREF:
        !           179:     case OP_NRREF:
        !           180:     case OP_DEF:
        !           181:     case OP_CALLOUT:
        !           182:     case OP_SOD:
        !           183:     case OP_SOM:
        !           184:     case OP_EOD:
        !           185:     case OP_EODN:
        !           186:     case OP_CIRC:
        !           187:     case OP_CIRCM:
        !           188:     case OP_DOLL:
        !           189:     case OP_DOLLM:
        !           190:     case OP_NOT_WORD_BOUNDARY:
        !           191:     case OP_WORD_BOUNDARY:
        !           192:     cc += _pcre_OP_lengths[*cc];
        !           193:     break;
        !           194: 
        !           195:     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
        !           196: 
        !           197:     case OP_BRAZERO:
        !           198:     case OP_BRAMINZERO:
        !           199:     case OP_BRAPOSZERO:
        !           200:     case OP_SKIPZERO:
        !           201:     cc += _pcre_OP_lengths[*cc];
        !           202:     do cc += GET(cc, 1); while (*cc == OP_ALT);
        !           203:     cc += 1 + LINK_SIZE;
        !           204:     break;
        !           205: 
        !           206:     /* Handle literal characters and + repetitions */
        !           207: 
        !           208:     case OP_CHAR:
        !           209:     case OP_CHARI:
        !           210:     case OP_NOT:
        !           211:     case OP_NOTI:
        !           212:     case OP_PLUS:
        !           213:     case OP_PLUSI:
        !           214:     case OP_MINPLUS:
        !           215:     case OP_MINPLUSI:
        !           216:     case OP_POSPLUS:
        !           217:     case OP_POSPLUSI:
        !           218:     case OP_NOTPLUS:
        !           219:     case OP_NOTPLUSI:
        !           220:     case OP_NOTMINPLUS:
        !           221:     case OP_NOTMINPLUSI:
        !           222:     case OP_NOTPOSPLUS:
        !           223:     case OP_NOTPOSPLUSI:
        !           224:     branchlength++;
        !           225:     cc += 2;
        !           226: #ifdef SUPPORT_UTF8
        !           227:     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
        !           228: #endif
        !           229:     break;
        !           230: 
        !           231:     case OP_TYPEPLUS:
        !           232:     case OP_TYPEMINPLUS:
        !           233:     case OP_TYPEPOSPLUS:
        !           234:     branchlength++;
        !           235:     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
        !           236:     break;
        !           237: 
        !           238:     /* Handle exact repetitions. The count is already in characters, but we
        !           239:     need to skip over a multibyte character in UTF8 mode.  */
        !           240: 
        !           241:     case OP_EXACT:
        !           242:     case OP_EXACTI:
        !           243:     case OP_NOTEXACT:
        !           244:     case OP_NOTEXACTI:
        !           245:     branchlength += GET2(cc,1);
        !           246:     cc += 4;
        !           247: #ifdef SUPPORT_UTF8
        !           248:     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
        !           249: #endif
        !           250:     break;
        !           251: 
        !           252:     case OP_TYPEEXACT:
        !           253:     branchlength += GET2(cc,1);
        !           254:     cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
        !           255:     break;
        !           256: 
        !           257:     /* Handle single-char non-literal matchers */
        !           258: 
        !           259:     case OP_PROP:
        !           260:     case OP_NOTPROP:
        !           261:     cc += 2;
        !           262:     /* Fall through */
        !           263: 
        !           264:     case OP_NOT_DIGIT:
        !           265:     case OP_DIGIT:
        !           266:     case OP_NOT_WHITESPACE:
        !           267:     case OP_WHITESPACE:
        !           268:     case OP_NOT_WORDCHAR:
        !           269:     case OP_WORDCHAR:
        !           270:     case OP_ANY:
        !           271:     case OP_ALLANY:
        !           272:     case OP_EXTUNI:
        !           273:     case OP_HSPACE:
        !           274:     case OP_NOT_HSPACE:
        !           275:     case OP_VSPACE:
        !           276:     case OP_NOT_VSPACE:
        !           277:     branchlength++;
        !           278:     cc++;
        !           279:     break;
        !           280: 
        !           281:     /* "Any newline" might match two characters, but it also might match just
        !           282:     one. */
        !           283: 
        !           284:     case OP_ANYNL:
        !           285:     branchlength += 1;
        !           286:     cc++;
        !           287:     break;
        !           288: 
        !           289:     /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
        !           290:     non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
        !           291:     appear, but leave the code, just in case.) */
        !           292: 
        !           293:     case OP_ANYBYTE:
        !           294: #ifdef SUPPORT_UTF8
        !           295:     if (utf8) return -1;
        !           296: #endif
        !           297:     branchlength++;
        !           298:     cc++;
        !           299:     break;
        !           300: 
        !           301:     /* For repeated character types, we have to test for \p and \P, which have
        !           302:     an extra two bytes of parameters. */
        !           303: 
        !           304:     case OP_TYPESTAR:
        !           305:     case OP_TYPEMINSTAR:
        !           306:     case OP_TYPEQUERY:
        !           307:     case OP_TYPEMINQUERY:
        !           308:     case OP_TYPEPOSSTAR:
        !           309:     case OP_TYPEPOSQUERY:
        !           310:     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
        !           311:     cc += _pcre_OP_lengths[op];
        !           312:     break;
        !           313: 
        !           314:     case OP_TYPEUPTO:
        !           315:     case OP_TYPEMINUPTO:
        !           316:     case OP_TYPEPOSUPTO:
        !           317:     if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
        !           318:     cc += _pcre_OP_lengths[op];
        !           319:     break;
        !           320: 
        !           321:     /* Check a class for variable quantification */
        !           322: 
        !           323: #ifdef SUPPORT_UTF8
        !           324:     case OP_XCLASS:
        !           325:     cc += GET(cc, 1) - 33;
        !           326:     /* Fall through */
        !           327: #endif
        !           328: 
        !           329:     case OP_CLASS:
        !           330:     case OP_NCLASS:
        !           331:     cc += 33;
        !           332: 
        !           333:     switch (*cc)
        !           334:       {
        !           335:       case OP_CRPLUS:
        !           336:       case OP_CRMINPLUS:
        !           337:       branchlength++;
        !           338:       /* Fall through */
        !           339: 
        !           340:       case OP_CRSTAR:
        !           341:       case OP_CRMINSTAR:
        !           342:       case OP_CRQUERY:
        !           343:       case OP_CRMINQUERY:
        !           344:       cc++;
        !           345:       break;
        !           346: 
        !           347:       case OP_CRRANGE:
        !           348:       case OP_CRMINRANGE:
        !           349:       branchlength += GET2(cc,1);
        !           350:       cc += 5;
        !           351:       break;
        !           352: 
        !           353:       default:
        !           354:       branchlength++;
        !           355:       break;
        !           356:       }
        !           357:     break;
        !           358: 
        !           359:     /* Backreferences and subroutine calls are treated in the same way: we find
        !           360:     the minimum length for the subpattern. A recursion, however, causes an
        !           361:     a flag to be set that causes the length of this branch to be ignored. The
        !           362:     logic is that a recursion can only make sense if there is another
        !           363:     alternation that stops the recursing. That will provide the minimum length
        !           364:     (when no recursion happens). A backreference within the group that it is
        !           365:     referencing behaves in the same way.
        !           366: 
        !           367:     If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
        !           368:     matches an empty string (by default it causes a matching failure), so in
        !           369:     that case we must set the minimum length to zero. */
        !           370: 
        !           371:     case OP_REF:
        !           372:     case OP_REFI:
        !           373:     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
        !           374:       {
        !           375:       ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
        !           376:       if (cs == NULL) return -2;
        !           377:       do ce += GET(ce, 1); while (*ce == OP_ALT);
        !           378:       if (cc > cs && cc < ce)
        !           379:         {
        !           380:         d = 0;
        !           381:         had_recurse = TRUE;
        !           382:         }
        !           383:       else
        !           384:         {
        !           385:         d = find_minlength(cs, startcode, options, recurse_depth);
        !           386:         }
        !           387:       }
        !           388:     else d = 0;
        !           389:     cc += 3;
        !           390: 
        !           391:     /* Handle repeated back references */
        !           392: 
        !           393:     switch (*cc)
        !           394:       {
        !           395:       case OP_CRSTAR:
        !           396:       case OP_CRMINSTAR:
        !           397:       case OP_CRQUERY:
        !           398:       case OP_CRMINQUERY:
        !           399:       min = 0;
        !           400:       cc++;
        !           401:       break;
        !           402: 
        !           403:       case OP_CRPLUS:
        !           404:       case OP_CRMINPLUS:
        !           405:       min = 1;
        !           406:       cc++;
        !           407:       break;
        !           408: 
        !           409:       case OP_CRRANGE:
        !           410:       case OP_CRMINRANGE:
        !           411:       min = GET2(cc, 1);
        !           412:       cc += 5;
        !           413:       break;
        !           414: 
        !           415:       default:
        !           416:       min = 1;
        !           417:       break;
        !           418:       }
        !           419: 
        !           420:     branchlength += min * d;
        !           421:     break;
        !           422: 
        !           423:     /* We can easily detect direct recursion, but not mutual recursion. This is
        !           424:     caught by a recursion depth count. */
        !           425: 
        !           426:     case OP_RECURSE:
        !           427:     cs = ce = (uschar *)startcode + GET(cc, 1);
        !           428:     do ce += GET(ce, 1); while (*ce == OP_ALT);
        !           429:     if ((cc > cs && cc < ce) || recurse_depth > 10)
        !           430:       had_recurse = TRUE;
        !           431:     else
        !           432:       {
        !           433:       branchlength += find_minlength(cs, startcode, options, recurse_depth + 1);
        !           434:       }
        !           435:     cc += 1 + LINK_SIZE;
        !           436:     break;
        !           437: 
        !           438:     /* Anything else does not or need not match a character. We can get the
        !           439:     item's length from the table, but for those that can match zero occurrences
        !           440:     of a character, we must take special action for UTF-8 characters. As it
        !           441:     happens, the "NOT" versions of these opcodes are used at present only for
        !           442:     ASCII characters, so they could be omitted from this list. However, in
        !           443:     future that may change, so we include them here so as not to leave a
        !           444:     gotcha for a future maintainer. */
        !           445: 
        !           446:     case OP_UPTO:
        !           447:     case OP_UPTOI:
        !           448:     case OP_NOTUPTO:
        !           449:     case OP_NOTUPTOI:
        !           450:     case OP_MINUPTO:
        !           451:     case OP_MINUPTOI:
        !           452:     case OP_NOTMINUPTO:
        !           453:     case OP_NOTMINUPTOI:
        !           454:     case OP_POSUPTO:
        !           455:     case OP_POSUPTOI:
        !           456:     case OP_NOTPOSUPTO:
        !           457:     case OP_NOTPOSUPTOI:
        !           458: 
        !           459:     case OP_STAR:
        !           460:     case OP_STARI:
        !           461:     case OP_NOTSTAR:
        !           462:     case OP_NOTSTARI:
        !           463:     case OP_MINSTAR:
        !           464:     case OP_MINSTARI:
        !           465:     case OP_NOTMINSTAR:
        !           466:     case OP_NOTMINSTARI:
        !           467:     case OP_POSSTAR:
        !           468:     case OP_POSSTARI:
        !           469:     case OP_NOTPOSSTAR:
        !           470:     case OP_NOTPOSSTARI:
        !           471: 
        !           472:     case OP_QUERY:
        !           473:     case OP_QUERYI:
        !           474:     case OP_NOTQUERY:
        !           475:     case OP_NOTQUERYI:
        !           476:     case OP_MINQUERY:
        !           477:     case OP_MINQUERYI:
        !           478:     case OP_NOTMINQUERY:
        !           479:     case OP_NOTMINQUERYI:
        !           480:     case OP_POSQUERY:
        !           481:     case OP_POSQUERYI:
        !           482:     case OP_NOTPOSQUERY:
        !           483:     case OP_NOTPOSQUERYI:
        !           484: 
        !           485:     cc += _pcre_OP_lengths[op];
        !           486: #ifdef SUPPORT_UTF8
        !           487:     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
        !           488: #endif
        !           489:     break;
        !           490: 
        !           491:     /* Skip these, but we need to add in the name length. */
        !           492: 
        !           493:     case OP_MARK:
        !           494:     case OP_PRUNE_ARG:
        !           495:     case OP_SKIP_ARG:
        !           496:     case OP_THEN_ARG:
        !           497:     cc += _pcre_OP_lengths[op] + cc[1];
        !           498:     break;
        !           499: 
        !           500:     /* The remaining opcodes are just skipped over. */
        !           501: 
        !           502:     case OP_CLOSE:
        !           503:     case OP_COMMIT:
        !           504:     case OP_FAIL:
        !           505:     case OP_PRUNE:
        !           506:     case OP_SET_SOM:
        !           507:     case OP_SKIP:
        !           508:     case OP_THEN:
        !           509:     cc += _pcre_OP_lengths[op];
        !           510:     break;
        !           511: 
        !           512:     /* This should not occur: we list all opcodes explicitly so that when
        !           513:     new ones get added they are properly considered. */
        !           514: 
        !           515:     default:
        !           516:     return -3;
        !           517:     }
        !           518:   }
        !           519: /* Control never gets here */
        !           520: }
        !           521: 
        !           522: 
        !           523: 
        !           524: /*************************************************
        !           525: *      Set a bit and maybe its alternate case    *
        !           526: *************************************************/
        !           527: 
        !           528: /* Given a character, set its first byte's bit in the table, and also the
        !           529: corresponding bit for the other version of a letter if we are caseless. In
        !           530: UTF-8 mode, for characters greater than 127, we can only do the caseless thing
        !           531: when Unicode property support is available.
        !           532: 
        !           533: Arguments:
        !           534:   start_bits    points to the bit map
        !           535:   p             points to the character
        !           536:   caseless      the caseless flag
        !           537:   cd            the block with char table pointers
        !           538:   utf8          TRUE for UTF-8 mode
        !           539: 
        !           540: Returns:        pointer after the character
        !           541: */
        !           542: 
        !           543: static const uschar *
        !           544: set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
        !           545:   compile_data *cd, BOOL utf8)
        !           546: {
        !           547: unsigned int c = *p;
        !           548: 
        !           549: SET_BIT(c);
        !           550: 
        !           551: #ifdef SUPPORT_UTF8
        !           552: if (utf8 && c > 127)
        !           553:   {
        !           554:   GETCHARINC(c, p);
        !           555: #ifdef SUPPORT_UCP
        !           556:   if (caseless)
        !           557:     {
        !           558:     uschar buff[8];
        !           559:     c = UCD_OTHERCASE(c);
        !           560:     (void)_pcre_ord2utf8(c, buff);
        !           561:     SET_BIT(buff[0]);
        !           562:     }
        !           563: #endif
        !           564:   return p;
        !           565:   }
        !           566: #endif
        !           567: 
        !           568: /* Not UTF-8 mode, or character is less than 127. */
        !           569: 
        !           570: if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
        !           571: return p + 1;
        !           572: }
        !           573: 
        !           574: 
        !           575: 
        !           576: /*************************************************
        !           577: *     Set bits for a positive character type     *
        !           578: *************************************************/
        !           579: 
        !           580: /* This function sets starting bits for a character type. In UTF-8 mode, we can
        !           581: only do a direct setting for bytes less than 128, as otherwise there can be
        !           582: confusion with bytes in the middle of UTF-8 characters. In a "traditional"
        !           583: environment, the tables will only recognize ASCII characters anyway, but in at
        !           584: least one Windows environment, some higher bytes bits were set in the tables.
        !           585: So we deal with that case by considering the UTF-8 encoding.
        !           586: 
        !           587: Arguments:
        !           588:   start_bits     the starting bitmap
        !           589:   cbit type      the type of character wanted
        !           590:   table_limit    32 for non-UTF-8; 16 for UTF-8
        !           591:   cd             the block with char table pointers
        !           592: 
        !           593: Returns:         nothing
        !           594: */
        !           595: 
        !           596: static void
        !           597: set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
        !           598:   compile_data *cd)
        !           599: {
        !           600: register int c;
        !           601: for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
        !           602: if (table_limit == 32) return;
        !           603: for (c = 128; c < 256; c++)
        !           604:   {
        !           605:   if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
        !           606:     {
        !           607:     uschar buff[8];
        !           608:     (void)_pcre_ord2utf8(c, buff);
        !           609:     SET_BIT(buff[0]);
        !           610:     }
        !           611:   }
        !           612: }
        !           613: 
        !           614: 
        !           615: /*************************************************
        !           616: *     Set bits for a negative character type     *
        !           617: *************************************************/
        !           618:
        !           619: /* Add to the starting-byte bitmap every byte that can begin a character that
        !           620: is NOT of the given type (e.g. \D): the complement of the class bitmap is ORed
        !           621: in. In UTF-8 mode that is done only for bytes below 128, because higher values
        !           622: may be bytes from the middle of a UTF-8 character; instead, every possible lead
        !           623: byte of a high-valued character is allowed. The lowest such byte is 0xc2, but
        !           624: for simplicity everything from 0xc0 (192) upwards is set.
        !           625:
        !           626: Arguments:
        !           627:   start_bits     the starting bitmap
        !           628:   cbit_type      the type of character wanted
        !           629:   table_limit    32 for non-UTF-8; 16 for UTF-8
        !           630:   cd             the block with char table pointers
        !           631:
        !           632: Returns:         nothing
        !           633: */
        !           634:
        !           635: static void
        !           636: set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
        !           637:   compile_data *cd)
        !           638: {
        !           639: const uschar *class_map = cd->cbits + cbit_type;
        !           640: int i;
        !           641: for (i = 0; i < table_limit; i++) start_bits[i] |= ~class_map[i];
        !           642: if (table_limit != 32) memset(start_bits + 24, 0xff, 8);  /* 0xc0 - 0xff */
        !           643: }
        !           644: 
        !           645: 
        !           646: 
        !           647: /*************************************************
        !           648: *          Create bitmap of starting bytes       *
        !           649: *************************************************/
        !           650: 
        !           651: /* This function scans a compiled unanchored expression recursively and
        !           652: attempts to build a bitmap of the set of possible starting bytes. As time goes
        !           653: by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
        !           654: useful for parenthesized groups in patterns such as (a*)b where the group
        !           655: provides some optional starting bytes but scanning must continue at the outer
        !           656: level to find at least one mandatory byte. At the outermost level, this
        !           657: function fails unless the result is SSB_DONE.
        !           658: 
        !           659: Arguments:
        !           660:   code         points to an expression
        !           661:   start_bits   points to a 32-byte table, initialized to 0
        !           662:   utf8         TRUE if in UTF-8 mode
        !           663:   cd           the block with char table pointers
        !           664: 
        !           665: Returns:       SSB_FAIL     => Failed to find any starting bytes
        !           666:                SSB_DONE     => Found mandatory starting bytes
        !           667:                SSB_CONTINUE => Found optional starting bytes
        !           668:                SSB_UNKNOWN  => Hit an unrecognized opcode
        !           669: */
        !           670: 
        !           671: static int
        !           672: set_start_bits(const uschar *code, uschar *start_bits, BOOL utf8,
        !           673:   compile_data *cd)
        !           674: {
        !           675: register int c;
        !           676: int yield = SSB_DONE;
        !           677: int table_limit = utf8? 16:32;
        !           678:
        !           679: #if 0
        !           680: /* ========================================================================= */
        !           681: /* The following comment and code was inserted in January 1999. In May 2006,
        !           682: when it was observed to cause compiler warnings about unused values, I took it
        !           683: out again. If anybody is still using OS/2, they will have to put it back
        !           684: manually. */
        !           685:
        !           686: /* This next statement and the later reference to dummy are here in order to
        !           687: trick the optimizer of the IBM C compiler for OS/2 into generating correct
        !           688: code. Apparently IBM isn't going to fix the problem, and we would rather not
        !           689: disable optimization (in this module it actually makes a big difference, and
        !           690: the pcre module can use all the optimization it can get). */
        !           691:
        !           692: volatile int dummy;
        !           693: /* ========================================================================= */
        !           694: #endif
        !           695:
        !           696: do
        !           697:   {
        !           698:   BOOL try_next = TRUE;
        !           699:   const uschar *tcode = code + 1 + LINK_SIZE;
        !           700:
        !           701:   if (*code == OP_CBRA || *code == OP_SCBRA ||
        !           702:       *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += 2;
        !           703:
        !           704:   while (try_next)    /* Loop for items in this branch */
        !           705:     {
        !           706:     int rc;
        !           707:
        !           708:     switch(*tcode)
        !           709:       {
        !           710:       /* If we reach something we don't understand, it means a new opcode has
        !           711:       been created that hasn't been added to this code. Hopefully this problem
        !           712:       will be discovered during testing. */
        !           713:
        !           714:       default:
        !           715:       return SSB_UNKNOWN;
        !           716:
        !           717:       /* Fail for a valid opcode that implies no starting bits. */
        !           718:
        !           719:       case OP_ACCEPT:
        !           720:       case OP_ASSERT_ACCEPT:
        !           721:       case OP_ALLANY:
        !           722:       case OP_ANY:
        !           723:       case OP_ANYBYTE:
        !           724:       case OP_CIRC:
        !           725:       case OP_CIRCM:
        !           726:       case OP_CLOSE:
        !           727:       case OP_COMMIT:
        !           728:       case OP_COND:
        !           729:       case OP_CREF:
        !           730:       case OP_DEF:
        !           731:       case OP_DOLL:
        !           732:       case OP_DOLLM:
        !           733:       case OP_END:
        !           734:       case OP_EOD:
        !           735:       case OP_EODN:
        !           736:       case OP_EXTUNI:
        !           737:       case OP_FAIL:
        !           738:       case OP_MARK:
        !           739:       case OP_NCREF:
        !           740:       case OP_NOT:
        !           741:       case OP_NOTEXACT:
        !           742:       case OP_NOTEXACTI:
        !           743:       case OP_NOTI:
        !           744:       case OP_NOTMINPLUS:
        !           745:       case OP_NOTMINPLUSI:
        !           746:       case OP_NOTMINQUERY:
        !           747:       case OP_NOTMINQUERYI:
        !           748:       case OP_NOTMINSTAR:
        !           749:       case OP_NOTMINSTARI:
        !           750:       case OP_NOTMINUPTO:
        !           751:       case OP_NOTMINUPTOI:
        !           752:       case OP_NOTPLUS:
        !           753:       case OP_NOTPLUSI:
        !           754:       case OP_NOTPOSPLUS:
        !           755:       case OP_NOTPOSPLUSI:
        !           756:       case OP_NOTPOSQUERY:
        !           757:       case OP_NOTPOSQUERYI:
        !           758:       case OP_NOTPOSSTAR:
        !           759:       case OP_NOTPOSSTARI:
        !           760:       case OP_NOTPOSUPTO:
        !           761:       case OP_NOTPOSUPTOI:
        !           762:       case OP_NOTPROP:
        !           763:       case OP_NOTQUERY:
        !           764:       case OP_NOTQUERYI:
        !           765:       case OP_NOTSTAR:
        !           766:       case OP_NOTSTARI:
        !           767:       case OP_NOTUPTO:
        !           768:       case OP_NOTUPTOI:
        !           769:       case OP_NOT_HSPACE:
        !           770:       case OP_NOT_VSPACE:
        !           771:       case OP_NRREF:
        !           772:       case OP_PROP:
        !           773:       case OP_PRUNE:
        !           774:       case OP_PRUNE_ARG:
        !           775:       case OP_RECURSE:
        !           776:       case OP_REF:
        !           777:       case OP_REFI:
        !           778:       case OP_REVERSE:
        !           779:       case OP_RREF:
        !           780:       case OP_SCOND:
        !           781:       case OP_SET_SOM:
        !           782:       case OP_SKIP:
        !           783:       case OP_SKIP_ARG:
        !           784:       case OP_SOD:
        !           785:       case OP_SOM:
        !           786:       case OP_THEN:
        !           787:       case OP_THEN_ARG:
        !           788:       case OP_XCLASS:
        !           789:       return SSB_FAIL;
        !           790:
        !           791:       /* We can ignore word boundary tests. */
        !           792:
        !           793:       case OP_WORD_BOUNDARY:
        !           794:       case OP_NOT_WORD_BOUNDARY:
        !           795:       tcode++;
        !           796:       break;
        !           797:
        !           798:       /* If we hit a bracket or a positive lookahead assertion, recurse to set
        !           799:       bits from within the subpattern. If it can't find anything, we have to
        !           800:       give up. If it finds some mandatory character(s), we are done for this
        !           801:       branch. Otherwise, carry on scanning after the subpattern. */
        !           802:
        !           803:       case OP_BRA:
        !           804:       case OP_SBRA:
        !           805:       case OP_CBRA:
        !           806:       case OP_SCBRA:
        !           807:       case OP_BRAPOS:
        !           808:       case OP_SBRAPOS:
        !           809:       case OP_CBRAPOS:
        !           810:       case OP_SCBRAPOS:
        !           811:       case OP_ONCE:
        !           812:       case OP_ONCE_NC:
        !           813:       case OP_ASSERT:
        !           814:       rc = set_start_bits(tcode, start_bits, utf8, cd);
        !           815:       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
        !           816:       if (rc == SSB_DONE) try_next = FALSE; else
        !           817:         {
        !           818:         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
        !           819:         tcode += 1 + LINK_SIZE;
        !           820:         }
        !           821:       break;
        !           822:
        !           823:       /* If we hit ALT or KET, it means we haven't found anything mandatory in
        !           824:       this branch, though we might have found something optional. For ALT, we
        !           825:       continue with the next alternative, but we have to arrange that the final
        !           826:       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
        !           827:       return SSB_CONTINUE: if this is the top level, that indicates failure,
        !           828:       but after a nested subpattern, it causes scanning to continue. */
        !           829:
        !           830:       case OP_ALT:
        !           831:       yield = SSB_CONTINUE;
        !           832:       try_next = FALSE;
        !           833:       break;
        !           834:
        !           835:       case OP_KET:
        !           836:       case OP_KETRMAX:
        !           837:       case OP_KETRMIN:
        !           838:       case OP_KETRPOS:
        !           839:       return SSB_CONTINUE;
        !           840:
        !           841:       /* Skip over callout */
        !           842:
        !           843:       case OP_CALLOUT:
        !           844:       tcode += 2 + 2*LINK_SIZE;
        !           845:       break;
        !           846:
        !           847:       /* Skip over lookbehind and negative lookahead assertions */
        !           848:
        !           849:       case OP_ASSERT_NOT:
        !           850:       case OP_ASSERTBACK:
        !           851:       case OP_ASSERTBACK_NOT:
        !           852:       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
        !           853:       tcode += 1 + LINK_SIZE;
        !           854:       break;
        !           855:
        !           856:       /* BRAZERO does the bracket, but carries on. */
        !           857:
        !           858:       case OP_BRAZERO:
        !           859:       case OP_BRAMINZERO:
        !           860:       case OP_BRAPOSZERO:
        !           861:       rc = set_start_bits(++tcode, start_bits, utf8, cd);
        !           862:       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
        !           863: /* =========================================================================
        !           864:       See the comment at the head of this function concerning the next line,
        !           865:       which was an old fudge for the benefit of OS/2.
        !           866:       dummy = 1;
        !           867:   ========================================================================= */
        !           868:       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
        !           869:       tcode += 1 + LINK_SIZE;
        !           870:       break;
        !           871:
        !           872:       /* SKIPZERO skips the bracket. */
        !           873:
        !           874:       case OP_SKIPZERO:
        !           875:       tcode++;
        !           876:       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
        !           877:       tcode += 1 + LINK_SIZE;
        !           878:       break;
        !           879:
        !           880:       /* Single-char * or ? sets the bit and tries the next item */
        !           881:
        !           882:       case OP_STAR:
        !           883:       case OP_MINSTAR:
        !           884:       case OP_POSSTAR:
        !           885:       case OP_QUERY:
        !           886:       case OP_MINQUERY:
        !           887:       case OP_POSQUERY:
        !           888:       tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
        !           889:       break;
        !           890:
        !           891:       case OP_STARI:
        !           892:       case OP_MINSTARI:
        !           893:       case OP_POSSTARI:
        !           894:       case OP_QUERYI:
        !           895:       case OP_MINQUERYI:
        !           896:       case OP_POSQUERYI:
        !           897:       tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
        !           898:       break;
        !           899:
        !           900:       /* Single-char upto sets the bit and tries the next */
        !           901:
        !           902:       case OP_UPTO:
        !           903:       case OP_MINUPTO:
        !           904:       case OP_POSUPTO:
        !           905:       tcode = set_table_bit(start_bits, tcode + 3, FALSE, cd, utf8);
        !           906:       break;
        !           907:
        !           908:       case OP_UPTOI:
        !           909:       case OP_MINUPTOI:
        !           910:       case OP_POSUPTOI:
        !           911:       tcode = set_table_bit(start_bits, tcode + 3, TRUE, cd, utf8);
        !           912:       break;
        !           913:
        !           914:       /* At least one single char sets the bit and stops */
        !           915:
        !           916:       case OP_EXACT:
        !           917:       tcode += 2;
        !           918:       /* Fall through */
        !           919:       case OP_CHAR:
        !           920:       case OP_PLUS:
        !           921:       case OP_MINPLUS:
        !           922:       case OP_POSPLUS:
        !           923:       (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
        !           924:       try_next = FALSE;
        !           925:       break;
        !           926:
        !           927:       case OP_EXACTI:
        !           928:       tcode += 2;
        !           929:       /* Fall through */
        !           930:       case OP_CHARI:
        !           931:       case OP_PLUSI:
        !           932:       case OP_MINPLUSI:
        !           933:       case OP_POSPLUSI:
        !           934:       (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
        !           935:       try_next = FALSE;
        !           936:       break;
        !           937:
        !           938:       /* Special spacing and line-terminating items. These recognize specific
        !           939:       lists of characters. The difference between VSPACE and ANYNL is that the
        !           940:       latter can match the two-character CRLF sequence, but that is not
        !           941:       relevant for finding the first character, so their code here is
        !           942:       identical. */
        !           943:
        !           944:       case OP_HSPACE:
        !           945:       SET_BIT(0x09);
        !           946:       SET_BIT(0x20);
        !           947:       if (utf8)
        !           948:         {
        !           949:         SET_BIT(0xC2);  /* For U+00A0 */
        !           950:         SET_BIT(0xE1);  /* For U+1680, U+180E */
        !           951:         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
        !           952:         SET_BIT(0xE3);  /* For U+3000 */
        !           953:         }
        !           954:       else SET_BIT(0xA0);
        !           955:       try_next = FALSE;
        !           956:       break;
        !           957:
        !           958:       case OP_ANYNL:
        !           959:       case OP_VSPACE:
        !           960:       SET_BIT(0x0A);
        !           961:       SET_BIT(0x0B);
        !           962:       SET_BIT(0x0C);
        !           963:       SET_BIT(0x0D);
        !           964:       if (utf8)
        !           965:         {
        !           966:         SET_BIT(0xC2);  /* For U+0085 */
        !           967:         SET_BIT(0xE2);  /* For U+2028, U+2029 */
        !           968:         }
        !           969:       else SET_BIT(0x85);
        !           970:       try_next = FALSE;
        !           971:       break;
        !           972:
        !           973:       /* Single character types set the bits and stop. Note that if PCRE_UCP
        !           974:       is set, we do not see these op codes because \d etc are converted to
        !           975:       properties. Therefore, these apply in the case when only characters less
        !           976:       than 256 are recognized to match the types. */
        !           977:
        !           978:       case OP_NOT_DIGIT:
        !           979:       set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
        !           980:       try_next = FALSE;
        !           981:       break;
        !           982:
        !           983:       case OP_DIGIT:
        !           984:       set_type_bits(start_bits, cbit_digit, table_limit, cd);
        !           985:       try_next = FALSE;
        !           986:       break;
        !           987:
        !           988:       /* The cbit_space table has vertical tab as whitespace; we have to
        !           989:       ensure it is set as not whitespace. */
        !           990:
        !           991:       case OP_NOT_WHITESPACE:
        !           992:       set_nottype_bits(start_bits, cbit_space, table_limit, cd);
        !           993:       start_bits[1] |= 0x08;
        !           994:       try_next = FALSE;
        !           995:       break;
        !           996:
        !           997:       /* The cbit_space table has vertical tab as whitespace; we have to
        !           998:       not set it from the table. */
        !           999:
        !          1000:       case OP_WHITESPACE:
        !          1001:       c = start_bits[1];    /* Save in case it was already set */
        !          1002:       set_type_bits(start_bits, cbit_space, table_limit, cd);
        !          1003:       start_bits[1] = (start_bits[1] & ~0x08) | c;
        !          1004:       try_next = FALSE;
        !          1005:       break;
        !          1006:
        !          1007:       case OP_NOT_WORDCHAR:
        !          1008:       set_nottype_bits(start_bits, cbit_word, table_limit, cd);
        !          1009:       try_next = FALSE;
        !          1010:       break;
        !          1011:
        !          1012:       case OP_WORDCHAR:
        !          1013:       set_type_bits(start_bits, cbit_word, table_limit, cd);
        !          1014:       try_next = FALSE;
        !          1015:       break;
        !          1016:
        !          1017:       /* One or more character type fudges the pointer and restarts, knowing
        !          1018:       it will hit a single character type and stop there. */
        !          1019:
        !          1020:       case OP_TYPEPLUS:
        !          1021:       case OP_TYPEMINPLUS:
        !          1022:       case OP_TYPEPOSPLUS:
        !          1023:       tcode++;
        !          1024:       break;
        !          1025:
        !          1026:       case OP_TYPEEXACT:
        !          1027:       tcode += 3;
        !          1028:       break;
        !          1029:
        !          1030:       /* Zero or more repeats of character types set the bits and then
        !          1031:       try again. */
        !          1032:
        !          1033:       case OP_TYPEUPTO:
        !          1034:       case OP_TYPEMINUPTO:
        !          1035:       case OP_TYPEPOSUPTO:
        !          1036:       tcode += 2;               /* Fall through */
        !          1037:
        !          1038:       case OP_TYPESTAR:
        !          1039:       case OP_TYPEMINSTAR:
        !          1040:       case OP_TYPEPOSSTAR:
        !          1041:       case OP_TYPEQUERY:
        !          1042:       case OP_TYPEMINQUERY:
        !          1043:       case OP_TYPEPOSQUERY:
        !          1044:       switch(tcode[1])
        !          1045:         {
        !          1046:         default:
        !          1047:         case OP_ANY:
        !          1048:         case OP_ALLANY:
        !          1049:         return SSB_FAIL;
        !          1050:
        !          1051:         case OP_HSPACE:
        !          1052:         SET_BIT(0x09);
        !          1053:         SET_BIT(0x20);
        !          1054:         if (utf8)
        !          1055:           {
        !          1056:           SET_BIT(0xC2);  /* For U+00A0 */
        !          1057:           SET_BIT(0xE1);  /* For U+1680, U+180E */
        !          1058:           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
        !          1059:           SET_BIT(0xE3);  /* For U+3000 */
        !          1060:           }
        !          1061:         else SET_BIT(0xA0);
        !          1062:         break;
        !          1063:
        !          1064:         case OP_ANYNL:
        !          1065:         case OP_VSPACE:
        !          1066:         SET_BIT(0x0A);
        !          1067:         SET_BIT(0x0B);
        !          1068:         SET_BIT(0x0C);
        !          1069:         SET_BIT(0x0D);
        !          1070:         if (utf8)
        !          1071:           {
        !          1072:           SET_BIT(0xC2);  /* For U+0085 */
        !          1073:           SET_BIT(0xE2);  /* For U+2028, U+2029 */
        !          1074:           }
        !          1075:         else SET_BIT(0x85);
        !          1076:         break;
        !          1077:
        !          1078:         case OP_NOT_DIGIT:
        !          1079:         set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
        !          1080:         break;
        !          1081:
        !          1082:         case OP_DIGIT:
        !          1083:         set_type_bits(start_bits, cbit_digit, table_limit, cd);
        !          1084:         break;
        !          1085:
        !          1086:         /* The cbit_space table has vertical tab as whitespace; we have to
        !          1087:         ensure it gets set as not whitespace. */
        !          1088:
        !          1089:         case OP_NOT_WHITESPACE:
        !          1090:         set_nottype_bits(start_bits, cbit_space, table_limit, cd);
        !          1091:         start_bits[1] |= 0x08;
        !          1092:         break;
        !          1093:
        !          1094:         /* The cbit_space table has vertical tab as whitespace; we have to
        !          1095:         avoid setting it. */
        !          1096:
        !          1097:         case OP_WHITESPACE:
        !          1098:         c = start_bits[1];    /* Save in case it was already set */
        !          1099:         set_type_bits(start_bits, cbit_space, table_limit, cd);
        !          1100:         start_bits[1] = (start_bits[1] & ~0x08) | c;
        !          1101:         break;
        !          1102:
        !          1103:         case OP_NOT_WORDCHAR:
        !          1104:         set_nottype_bits(start_bits, cbit_word, table_limit, cd);
        !          1105:         break;
        !          1106:
        !          1107:         case OP_WORDCHAR:
        !          1108:         set_type_bits(start_bits, cbit_word, table_limit, cd);
        !          1109:         break;
        !          1110:         }
        !          1111:
        !          1112:       tcode += 2;
        !          1113:       break;
        !          1114:
        !          1115:       /* Character class where all the information is in a bit map: set the
        !          1116:       bits and either carry on or not, according to the repeat count. If it was
        !          1117:       a negative class, and we are operating with UTF-8 characters, any byte
        !          1118:       with a value >= 0xc4 is a potentially valid starter because it starts a
        !          1119:       character with a value > 255. */
        !          1120:
        !          1121:       case OP_NCLASS:
        !          1122: #ifdef SUPPORT_UTF8
        !          1123:       if (utf8)
        !          1124:         {
        !          1125:         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc7 */
        !          1126:         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc8 - 0xff */
        !          1127:         }
        !          1128: #endif
        !          1129:       /* Fall through */
        !          1130:
        !          1131:       case OP_CLASS:
        !          1132:         {
        !          1133:         tcode++;
        !          1134:
        !          1135:         /* In UTF-8 mode, the bits in a bit map correspond to character
        !          1136:         values, not to byte values. However, the bit map we are constructing is
        !          1137:         for byte values. So we have to do a conversion for characters whose
        !          1138:         value is > 127. In fact, there are only two possible starting bytes for
        !          1139:         characters in the range 128 - 255. */
        !          1140:
        !          1141: #ifdef SUPPORT_UTF8
        !          1142:         if (utf8)
        !          1143:           {
        !          1144:           for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
        !          1145:           for (c = 128; c < 256; c++)
        !          1146:             {
        !          1147:             if ((tcode[c/8] & (1 << (c&7))) != 0)   /* Bit test for char c */
        !          1148:               {
        !          1149:               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
        !          1150:               start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
        !          1151:               c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
        !          1152:               }
        !          1153:             }
        !          1154:           }
        !          1155:
        !          1156:         /* In non-UTF-8 mode, the two bit maps are completely compatible. */
        !          1157:
        !          1158:         else
        !          1159: #endif
        !          1160:           {
        !          1161:           for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
        !          1162:           }
        !          1163:
        !          1164:         /* Advance past the bit map, and act on what follows. For a zero
        !          1165:         minimum repeat, continue; otherwise stop processing. */
        !          1166:
        !          1167:         tcode += 32;
        !          1168:         switch (*tcode)
        !          1169:           {
        !          1170:           case OP_CRSTAR:
        !          1171:           case OP_CRMINSTAR:
        !          1172:           case OP_CRQUERY:
        !          1173:           case OP_CRMINQUERY:
        !          1174:           tcode++;
        !          1175:           break;
        !          1176:
        !          1177:           case OP_CRRANGE:
        !          1178:           case OP_CRMINRANGE:
        !          1179:           if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
        !          1180:             else try_next = FALSE;
        !          1181:           break;
        !          1182:
        !          1183:           default:
        !          1184:           try_next = FALSE;
        !          1185:           break;
        !          1186:           }
        !          1187:         }
        !          1188:       break; /* End of bitmap class handling */
        !          1189:
        !          1190:       }      /* End of switch */
        !          1191:     }        /* End of try_next loop */
        !          1192:
        !          1193:   code += GET(code, 1);   /* Advance to next branch */
        !          1194:   }
        !          1195: while (*code == OP_ALT);
        !          1196: return yield;
        !          1197: }
        !          1198: 
        !          1199: 
        !          1200: 
        !          1201: 
        !          1202: 
        !          1203: /*************************************************
        !          1204: *          Study a compiled expression           *
        !          1205: *************************************************/
        !          1206: 
        !          1207: /* This function is handed a compiled expression that it must study to produce
        !          1208: information that will speed up the matching. It returns a pcre_extra block
        !          1209: which then gets handed back to pcre_exec().
        !          1210: 
        !          1211: Arguments:
        !          1212:   re        points to the compiled expression
        !          1213:   options   contains option bits
        !          1214:   errorptr  points to where to place error messages;
        !          1215:             set NULL unless error
        !          1216: 
        !          1217: Returns:    pointer to a pcre_extra block, with study_data filled in and the
        !          1218:               appropriate flags set;
        !          1219:             NULL on error or if no optimization possible
        !          1220: */
        !          1221: 
        !          1222: PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
        !          1223: pcre_study(const pcre *external_re, int options, const char **errorptr)
        !          1224: {
        !          1225: int min;
        !          1226: BOOL bits_set = FALSE;
        !          1227: uschar start_bits[32];
        !          1228: pcre_extra *extra = NULL;
        !          1229: pcre_study_data *study;
        !          1230: const uschar *tables;
        !          1231: uschar *code;
        !          1232: compile_data compile_block;
        !          1233: const real_pcre *re = (const real_pcre *)external_re;
        !          1234: 
        !          1235: *errorptr = NULL;
        !          1236: 
        !          1237: if (re == NULL || re->magic_number != MAGIC_NUMBER)
        !          1238:   {
        !          1239:   *errorptr = "argument is not a compiled regular expression";
        !          1240:   return NULL;
        !          1241:   }
        !          1242: 
        !          1243: if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
        !          1244:   {
        !          1245:   *errorptr = "unknown or incorrect option bit(s) set";
        !          1246:   return NULL;
        !          1247:   }
        !          1248: 
        !          1249: code = (uschar *)re + re->name_table_offset +
        !          1250:   (re->name_count * re->name_entry_size);
        !          1251: 
        !          1252: /* For an anchored pattern, or an unanchored pattern that has a first char, or
        !          1253: a multiline pattern that matches only at "line starts", there is no point in
        !          1254: seeking a list of starting bytes. */
        !          1255: 
        !          1256: if ((re->options & PCRE_ANCHORED) == 0 &&
        !          1257:     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
        !          1258:   {
        !          1259:   int rc;
        !          1260: 
        !          1261:   /* Set the character tables in the block that is passed around */
        !          1262: 
        !          1263:   tables = re->tables;
        !          1264:   if (tables == NULL)
        !          1265:     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
        !          1266:     (void *)(&tables));
        !          1267: 
        !          1268:   compile_block.lcc = tables + lcc_offset;
        !          1269:   compile_block.fcc = tables + fcc_offset;
        !          1270:   compile_block.cbits = tables + cbits_offset;
        !          1271:   compile_block.ctypes = tables + ctypes_offset;
        !          1272: 
        !          1273:   /* See if we can find a fixed set of initial characters for the pattern. */
        !          1274: 
        !          1275:   memset(start_bits, 0, 32 * sizeof(uschar));
        !          1276:   rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
        !          1277:     &compile_block);
        !          1278:   bits_set = rc == SSB_DONE;
        !          1279:   if (rc == SSB_UNKNOWN)
        !          1280:     {
        !          1281:     *errorptr = "internal error: opcode not recognized";
        !          1282:     return NULL;
        !          1283:     }
        !          1284:   }
        !          1285: 
        !          1286: /* Find the minimum length of subject string. */
        !          1287: 
        !          1288: switch(min = find_minlength(code, code, re->options, 0))
        !          1289:   {
        !          1290:   case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
        !          1291:   case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
        !          1292:   default: break;
        !          1293:   }
        !          1294: 
        !          1295: /* If a set of starting bytes has been identified, or if the minimum length is
        !          1296: greater than zero, or if JIT optimization has been requested, get a pcre_extra
        !          1297: block and a pcre_study_data block. The study data is put in the latter, which
        !          1298: is pointed to by the former, which may also get additional data set later by
        !          1299: the calling program. At the moment, the size of pcre_study_data is fixed. We
        !          1300: nevertheless save it in a field for returning via the pcre_fullinfo() function
        !          1301: so that if it becomes variable in the future, we don't have to change that
        !          1302: code. */
        !          1303: 
        !          1304: if (bits_set || min > 0
        !          1305: #ifdef SUPPORT_JIT
        !          1306:     || (options & PCRE_STUDY_JIT_COMPILE) != 0
        !          1307: #endif
        !          1308:   )
        !          1309:   {
        !          1310:   extra = (pcre_extra *)(pcre_malloc)
        !          1311:     (sizeof(pcre_extra) + sizeof(pcre_study_data));
        !          1312:   if (extra == NULL)
        !          1313:     {
        !          1314:     *errorptr = "failed to get memory";
        !          1315:     return NULL;
        !          1316:     }
        !          1317: 
        !          1318:   study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
        !          1319:   extra->flags = PCRE_EXTRA_STUDY_DATA;
        !          1320:   extra->study_data = study;
        !          1321: 
        !          1322:   study->size = sizeof(pcre_study_data);
        !          1323:   study->flags = 0;
        !          1324: 
        !          1325:   /* Set the start bits always, to avoid unset memory errors if the
        !          1326:   study data is written to a file, but set the flag only if any of the bits
        !          1327:   are set, to save time looking when none are. */
        !          1328: 
        !          1329:   if (bits_set)
        !          1330:     {
        !          1331:     study->flags |= PCRE_STUDY_MAPPED;
        !          1332:     memcpy(study->start_bits, start_bits, sizeof(start_bits));
        !          1333:     }
        !          1334:   else memset(study->start_bits, 0, 32 * sizeof(uschar));
        !          1335: 
        !          1336:   /* Always set the minlength value in the block, because the JIT compiler
        !          1337:   makes use of it. However, don't set the bit unless the length is greater than
        !          1338:   zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
        !          1339:   checking the zero case. */
        !          1340: 
        !          1341:   if (min > 0)
        !          1342:     {
        !          1343:     study->flags |= PCRE_STUDY_MINLEN;
        !          1344:     study->minlength = min;
        !          1345:     }
        !          1346:   else study->minlength = 0;
        !          1347: 
        !          1348:   /* If JIT support was compiled and requested, attempt the JIT compilation.
        !          1349:   If no starting bytes were found, and the minimum length is zero, and JIT
        !          1350:   compilation fails, abandon the extra block and return NULL. */
        !          1351: 
        !          1352: #ifdef SUPPORT_JIT
        !          1353:   extra->executable_jit = NULL;
        !          1354:   if ((options & PCRE_STUDY_JIT_COMPILE) != 0) _pcre_jit_compile(re, extra);
        !          1355:   if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0)
        !          1356:     {
        !          1357:     pcre_free_study(extra);
        !          1358:     extra = NULL;
        !          1359:     }
        !          1360: #endif
        !          1361:   }
        !          1362: 
        !          1363: return extra;
        !          1364: }
        !          1365: 
        !          1366: 
        !          1367: /*************************************************
        !          1368: *          Free the study data                   *
        !          1369: *************************************************/
        !          1370: 
        !          1371: /* This function frees the memory that was obtained by pcre_study().
        !          1372: 
        !          1373: Argument:   a pointer to the pcre_extra block
        !          1374: Returns:    nothing
        !          1375: */
        !          1376: 
        !          1377: PCRE_EXP_DEFN void
        !          1378: pcre_free_study(pcre_extra *extra)
        !          1379: {
        !          1380: #ifdef SUPPORT_JIT
        !          1381: if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
        !          1382:      extra->executable_jit != NULL)
        !          1383:   _pcre_jit_free(extra->executable_jit);
        !          1384: #endif
        !          1385: pcre_free(extra);
        !          1386: }
        !          1387: 
        !          1388: /* End of pcre_study.c */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>