File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / pcre / pcre_study.c
Revision 1.1.1.2 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Tue Feb 21 23:50:25 2012 UTC (12 years, 4 months ago) by misho
Branches: pcre, MAIN
CVS tags: v8_30, HEAD
pcre

    1: /*************************************************
    2: *      Perl-Compatible Regular Expressions       *
    3: *************************************************/
    4: 
    5: /* PCRE is a library of functions to support regular expressions whose syntax
    6: and semantics are as close as possible to those of the Perl 5 language.
    7: 
    8:                        Written by Philip Hazel
    9:            Copyright (c) 1997-2012 University of Cambridge
   10: 
   11: -----------------------------------------------------------------------------
   12: Redistribution and use in source and binary forms, with or without
   13: modification, are permitted provided that the following conditions are met:
   14: 
   15:     * Redistributions of source code must retain the above copyright notice,
   16:       this list of conditions and the following disclaimer.
   17: 
   18:     * Redistributions in binary form must reproduce the above copyright
   19:       notice, this list of conditions and the following disclaimer in the
   20:       documentation and/or other materials provided with the distribution.
   21: 
   22:     * Neither the name of the University of Cambridge nor the names of its
   23:       contributors may be used to endorse or promote products derived from
   24:       this software without specific prior written permission.
   25: 
   26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
   30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   36: POSSIBILITY OF SUCH DAMAGE.
   37: -----------------------------------------------------------------------------
   38: */
   39: 
   40: 
   41: /* This module contains the external function pcre_study(), along with local
   42: supporting functions. */
   43: 
   44: 
   45: #ifdef HAVE_CONFIG_H
   46: #include "config.h"
   47: #endif
   48: 
   49: #include "pcre_internal.h"
   50: 
   51: #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
   52: 
   53: /* Returns from set_start_bits() */
   54: 
   55: enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
   56: 
   57: 
   58: 
   59: /*************************************************
   60: *   Find the minimum subject length for a group  *
   61: *************************************************/
   62: 
   63: /* Scan a parenthesized group and compute the minimum length of subject that
   64: is needed to match it. This is a lower bound; it does not mean there is a
   65: string of that length that matches. In UTF8 mode, the result is in characters
   66: rather than bytes.
   67: 
   68: Arguments:
   69:   code            pointer to start of group (the bracket)
   70:   startcode       pointer to start of the whole pattern
   71:   options         the compiling options
   72:   int             RECURSE depth
   73: 
   74: Returns:   the minimum length
   75:            -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
   76:            -2 internal error (missing capturing bracket)
   77:            -3 internal error (opcode not listed)
   78: */
   79: 
   80: static int
   81: find_minlength(const pcre_uchar *code, const pcre_uchar *startcode, int options,
   82:   int recurse_depth)
   83: {
   84: int length = -1;
   85: /* PCRE_UTF16 has the same value as PCRE_UTF8. */
   86: BOOL utf = (options & PCRE_UTF8) != 0;
   87: BOOL had_recurse = FALSE;
   88: register int branchlength = 0;
   89: register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
   90: 
   91: if (*code == OP_CBRA || *code == OP_SCBRA ||
   92:     *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
   93: 
   94: /* Scan along the opcodes for this branch. If we get to the end of the
   95: branch, check the length against that of the other branches. */
   96: 
   97: for (;;)
   98:   {
   99:   int d, min;
  100:   pcre_uchar *cs, *ce;
  101:   register int op = *cc;
  102: 
  103:   switch (op)
  104:     {
  105:     case OP_COND:
  106:     case OP_SCOND:
  107: 
  108:     /* If there is only one branch in a condition, the implied branch has zero
  109:     length, so we don't add anything. This covers the DEFINE "condition"
  110:     automatically. */
  111: 
  112:     cs = cc + GET(cc, 1);
  113:     if (*cs != OP_ALT)
  114:       {
  115:       cc = cs + 1 + LINK_SIZE;
  116:       break;
  117:       }
  118: 
  119:     /* Otherwise we can fall through and treat it the same as any other
  120:     subpattern. */
  121: 
  122:     case OP_CBRA:
  123:     case OP_SCBRA:
  124:     case OP_BRA:
  125:     case OP_SBRA:
  126:     case OP_CBRAPOS:
  127:     case OP_SCBRAPOS:
  128:     case OP_BRAPOS:
  129:     case OP_SBRAPOS:
  130:     case OP_ONCE:
  131:     case OP_ONCE_NC:
  132:     d = find_minlength(cc, startcode, options, recurse_depth);
  133:     if (d < 0) return d;
  134:     branchlength += d;
  135:     do cc += GET(cc, 1); while (*cc == OP_ALT);
  136:     cc += 1 + LINK_SIZE;
  137:     break;
  138: 
  139:     /* ACCEPT makes things far too complicated; we have to give up. */
  140: 
  141:     case OP_ACCEPT:
  142:     case OP_ASSERT_ACCEPT:
  143:     return -1;
  144: 
  145:     /* Reached end of a branch; if it's a ket it is the end of a nested
  146:     call. If it's ALT it is an alternation in a nested call. If it is END it's
  147:     the end of the outer call. All can be handled by the same code. If an
  148:     ACCEPT was previously encountered, use the length that was in force at that
  149:     time, and pass back the shortest ACCEPT length. */
  150: 
  151:     case OP_ALT:
  152:     case OP_KET:
  153:     case OP_KETRMAX:
  154:     case OP_KETRMIN:
  155:     case OP_KETRPOS:
  156:     case OP_END:
  157:     if (length < 0 || (!had_recurse && branchlength < length))
  158:       length = branchlength;
  159:     if (op != OP_ALT) return length;
  160:     cc += 1 + LINK_SIZE;
  161:     branchlength = 0;
  162:     had_recurse = FALSE;
  163:     break;
  164: 
  165:     /* Skip over assertive subpatterns */
  166: 
  167:     case OP_ASSERT:
  168:     case OP_ASSERT_NOT:
  169:     case OP_ASSERTBACK:
  170:     case OP_ASSERTBACK_NOT:
  171:     do cc += GET(cc, 1); while (*cc == OP_ALT);
  172:     /* Fall through */
  173: 
  174:     /* Skip over things that don't match chars */
  175: 
  176:     case OP_REVERSE:
  177:     case OP_CREF:
  178:     case OP_NCREF:
  179:     case OP_RREF:
  180:     case OP_NRREF:
  181:     case OP_DEF:
  182:     case OP_CALLOUT:
  183:     case OP_SOD:
  184:     case OP_SOM:
  185:     case OP_EOD:
  186:     case OP_EODN:
  187:     case OP_CIRC:
  188:     case OP_CIRCM:
  189:     case OP_DOLL:
  190:     case OP_DOLLM:
  191:     case OP_NOT_WORD_BOUNDARY:
  192:     case OP_WORD_BOUNDARY:
  193:     cc += PRIV(OP_lengths)[*cc];
  194:     break;
  195: 
  196:     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
  197: 
  198:     case OP_BRAZERO:
  199:     case OP_BRAMINZERO:
  200:     case OP_BRAPOSZERO:
  201:     case OP_SKIPZERO:
  202:     cc += PRIV(OP_lengths)[*cc];
  203:     do cc += GET(cc, 1); while (*cc == OP_ALT);
  204:     cc += 1 + LINK_SIZE;
  205:     break;
  206: 
  207:     /* Handle literal characters and + repetitions */
  208: 
  209:     case OP_CHAR:
  210:     case OP_CHARI:
  211:     case OP_NOT:
  212:     case OP_NOTI:
  213:     case OP_PLUS:
  214:     case OP_PLUSI:
  215:     case OP_MINPLUS:
  216:     case OP_MINPLUSI:
  217:     case OP_POSPLUS:
  218:     case OP_POSPLUSI:
  219:     case OP_NOTPLUS:
  220:     case OP_NOTPLUSI:
  221:     case OP_NOTMINPLUS:
  222:     case OP_NOTMINPLUSI:
  223:     case OP_NOTPOSPLUS:
  224:     case OP_NOTPOSPLUSI:
  225:     branchlength++;
  226:     cc += 2;
  227: #ifdef SUPPORT_UTF
  228:     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
  229: #endif
  230:     break;
  231: 
  232:     case OP_TYPEPLUS:
  233:     case OP_TYPEMINPLUS:
  234:     case OP_TYPEPOSPLUS:
  235:     branchlength++;
  236:     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
  237:     break;
  238: 
  239:     /* Handle exact repetitions. The count is already in characters, but we
  240:     need to skip over a multibyte character in UTF8 mode.  */
  241: 
  242:     case OP_EXACT:
  243:     case OP_EXACTI:
  244:     case OP_NOTEXACT:
  245:     case OP_NOTEXACTI:
  246:     branchlength += GET2(cc,1);
  247:     cc += 2 + IMM2_SIZE;
  248: #ifdef SUPPORT_UTF
  249:     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
  250: #endif
  251:     break;
  252: 
  253:     case OP_TYPEEXACT:
  254:     branchlength += GET2(cc,1);
  255:     cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
  256:       || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
  257:     break;
  258: 
  259:     /* Handle single-char non-literal matchers */
  260: 
  261:     case OP_PROP:
  262:     case OP_NOTPROP:
  263:     cc += 2;
  264:     /* Fall through */
  265: 
  266:     case OP_NOT_DIGIT:
  267:     case OP_DIGIT:
  268:     case OP_NOT_WHITESPACE:
  269:     case OP_WHITESPACE:
  270:     case OP_NOT_WORDCHAR:
  271:     case OP_WORDCHAR:
  272:     case OP_ANY:
  273:     case OP_ALLANY:
  274:     case OP_EXTUNI:
  275:     case OP_HSPACE:
  276:     case OP_NOT_HSPACE:
  277:     case OP_VSPACE:
  278:     case OP_NOT_VSPACE:
  279:     branchlength++;
  280:     cc++;
  281:     break;
  282: 
  283:     /* "Any newline" might match two characters, but it also might match just
  284:     one. */
  285: 
  286:     case OP_ANYNL:
  287:     branchlength += 1;
  288:     cc++;
  289:     break;
  290: 
  291:     /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
  292:     non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
  293:     appear, but leave the code, just in case.) */
  294: 
  295:     case OP_ANYBYTE:
  296: #ifdef SUPPORT_UTF
  297:     if (utf) return -1;
  298: #endif
  299:     branchlength++;
  300:     cc++;
  301:     break;
  302: 
  303:     /* For repeated character types, we have to test for \p and \P, which have
  304:     an extra two bytes of parameters. */
  305: 
  306:     case OP_TYPESTAR:
  307:     case OP_TYPEMINSTAR:
  308:     case OP_TYPEQUERY:
  309:     case OP_TYPEMINQUERY:
  310:     case OP_TYPEPOSSTAR:
  311:     case OP_TYPEPOSQUERY:
  312:     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
  313:     cc += PRIV(OP_lengths)[op];
  314:     break;
  315: 
  316:     case OP_TYPEUPTO:
  317:     case OP_TYPEMINUPTO:
  318:     case OP_TYPEPOSUPTO:
  319:     if (cc[1 + IMM2_SIZE] == OP_PROP
  320:       || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
  321:     cc += PRIV(OP_lengths)[op];
  322:     break;
  323: 
  324:     /* Check a class for variable quantification */
  325: 
  326: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
  327:     case OP_XCLASS:
  328:     cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
  329:     /* Fall through */
  330: #endif
  331: 
  332:     case OP_CLASS:
  333:     case OP_NCLASS:
  334:     cc += PRIV(OP_lengths)[OP_CLASS];
  335: 
  336:     switch (*cc)
  337:       {
  338:       case OP_CRPLUS:
  339:       case OP_CRMINPLUS:
  340:       branchlength++;
  341:       /* Fall through */
  342: 
  343:       case OP_CRSTAR:
  344:       case OP_CRMINSTAR:
  345:       case OP_CRQUERY:
  346:       case OP_CRMINQUERY:
  347:       cc++;
  348:       break;
  349: 
  350:       case OP_CRRANGE:
  351:       case OP_CRMINRANGE:
  352:       branchlength += GET2(cc,1);
  353:       cc += 1 + 2 * IMM2_SIZE;
  354:       break;
  355: 
  356:       default:
  357:       branchlength++;
  358:       break;
  359:       }
  360:     break;
  361: 
  362:     /* Backreferences and subroutine calls are treated in the same way: we find
  363:     the minimum length for the subpattern. A recursion, however, causes an
  364:     a flag to be set that causes the length of this branch to be ignored. The
  365:     logic is that a recursion can only make sense if there is another
  366:     alternation that stops the recursing. That will provide the minimum length
  367:     (when no recursion happens). A backreference within the group that it is
  368:     referencing behaves in the same way.
  369: 
  370:     If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
  371:     matches an empty string (by default it causes a matching failure), so in
  372:     that case we must set the minimum length to zero. */
  373: 
  374:     case OP_REF:
  375:     case OP_REFI:
  376:     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
  377:       {
  378:       ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
  379:       if (cs == NULL) return -2;
  380:       do ce += GET(ce, 1); while (*ce == OP_ALT);
  381:       if (cc > cs && cc < ce)
  382:         {
  383:         d = 0;
  384:         had_recurse = TRUE;
  385:         }
  386:       else
  387:         {
  388:         d = find_minlength(cs, startcode, options, recurse_depth);
  389:         }
  390:       }
  391:     else d = 0;
  392:     cc += 1 + IMM2_SIZE;
  393: 
  394:     /* Handle repeated back references */
  395: 
  396:     switch (*cc)
  397:       {
  398:       case OP_CRSTAR:
  399:       case OP_CRMINSTAR:
  400:       case OP_CRQUERY:
  401:       case OP_CRMINQUERY:
  402:       min = 0;
  403:       cc++;
  404:       break;
  405: 
  406:       case OP_CRPLUS:
  407:       case OP_CRMINPLUS:
  408:       min = 1;
  409:       cc++;
  410:       break;
  411: 
  412:       case OP_CRRANGE:
  413:       case OP_CRMINRANGE:
  414:       min = GET2(cc, 1);
  415:       cc += 1 + 2 * IMM2_SIZE;
  416:       break;
  417: 
  418:       default:
  419:       min = 1;
  420:       break;
  421:       }
  422: 
  423:     branchlength += min * d;
  424:     break;
  425: 
  426:     /* We can easily detect direct recursion, but not mutual recursion. This is
  427:     caught by a recursion depth count. */
  428: 
  429:     case OP_RECURSE:
  430:     cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
  431:     do ce += GET(ce, 1); while (*ce == OP_ALT);
  432:     if ((cc > cs && cc < ce) || recurse_depth > 10)
  433:       had_recurse = TRUE;
  434:     else
  435:       {
  436:       branchlength += find_minlength(cs, startcode, options, recurse_depth + 1);
  437:       }
  438:     cc += 1 + LINK_SIZE;
  439:     break;
  440: 
  441:     /* Anything else does not or need not match a character. We can get the
  442:     item's length from the table, but for those that can match zero occurrences
  443:     of a character, we must take special action for UTF-8 characters. As it
  444:     happens, the "NOT" versions of these opcodes are used at present only for
  445:     ASCII characters, so they could be omitted from this list. However, in
  446:     future that may change, so we include them here so as not to leave a
  447:     gotcha for a future maintainer. */
  448: 
  449:     case OP_UPTO:
  450:     case OP_UPTOI:
  451:     case OP_NOTUPTO:
  452:     case OP_NOTUPTOI:
  453:     case OP_MINUPTO:
  454:     case OP_MINUPTOI:
  455:     case OP_NOTMINUPTO:
  456:     case OP_NOTMINUPTOI:
  457:     case OP_POSUPTO:
  458:     case OP_POSUPTOI:
  459:     case OP_NOTPOSUPTO:
  460:     case OP_NOTPOSUPTOI:
  461: 
  462:     case OP_STAR:
  463:     case OP_STARI:
  464:     case OP_NOTSTAR:
  465:     case OP_NOTSTARI:
  466:     case OP_MINSTAR:
  467:     case OP_MINSTARI:
  468:     case OP_NOTMINSTAR:
  469:     case OP_NOTMINSTARI:
  470:     case OP_POSSTAR:
  471:     case OP_POSSTARI:
  472:     case OP_NOTPOSSTAR:
  473:     case OP_NOTPOSSTARI:
  474: 
  475:     case OP_QUERY:
  476:     case OP_QUERYI:
  477:     case OP_NOTQUERY:
  478:     case OP_NOTQUERYI:
  479:     case OP_MINQUERY:
  480:     case OP_MINQUERYI:
  481:     case OP_NOTMINQUERY:
  482:     case OP_NOTMINQUERYI:
  483:     case OP_POSQUERY:
  484:     case OP_POSQUERYI:
  485:     case OP_NOTPOSQUERY:
  486:     case OP_NOTPOSQUERYI:
  487: 
  488:     cc += PRIV(OP_lengths)[op];
  489: #ifdef SUPPORT_UTF
  490:     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
  491: #endif
  492:     break;
  493: 
  494:     /* Skip these, but we need to add in the name length. */
  495: 
  496:     case OP_MARK:
  497:     case OP_PRUNE_ARG:
  498:     case OP_SKIP_ARG:
  499:     case OP_THEN_ARG:
  500:     cc += PRIV(OP_lengths)[op] + cc[1];
  501:     break;
  502: 
  503:     /* The remaining opcodes are just skipped over. */
  504: 
  505:     case OP_CLOSE:
  506:     case OP_COMMIT:
  507:     case OP_FAIL:
  508:     case OP_PRUNE:
  509:     case OP_SET_SOM:
  510:     case OP_SKIP:
  511:     case OP_THEN:
  512:     cc += PRIV(OP_lengths)[op];
  513:     break;
  514: 
  515:     /* This should not occur: we list all opcodes explicitly so that when
  516:     new ones get added they are properly considered. */
  517: 
  518:     default:
  519:     return -3;
  520:     }
  521:   }
  522: /* Control never gets here */
  523: }
  524: 
  525: 
  526: 
  527: /*************************************************
  528: *      Set a bit and maybe its alternate case    *
  529: *************************************************/
  530: 
  531: /* Given a character, set its first byte's bit in the table, and also the
  532: corresponding bit for the other version of a letter if we are caseless. In
  533: UTF-8 mode, for characters greater than 127, we can only do the caseless thing
  534: when Unicode property support is available.
  535: 
  536: Arguments:
  537:   start_bits    points to the bit map
  538:   p             points to the character
  539:   caseless      the caseless flag
  540:   cd            the block with char table pointers
  541:   utf           TRUE for UTF-8 / UTF-16 mode
  542: 
  543: Returns:        pointer after the character
  544: */
  545: 
  546: static const pcre_uchar *
  547: set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
  548:   compile_data *cd, BOOL utf)
  549: {
  550: unsigned int c = *p;
  551: 
  552: #ifdef COMPILE_PCRE8
  553: SET_BIT(c);
  554: 
  555: #ifdef SUPPORT_UTF
  556: if (utf && c > 127)
  557:   {
  558:   GETCHARINC(c, p);
  559: #ifdef SUPPORT_UCP
  560:   if (caseless)
  561:     {
  562:     pcre_uchar buff[6];
  563:     c = UCD_OTHERCASE(c);
  564:     (void)PRIV(ord2utf)(c, buff);
  565:     SET_BIT(buff[0]);
  566:     }
  567: #endif
  568:   return p;
  569:   }
  570: #endif
  571: 
  572: /* Not UTF-8 mode, or character is less than 127. */
  573: 
  574: if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
  575: return p + 1;
  576: #endif
  577: 
  578: #ifdef COMPILE_PCRE16
  579: if (c > 0xff)
  580:   {
  581:   c = 0xff;
  582:   caseless = FALSE;
  583:   }
  584: SET_BIT(c);
  585: 
  586: #ifdef SUPPORT_UTF
  587: if (utf && c > 127)
  588:   {
  589:   GETCHARINC(c, p);
  590: #ifdef SUPPORT_UCP
  591:   if (caseless)
  592:     {
  593:     c = UCD_OTHERCASE(c);
  594:     if (c > 0xff)
  595:       c = 0xff;
  596:     SET_BIT(c);
  597:     }
  598: #endif
  599:   return p;
  600:   }
  601: #endif
  602: 
  603: if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
  604: return p + 1;
  605: #endif
  606: }
  607: 
  608: 
  609: 
  610: /*************************************************
  611: *     Set bits for a positive character type     *
  612: *************************************************/
  613: 
  614: /* This function sets starting bits for a character type. In UTF-8 mode, we can
  615: only do a direct setting for bytes less than 128, as otherwise there can be
  616: confusion with bytes in the middle of UTF-8 characters. In a "traditional"
  617: environment, the tables will only recognize ASCII characters anyway, but in at
  618: least one Windows environment, some higher bytes bits were set in the tables.
  619: So we deal with that case by considering the UTF-8 encoding.
  620: 
  621: Arguments:
  622:   start_bits     the starting bitmap
  623:   cbit type      the type of character wanted
  624:   table_limit    32 for non-UTF-8; 16 for UTF-8
  625:   cd             the block with char table pointers
  626: 
  627: Returns:         nothing
  628: */
  629: 
  630: static void
  631: set_type_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,
  632:   compile_data *cd)
  633: {
  634: register int c;
  635: for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
  636: #if defined SUPPORT_UTF && defined COMPILE_PCRE8
  637: if (table_limit == 32) return;
  638: for (c = 128; c < 256; c++)
  639:   {
  640:   if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
  641:     {
  642:     pcre_uchar buff[6];
  643:     (void)PRIV(ord2utf)(c, buff);
  644:     SET_BIT(buff[0]);
  645:     }
  646:   }
  647: #endif
  648: }
  649: 
  650: 
  651: /*************************************************
  652: *     Set bits for a negative character type     *
  653: *************************************************/
  654: 
  655: /* This function sets starting bits for a negative character type such as \D.
  656: In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
  657: otherwise there can be confusion with bytes in the middle of UTF-8 characters.
  658: Unlike in the positive case, where we can set appropriate starting bits for
  659: specific high-valued UTF-8 characters, in this case we have to set the bits for
  660: all high-valued characters. The lowest is 0xc2, but we overkill by starting at
  661: 0xc0 (192) for simplicity.
  662: 
  663: Arguments:
  664:   start_bits     the starting bitmap
  665:   cbit type      the type of character wanted
  666:   table_limit    32 for non-UTF-8; 16 for UTF-8
  667:   cd             the block with char table pointers
  668: 
  669: Returns:         nothing
  670: */
  671: 
  672: static void
  673: set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,
  674:   compile_data *cd)
  675: {
  676: register int c;
  677: for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
  678: #if defined SUPPORT_UTF && defined COMPILE_PCRE8
  679: if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
  680: #endif
  681: }
  682: 
  683: 
  684: 
  685: /*************************************************
  686: *          Create bitmap of starting bytes       *
  687: *************************************************/
  688: 
  689: /* This function scans a compiled unanchored expression recursively and
  690: attempts to build a bitmap of the set of possible starting bytes. As time goes
  691: by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
  692: useful for parenthesized groups in patterns such as (a*)b where the group
  693: provides some optional starting bytes but scanning must continue at the outer
  694: level to find at least one mandatory byte. At the outermost level, this
  695: function fails unless the result is SSB_DONE.
  696: 
  697: Arguments:
  698:   code         points to an expression
  699:   start_bits   points to a 32-byte table, initialized to 0
  700:   utf          TRUE if in UTF-8 / UTF-16 mode
  701:   cd           the block with char table pointers
  702: 
  703: Returns:       SSB_FAIL     => Failed to find any starting bytes
  704:                SSB_DONE     => Found mandatory starting bytes
  705:                SSB_CONTINUE => Found optional starting bytes
  706:                SSB_UNKNOWN  => Hit an unrecognized opcode
  707: */
  708: 
  709: static int
  710: set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
  711:   compile_data *cd)
  712: {
  713: register int c;
  714: int yield = SSB_DONE;
  715: #if defined SUPPORT_UTF && defined COMPILE_PCRE8
  716: int table_limit = utf? 16:32;
  717: #else
  718: int table_limit = 32;
  719: #endif
  720: 
  721: #if 0
  722: /* ========================================================================= */
  723: /* The following comment and code was inserted in January 1999. In May 2006,
  724: when it was observed to cause compiler warnings about unused values, I took it
  725: out again. If anybody is still using OS/2, they will have to put it back
  726: manually. */
  727: 
  728: /* This next statement and the later reference to dummy are here in order to
  729: trick the optimizer of the IBM C compiler for OS/2 into generating correct
  730: code. Apparently IBM isn't going to fix the problem, and we would rather not
  731: disable optimization (in this module it actually makes a big difference, and
  732: the pcre module can use all the optimization it can get). */
  733: 
  734: volatile int dummy;
  735: /* ========================================================================= */
  736: #endif
  737: 
  738: do
  739:   {
  740:   BOOL try_next = TRUE;
  741:   const pcre_uchar *tcode = code + 1 + LINK_SIZE;
  742: 
  743:   if (*code == OP_CBRA || *code == OP_SCBRA ||
  744:       *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
  745: 
  746:   while (try_next)    /* Loop for items in this branch */
  747:     {
  748:     int rc;
  749: 
  750:     switch(*tcode)
  751:       {
  752:       /* If we reach something we don't understand, it means a new opcode has
  753:       been created that hasn't been added to this code. Hopefully this problem
  754:       will be discovered during testing. */
  755: 
  756:       default:
  757:       return SSB_UNKNOWN;
  758: 
  759:       /* Fail for a valid opcode that implies no starting bits. */
  760: 
  761:       case OP_ACCEPT:
  762:       case OP_ASSERT_ACCEPT:
  763:       case OP_ALLANY:
  764:       case OP_ANY:
  765:       case OP_ANYBYTE:
  766:       case OP_CIRC:
  767:       case OP_CIRCM:
  768:       case OP_CLOSE:
  769:       case OP_COMMIT:
  770:       case OP_COND:
  771:       case OP_CREF:
  772:       case OP_DEF:
  773:       case OP_DOLL:
  774:       case OP_DOLLM:
  775:       case OP_END:
  776:       case OP_EOD:
  777:       case OP_EODN:
  778:       case OP_EXTUNI:
  779:       case OP_FAIL:
  780:       case OP_MARK:
  781:       case OP_NCREF:
  782:       case OP_NOT:
  783:       case OP_NOTEXACT:
  784:       case OP_NOTEXACTI:
  785:       case OP_NOTI:
  786:       case OP_NOTMINPLUS:
  787:       case OP_NOTMINPLUSI:
  788:       case OP_NOTMINQUERY:
  789:       case OP_NOTMINQUERYI:
  790:       case OP_NOTMINSTAR:
  791:       case OP_NOTMINSTARI:
  792:       case OP_NOTMINUPTO:
  793:       case OP_NOTMINUPTOI:
  794:       case OP_NOTPLUS:
  795:       case OP_NOTPLUSI:
  796:       case OP_NOTPOSPLUS:
  797:       case OP_NOTPOSPLUSI:
  798:       case OP_NOTPOSQUERY:
  799:       case OP_NOTPOSQUERYI:
  800:       case OP_NOTPOSSTAR:
  801:       case OP_NOTPOSSTARI:
  802:       case OP_NOTPOSUPTO:
  803:       case OP_NOTPOSUPTOI:
  804:       case OP_NOTPROP:
  805:       case OP_NOTQUERY:
  806:       case OP_NOTQUERYI:
  807:       case OP_NOTSTAR:
  808:       case OP_NOTSTARI:
  809:       case OP_NOTUPTO:
  810:       case OP_NOTUPTOI:
  811:       case OP_NOT_HSPACE:
  812:       case OP_NOT_VSPACE:
  813:       case OP_NRREF:
  814:       case OP_PROP:
  815:       case OP_PRUNE:
  816:       case OP_PRUNE_ARG:
  817:       case OP_RECURSE:
  818:       case OP_REF:
  819:       case OP_REFI:
  820:       case OP_REVERSE:
  821:       case OP_RREF:
  822:       case OP_SCOND:
  823:       case OP_SET_SOM:
  824:       case OP_SKIP:
  825:       case OP_SKIP_ARG:
  826:       case OP_SOD:
  827:       case OP_SOM:
  828:       case OP_THEN:
  829:       case OP_THEN_ARG:
  830: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
  831:       case OP_XCLASS:
  832: #endif
  833:       return SSB_FAIL;
  834: 
  835:       /* We can ignore word boundary tests. */
  836: 
  837:       case OP_WORD_BOUNDARY:
  838:       case OP_NOT_WORD_BOUNDARY:
  839:       tcode++;
  840:       break;
  841: 
  842:       /* If we hit a bracket or a positive lookahead assertion, recurse to set
  843:       bits from within the subpattern. If it can't find anything, we have to
  844:       give up. If it finds some mandatory character(s), we are done for this
  845:       branch. Otherwise, carry on scanning after the subpattern. */
  846: 
  847:       case OP_BRA:
  848:       case OP_SBRA:
  849:       case OP_CBRA:
  850:       case OP_SCBRA:
  851:       case OP_BRAPOS:
  852:       case OP_SBRAPOS:
  853:       case OP_CBRAPOS:
  854:       case OP_SCBRAPOS:
  855:       case OP_ONCE:
  856:       case OP_ONCE_NC:
  857:       case OP_ASSERT:
  858:       rc = set_start_bits(tcode, start_bits, utf, cd);
  859:       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
  860:       if (rc == SSB_DONE) try_next = FALSE; else
  861:         {
  862:         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
  863:         tcode += 1 + LINK_SIZE;
  864:         }
  865:       break;
  866: 
  867:       /* If we hit ALT or KET, it means we haven't found anything mandatory in
  868:       this branch, though we might have found something optional. For ALT, we
  869:       continue with the next alternative, but we have to arrange that the final
  870:       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
  871:       return SSB_CONTINUE: if this is the top level, that indicates failure,
  872:       but after a nested subpattern, it causes scanning to continue. */
  873: 
  874:       case OP_ALT:
  875:       yield = SSB_CONTINUE;
  876:       try_next = FALSE;
  877:       break;
  878: 
  879:       case OP_KET:
  880:       case OP_KETRMAX:
  881:       case OP_KETRMIN:
  882:       case OP_KETRPOS:
  883:       return SSB_CONTINUE;
  884: 
  885:       /* Skip over callout */
  886: 
  887:       case OP_CALLOUT:
  888:       tcode += 2 + 2*LINK_SIZE;
  889:       break;
  890: 
  891:       /* Skip over lookbehind and negative lookahead assertions */
  892: 
  893:       case OP_ASSERT_NOT:
  894:       case OP_ASSERTBACK:
  895:       case OP_ASSERTBACK_NOT:
  896:       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
  897:       tcode += 1 + LINK_SIZE;
  898:       break;
  899: 
  900:       /* BRAZERO does the bracket, but carries on. */
  901: 
  902:       case OP_BRAZERO:
  903:       case OP_BRAMINZERO:
  904:       case OP_BRAPOSZERO:
  905:       rc = set_start_bits(++tcode, start_bits, utf, cd);
  906:       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
  907: /* =========================================================================
  908:       See the comment at the head of this function concerning the next line,
  909:       which was an old fudge for the benefit of OS/2.
  910:       dummy = 1;
  911:   ========================================================================= */
  912:       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
  913:       tcode += 1 + LINK_SIZE;
  914:       break;
  915: 
  916:       /* SKIPZERO skips the bracket. */
  917: 
  918:       case OP_SKIPZERO:
  919:       tcode++;
  920:       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
  921:       tcode += 1 + LINK_SIZE;
  922:       break;
  923: 
  924:       /* Single-char * or ? sets the bit and tries the next item */
  925: 
  926:       case OP_STAR:
  927:       case OP_MINSTAR:
  928:       case OP_POSSTAR:
  929:       case OP_QUERY:
  930:       case OP_MINQUERY:
  931:       case OP_POSQUERY:
  932:       tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
  933:       break;
  934: 
  935:       case OP_STARI:
  936:       case OP_MINSTARI:
  937:       case OP_POSSTARI:
  938:       case OP_QUERYI:
  939:       case OP_MINQUERYI:
  940:       case OP_POSQUERYI:
  941:       tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
  942:       break;
  943: 
  944:       /* Single-char upto sets the bit and tries the next */
  945: 
  946:       case OP_UPTO:
  947:       case OP_MINUPTO:
  948:       case OP_POSUPTO:
  949:       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
  950:       break;
  951: 
  952:       case OP_UPTOI:
  953:       case OP_MINUPTOI:
  954:       case OP_POSUPTOI:
  955:       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
  956:       break;
  957: 
  958:       /* At least one single char sets the bit and stops */
  959: 
  960:       case OP_EXACT:
  961:       tcode += IMM2_SIZE;
  962:       /* Fall through */
  963:       case OP_CHAR:
  964:       case OP_PLUS:
  965:       case OP_MINPLUS:
  966:       case OP_POSPLUS:
  967:       (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
  968:       try_next = FALSE;
  969:       break;
  970: 
  971:       case OP_EXACTI:
  972:       tcode += IMM2_SIZE;
  973:       /* Fall through */
  974:       case OP_CHARI:
  975:       case OP_PLUSI:
  976:       case OP_MINPLUSI:
  977:       case OP_POSPLUSI:
  978:       (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
  979:       try_next = FALSE;
  980:       break;
  981: 
  982:       /* Special spacing and line-terminating items. These recognize specific
  983:       lists of characters. The difference between VSPACE and ANYNL is that the
  984:       latter can match the two-character CRLF sequence, but that is not
  985:       relevant for finding the first character, so their code here is
  986:       identical. */
  987: 
  988:       case OP_HSPACE:
  989:       SET_BIT(0x09);
  990:       SET_BIT(0x20);
  991: #ifdef SUPPORT_UTF
  992:       if (utf)
  993:         {
  994: #ifdef COMPILE_PCRE8
  995:         SET_BIT(0xC2);  /* For U+00A0 */
  996:         SET_BIT(0xE1);  /* For U+1680, U+180E */
  997:         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
  998:         SET_BIT(0xE3);  /* For U+3000 */
  999: #endif
 1000: #ifdef COMPILE_PCRE16
 1001:         SET_BIT(0xA0);
 1002:         SET_BIT(0xFF);  /* For characters > 255 */
 1003: #endif
 1004:         }
 1005:       else
 1006: #endif /* SUPPORT_UTF */
 1007:         {
 1008:         SET_BIT(0xA0);
 1009: #ifdef COMPILE_PCRE16
 1010:         SET_BIT(0xFF);  /* For characters > 255 */
 1011: #endif
 1012:         }
 1013:       try_next = FALSE;
 1014:       break;
 1015: 
 1016:       case OP_ANYNL:
 1017:       case OP_VSPACE:
 1018:       SET_BIT(0x0A);
 1019:       SET_BIT(0x0B);
 1020:       SET_BIT(0x0C);
 1021:       SET_BIT(0x0D);
 1022: #ifdef SUPPORT_UTF
 1023:       if (utf)
 1024:         {
 1025: #ifdef COMPILE_PCRE8
 1026:         SET_BIT(0xC2);  /* For U+0085 */
 1027:         SET_BIT(0xE2);  /* For U+2028, U+2029 */
 1028: #endif
 1029: #ifdef COMPILE_PCRE16
 1030:         SET_BIT(0x85);
 1031:         SET_BIT(0xFF);  /* For characters > 255 */
 1032: #endif
 1033:         }
 1034:       else
 1035: #endif /* SUPPORT_UTF */
 1036:         {
 1037:         SET_BIT(0x85);
 1038: #ifdef COMPILE_PCRE16
 1039:         SET_BIT(0xFF);  /* For characters > 255 */
 1040: #endif
 1041:         }
 1042:       try_next = FALSE;
 1043:       break;
 1044: 
 1045:       /* Single character types set the bits and stop. Note that if PCRE_UCP
 1046:       is set, we do not see these op codes because \d etc are converted to
 1047:       properties. Therefore, these apply in the case when only characters less
 1048:       than 256 are recognized to match the types. */
 1049: 
 1050:       case OP_NOT_DIGIT:
 1051:       set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
 1052:       try_next = FALSE;
 1053:       break;
 1054: 
 1055:       case OP_DIGIT:
 1056:       set_type_bits(start_bits, cbit_digit, table_limit, cd);
 1057:       try_next = FALSE;
 1058:       break;
 1059: 
 1060:       /* The cbit_space table has vertical tab as whitespace; we have to
 1061:       ensure it is set as not whitespace. */
 1062: 
 1063:       case OP_NOT_WHITESPACE:
 1064:       set_nottype_bits(start_bits, cbit_space, table_limit, cd);
 1065:       start_bits[1] |= 0x08;
 1066:       try_next = FALSE;
 1067:       break;
 1068: 
 1069:       /* The cbit_space table has vertical tab as whitespace; we have to
 1070:       not set it from the table. */
 1071: 
 1072:       case OP_WHITESPACE:
 1073:       c = start_bits[1];    /* Save in case it was already set */
 1074:       set_type_bits(start_bits, cbit_space, table_limit, cd);
 1075:       start_bits[1] = (start_bits[1] & ~0x08) | c;
 1076:       try_next = FALSE;
 1077:       break;
 1078: 
 1079:       case OP_NOT_WORDCHAR:
 1080:       set_nottype_bits(start_bits, cbit_word, table_limit, cd);
 1081:       try_next = FALSE;
 1082:       break;
 1083: 
 1084:       case OP_WORDCHAR:
 1085:       set_type_bits(start_bits, cbit_word, table_limit, cd);
 1086:       try_next = FALSE;
 1087:       break;
 1088: 
 1089:       /* One or more character type fudges the pointer and restarts, knowing
 1090:       it will hit a single character type and stop there. */
 1091: 
 1092:       case OP_TYPEPLUS:
 1093:       case OP_TYPEMINPLUS:
 1094:       case OP_TYPEPOSPLUS:
 1095:       tcode++;
 1096:       break;
 1097: 
 1098:       case OP_TYPEEXACT:
 1099:       tcode += 1 + IMM2_SIZE;
 1100:       break;
 1101: 
 1102:       /* Zero or more repeats of character types set the bits and then
 1103:       try again. */
 1104: 
 1105:       case OP_TYPEUPTO:
 1106:       case OP_TYPEMINUPTO:
 1107:       case OP_TYPEPOSUPTO:
 1108:       tcode += IMM2_SIZE;  /* Fall through */
 1109: 
 1110:       case OP_TYPESTAR:
 1111:       case OP_TYPEMINSTAR:
 1112:       case OP_TYPEPOSSTAR:
 1113:       case OP_TYPEQUERY:
 1114:       case OP_TYPEMINQUERY:
 1115:       case OP_TYPEPOSQUERY:
 1116:       switch(tcode[1])
 1117:         {
 1118:         default:
 1119:         case OP_ANY:
 1120:         case OP_ALLANY:
 1121:         return SSB_FAIL;
 1122: 
 1123:         case OP_HSPACE:
 1124:         SET_BIT(0x09);
 1125:         SET_BIT(0x20);
 1126: #ifdef COMPILE_PCRE8
 1127:         if (utf)
 1128:           {
 1129: #ifdef COMPILE_PCRE8
 1130:           SET_BIT(0xC2);  /* For U+00A0 */
 1131:           SET_BIT(0xE1);  /* For U+1680, U+180E */
 1132:           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
 1133:           SET_BIT(0xE3);  /* For U+3000 */
 1134: #endif
 1135: #ifdef COMPILE_PCRE16
 1136:           SET_BIT(0xA0);
 1137:           SET_BIT(0xFF);  /* For characters > 255 */
 1138: #endif
 1139:           }
 1140:         else
 1141: #endif /* SUPPORT_UTF */
 1142:           SET_BIT(0xA0);
 1143:         break;
 1144: 
 1145:         case OP_ANYNL:
 1146:         case OP_VSPACE:
 1147:         SET_BIT(0x0A);
 1148:         SET_BIT(0x0B);
 1149:         SET_BIT(0x0C);
 1150:         SET_BIT(0x0D);
 1151: #ifdef COMPILE_PCRE8
 1152:         if (utf)
 1153:           {
 1154: #ifdef COMPILE_PCRE8
 1155:           SET_BIT(0xC2);  /* For U+0085 */
 1156:           SET_BIT(0xE2);  /* For U+2028, U+2029 */
 1157: #endif
 1158: #ifdef COMPILE_PCRE16
 1159:           SET_BIT(0x85);
 1160:           SET_BIT(0xFF);  /* For characters > 255 */
 1161: #endif
 1162:           }
 1163:         else
 1164: #endif /* SUPPORT_UTF */
 1165:           SET_BIT(0x85);
 1166:         break;
 1167: 
 1168:         case OP_NOT_DIGIT:
 1169:         set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
 1170:         break;
 1171: 
 1172:         case OP_DIGIT:
 1173:         set_type_bits(start_bits, cbit_digit, table_limit, cd);
 1174:         break;
 1175: 
 1176:         /* The cbit_space table has vertical tab as whitespace; we have to
 1177:         ensure it gets set as not whitespace. */
 1178: 
 1179:         case OP_NOT_WHITESPACE:
 1180:         set_nottype_bits(start_bits, cbit_space, table_limit, cd);
 1181:         start_bits[1] |= 0x08;
 1182:         break;
 1183: 
 1184:         /* The cbit_space table has vertical tab as whitespace; we have to
 1185:         avoid setting it. */
 1186: 
 1187:         case OP_WHITESPACE:
 1188:         c = start_bits[1];    /* Save in case it was already set */
 1189:         set_type_bits(start_bits, cbit_space, table_limit, cd);
 1190:         start_bits[1] = (start_bits[1] & ~0x08) | c;
 1191:         break;
 1192: 
 1193:         case OP_NOT_WORDCHAR:
 1194:         set_nottype_bits(start_bits, cbit_word, table_limit, cd);
 1195:         break;
 1196: 
 1197:         case OP_WORDCHAR:
 1198:         set_type_bits(start_bits, cbit_word, table_limit, cd);
 1199:         break;
 1200:         }
 1201: 
 1202:       tcode += 2;
 1203:       break;
 1204: 
 1205:       /* Character class where all the information is in a bit map: set the
 1206:       bits and either carry on or not, according to the repeat count. If it was
 1207:       a negative class, and we are operating with UTF-8 characters, any byte
 1208:       with a value >= 0xc4 is a potentially valid starter because it starts a
 1209:       character with a value > 255. */
 1210: 
 1211:       case OP_NCLASS:
 1212: #if defined SUPPORT_UTF && defined COMPILE_PCRE8
 1213:       if (utf)
 1214:         {
 1215:         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
 1216:         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
 1217:         }
 1218: #endif
 1219: #ifdef COMPILE_PCRE16
 1220:       SET_BIT(0xFF);                         /* For characters > 255 */
 1221: #endif
 1222:       /* Fall through */
 1223: 
 1224:       case OP_CLASS:
 1225:         {
 1226:         pcre_uint8 *map;
 1227:         tcode++;
 1228:         map = (pcre_uint8 *)tcode;
 1229: 
 1230:         /* In UTF-8 mode, the bits in a bit map correspond to character
 1231:         values, not to byte values. However, the bit map we are constructing is
 1232:         for byte values. So we have to do a conversion for characters whose
 1233:         value is > 127. In fact, there are only two possible starting bytes for
 1234:         characters in the range 128 - 255. */
 1235: 
 1236: #if defined SUPPORT_UTF && defined COMPILE_PCRE8
 1237:         if (utf)
 1238:           {
 1239:           for (c = 0; c < 16; c++) start_bits[c] |= map[c];
 1240:           for (c = 128; c < 256; c++)
 1241:             {
 1242:             if ((map[c/8] && (1 << (c&7))) != 0)
 1243:               {
 1244:               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
 1245:               start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
 1246:               c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
 1247:               }
 1248:             }
 1249:           }
 1250:         else
 1251: #endif
 1252:           {
 1253:           /* In non-UTF-8 mode, the two bit maps are completely compatible. */
 1254:           for (c = 0; c < 32; c++) start_bits[c] |= map[c];
 1255:           }
 1256: 
 1257:         /* Advance past the bit map, and act on what follows. For a zero
 1258:         minimum repeat, continue; otherwise stop processing. */
 1259: 
 1260:         tcode += 32 / sizeof(pcre_uchar);
 1261:         switch (*tcode)
 1262:           {
 1263:           case OP_CRSTAR:
 1264:           case OP_CRMINSTAR:
 1265:           case OP_CRQUERY:
 1266:           case OP_CRMINQUERY:
 1267:           tcode++;
 1268:           break;
 1269: 
 1270:           case OP_CRRANGE:
 1271:           case OP_CRMINRANGE:
 1272:           if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
 1273:             else try_next = FALSE;
 1274:           break;
 1275: 
 1276:           default:
 1277:           try_next = FALSE;
 1278:           break;
 1279:           }
 1280:         }
 1281:       break; /* End of bitmap class handling */
 1282: 
 1283:       }      /* End of switch */
 1284:     }        /* End of try_next loop */
 1285: 
 1286:   code += GET(code, 1);   /* Advance to next branch */
 1287:   }
 1288: while (*code == OP_ALT);
 1289: return yield;
 1290: }
 1291: 
 1292: 
 1293: 
 1294: 
 1295: 
 1296: /*************************************************
 1297: *          Study a compiled expression           *
 1298: *************************************************/
 1299: 
 1300: /* This function is handed a compiled expression that it must study to produce
 1301: information that will speed up the matching. It returns a pcre[16]_extra block
 1302: which then gets handed back to pcre_exec().
 1303: 
 1304: Arguments:
 1305:   re        points to the compiled expression
 1306:   options   contains option bits
 1307:   errorptr  points to where to place error messages;
 1308:             set NULL unless error
 1309: 
 1310: Returns:    pointer to a pcre[16]_extra block, with study_data filled in and
 1311:               the appropriate flags set;
 1312:             NULL on error or if no optimization possible
 1313: */
 1314: 
 1315: #ifdef COMPILE_PCRE8
 1316: PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
 1317: pcre_study(const pcre *external_re, int options, const char **errorptr)
 1318: #else
 1319: PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
 1320: pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
 1321: #endif
 1322: {
 1323: int min;
 1324: BOOL bits_set = FALSE;
 1325: pcre_uint8 start_bits[32];
 1326: PUBL(extra) *extra = NULL;
 1327: pcre_study_data *study;
 1328: const pcre_uint8 *tables;
 1329: pcre_uchar *code;
 1330: compile_data compile_block;
 1331: const REAL_PCRE *re = (const REAL_PCRE *)external_re;
 1332: 
 1333: *errorptr = NULL;
 1334: 
 1335: if (re == NULL || re->magic_number != MAGIC_NUMBER)
 1336:   {
 1337:   *errorptr = "argument is not a compiled regular expression";
 1338:   return NULL;
 1339:   }
 1340: 
 1341: if ((re->flags & PCRE_MODE) == 0)
 1342:   {
 1343: #ifdef COMPILE_PCRE8
 1344:   *errorptr = "argument is compiled in 16 bit mode";
 1345: #else
 1346:   *errorptr = "argument is compiled in 8 bit mode";
 1347: #endif
 1348:   return NULL;
 1349:   }
 1350: 
 1351: if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
 1352:   {
 1353:   *errorptr = "unknown or incorrect option bit(s) set";
 1354:   return NULL;
 1355:   }
 1356: 
 1357: code = (pcre_uchar *)re + re->name_table_offset +
 1358:   (re->name_count * re->name_entry_size);
 1359: 
 1360: /* For an anchored pattern, or an unanchored pattern that has a first char, or
 1361: a multiline pattern that matches only at "line starts", there is no point in
 1362: seeking a list of starting bytes. */
 1363: 
 1364: if ((re->options & PCRE_ANCHORED) == 0 &&
 1365:     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
 1366:   {
 1367:   int rc;
 1368: 
 1369:   /* Set the character tables in the block that is passed around */
 1370: 
 1371:   tables = re->tables;
 1372: 
 1373: #ifdef COMPILE_PCRE8
 1374:   if (tables == NULL)
 1375:     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
 1376:     (void *)(&tables));
 1377: #else
 1378:   if (tables == NULL)
 1379:     (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
 1380:     (void *)(&tables));
 1381: #endif
 1382: 
 1383:   compile_block.lcc = tables + lcc_offset;
 1384:   compile_block.fcc = tables + fcc_offset;
 1385:   compile_block.cbits = tables + cbits_offset;
 1386:   compile_block.ctypes = tables + ctypes_offset;
 1387: 
 1388:   /* See if we can find a fixed set of initial characters for the pattern. */
 1389: 
 1390:   memset(start_bits, 0, 32 * sizeof(pcre_uint8));
 1391:   rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
 1392:     &compile_block);
 1393:   bits_set = rc == SSB_DONE;
 1394:   if (rc == SSB_UNKNOWN)
 1395:     {
 1396:     *errorptr = "internal error: opcode not recognized";
 1397:     return NULL;
 1398:     }
 1399:   }
 1400: 
 1401: /* Find the minimum length of subject string. */
 1402: 
 1403: switch(min = find_minlength(code, code, re->options, 0))
 1404:   {
 1405:   case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
 1406:   case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
 1407:   default: break;
 1408:   }
 1409: 
 1410: /* If a set of starting bytes has been identified, or if the minimum length is
 1411: greater than zero, or if JIT optimization has been requested, get a
 1412: pcre[16]_extra block and a pcre_study_data block. The study data is put in the
 1413: latter, which is pointed to by the former, which may also get additional data
 1414: set later by the calling program. At the moment, the size of pcre_study_data
 1415: is fixed. We nevertheless save it in a field for returning via the
 1416: pcre_fullinfo() function so that if it becomes variable in the future,
 1417: we don't have to change that code. */
 1418: 
 1419: if (bits_set || min > 0
 1420: #ifdef SUPPORT_JIT
 1421:     || (options & PCRE_STUDY_JIT_COMPILE) != 0
 1422: #endif
 1423:   )
 1424:   {
 1425:   extra = (PUBL(extra) *)(PUBL(malloc))
 1426:     (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
 1427:   if (extra == NULL)
 1428:     {
 1429:     *errorptr = "failed to get memory";
 1430:     return NULL;
 1431:     }
 1432: 
 1433:   study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
 1434:   extra->flags = PCRE_EXTRA_STUDY_DATA;
 1435:   extra->study_data = study;
 1436: 
 1437:   study->size = sizeof(pcre_study_data);
 1438:   study->flags = 0;
 1439: 
 1440:   /* Set the start bits always, to avoid unset memory errors if the
 1441:   study data is written to a file, but set the flag only if any of the bits
 1442:   are set, to save time looking when none are. */
 1443: 
 1444:   if (bits_set)
 1445:     {
 1446:     study->flags |= PCRE_STUDY_MAPPED;
 1447:     memcpy(study->start_bits, start_bits, sizeof(start_bits));
 1448:     }
 1449:   else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
 1450: 
 1451: #ifdef PCRE_DEBUG
 1452:   if (bits_set)
 1453:     {
 1454:     pcre_uint8 *ptr = start_bits;
 1455:     int i;
 1456: 
 1457:     printf("Start bits:\n");
 1458:     for (i = 0; i < 32; i++)
 1459:       printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
 1460:     }
 1461: #endif
 1462: 
 1463:   /* Always set the minlength value in the block, because the JIT compiler
 1464:   makes use of it. However, don't set the bit unless the length is greater than
 1465:   zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
 1466:   checking the zero case. */
 1467: 
 1468:   if (min > 0)
 1469:     {
 1470:     study->flags |= PCRE_STUDY_MINLEN;
 1471:     study->minlength = min;
 1472:     }
 1473:   else study->minlength = 0;
 1474: 
 1475:   /* If JIT support was compiled and requested, attempt the JIT compilation.
 1476:   If no starting bytes were found, and the minimum length is zero, and JIT
 1477:   compilation fails, abandon the extra block and return NULL. */
 1478: 
 1479: #ifdef SUPPORT_JIT
 1480:   extra->executable_jit = NULL;
 1481:   if ((options & PCRE_STUDY_JIT_COMPILE) != 0) PRIV(jit_compile)(re, extra);
 1482:   if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0)
 1483:     {
 1484: #ifdef COMPILE_PCRE8
 1485:     pcre_free_study(extra);
 1486: #endif
 1487: #ifdef COMPILE_PCRE16
 1488:     pcre16_free_study(extra);
 1489: #endif
 1490:     extra = NULL;
 1491:     }
 1492: #endif
 1493:   }
 1494: 
 1495: return extra;
 1496: }
 1497: 
 1498: 
 1499: /*************************************************
 1500: *          Free the study data                   *
 1501: *************************************************/
 1502: 
 1503: /* This function frees the memory that was obtained by pcre_study().
 1504: 
 1505: Argument:   a pointer to the pcre[16]_extra block
 1506: Returns:    nothing
 1507: */
 1508: 
 1509: #ifdef COMPILE_PCRE8
 1510: PCRE_EXP_DEFN void
 1511: pcre_free_study(pcre_extra *extra)
 1512: #else
 1513: PCRE_EXP_DEFN void
 1514: pcre16_free_study(pcre16_extra *extra)
 1515: #endif
 1516: {
 1517: if (extra == NULL)
 1518:   return;
 1519: #ifdef SUPPORT_JIT
 1520: if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
 1521:      extra->executable_jit != NULL)
 1522:   PRIV(jit_free)(extra->executable_jit);
 1523: #endif
 1524: PUBL(free)(extra);
 1525: }
 1526: 
 1527: /* End of pcre_study.c */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>