Annotation of embedaddon/php/ext/pcre/pcrelib/pcre_study.c, revision 1.1.1.1
1.1 misho 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /* PCRE is a library of functions to support regular expressions whose syntax
6: and semantics are as close as possible to those of the Perl 5 language.
7:
8: Written by Philip Hazel
9: Copyright (c) 1997-2010 University of Cambridge
10:
11: -----------------------------------------------------------------------------
12: Redistribution and use in source and binary forms, with or without
13: modification, are permitted provided that the following conditions are met:
14:
15: * Redistributions of source code must retain the above copyright notice,
16: this list of conditions and the following disclaimer.
17:
18: * Redistributions in binary form must reproduce the above copyright
19: notice, this list of conditions and the following disclaimer in the
20: documentation and/or other materials provided with the distribution.
21:
22: * Neither the name of the University of Cambridge nor the names of its
23: contributors may be used to endorse or promote products derived from
24: this software without specific prior written permission.
25:
26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36: POSSIBILITY OF SUCH DAMAGE.
37: -----------------------------------------------------------------------------
38: */
39:
40:
41: /* This module contains the external function pcre_study(), along with local
42: supporting functions. */
43:
44:
45: #include "config.h"
46:
47: #include "pcre_internal.h"
48:
49: #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
50:
51: /* Returns from set_start_bits() */
52:
53: enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
54:
55:
56:
57: /*************************************************
58: * Find the minimum subject length for a group *
59: *************************************************/
60:
61: /* Scan a parenthesized group and compute the minimum length of subject that
62: is needed to match it. This is a lower bound; it does not mean there is a
63: string of that length that matches. In UTF8 mode, the result is in characters
64: rather than bytes.
65:
66: Arguments:
67: code pointer to start of group (the bracket)
68: startcode pointer to start of the whole pattern
69: options the compiling options
70:
71: Returns: the minimum length
72: -1 if \C was encountered
73: -2 internal error (missing capturing bracket)
74: */
75:
76: static int
77: find_minlength(const uschar *code, const uschar *startcode, int options)
78: {
79: int length = -1;
80: BOOL utf8 = (options & PCRE_UTF8) != 0;
81: BOOL had_recurse = FALSE;
82: register int branchlength = 0;
83: register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
84:
85: if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
86:
87: /* Scan along the opcodes for this branch. If we get to the end of the
88: branch, check the length against that of the other branches. */
89:
90: for (;;)
91: {
92: int d, min;
93: uschar *cs, *ce;
94: register int op = *cc;
95:
96: switch (op)
97: {
98: case OP_COND:
99: case OP_SCOND:
100:
101: /* If there is only one branch in a condition, the implied branch has zero
102: length, so we don't add anything. This covers the DEFINE "condition"
103: automatically. */
104:
105: cs = cc + GET(cc, 1);
106: if (*cs != OP_ALT)
107: {
108: cc = cs + 1 + LINK_SIZE;
109: break;
110: }
111:
112: /* Otherwise we can fall through and treat it the same as any other
113: subpattern. */
114:
115: case OP_CBRA:
116: case OP_SCBRA:
117: case OP_BRA:
118: case OP_SBRA:
119: case OP_ONCE:
120: d = find_minlength(cc, startcode, options);
121: if (d < 0) return d;
122: branchlength += d;
123: do cc += GET(cc, 1); while (*cc == OP_ALT);
124: cc += 1 + LINK_SIZE;
125: break;
126:
127: /* Reached end of a branch; if it's a ket it is the end of a nested
128: call. If it's ALT it is an alternation in a nested call. If it is
129: END it's the end of the outer call. All can be handled by the same code. */
130:
131: case OP_ALT:
132: case OP_KET:
133: case OP_KETRMAX:
134: case OP_KETRMIN:
135: case OP_END:
136: if (length < 0 || (!had_recurse && branchlength < length))
137: length = branchlength;
138: if (*cc != OP_ALT) return length;
139: cc += 1 + LINK_SIZE;
140: branchlength = 0;
141: had_recurse = FALSE;
142: break;
143:
144: /* Skip over assertive subpatterns */
145:
146: case OP_ASSERT:
147: case OP_ASSERT_NOT:
148: case OP_ASSERTBACK:
149: case OP_ASSERTBACK_NOT:
150: do cc += GET(cc, 1); while (*cc == OP_ALT);
151: /* Fall through */
152:
153: /* Skip over things that don't match chars */
154:
155: case OP_REVERSE:
156: case OP_CREF:
157: case OP_NCREF:
158: case OP_RREF:
159: case OP_NRREF:
160: case OP_DEF:
161: case OP_OPT:
162: case OP_CALLOUT:
163: case OP_SOD:
164: case OP_SOM:
165: case OP_EOD:
166: case OP_EODN:
167: case OP_CIRC:
168: case OP_DOLL:
169: case OP_NOT_WORD_BOUNDARY:
170: case OP_WORD_BOUNDARY:
171: cc += _pcre_OP_lengths[*cc];
172: break;
173:
174: /* Skip over a subpattern that has a {0} or {0,x} quantifier */
175:
176: case OP_BRAZERO:
177: case OP_BRAMINZERO:
178: case OP_SKIPZERO:
179: cc += _pcre_OP_lengths[*cc];
180: do cc += GET(cc, 1); while (*cc == OP_ALT);
181: cc += 1 + LINK_SIZE;
182: break;
183:
184: /* Handle literal characters and + repetitions */
185:
186: case OP_CHAR:
187: case OP_CHARNC:
188: case OP_NOT:
189: case OP_PLUS:
190: case OP_MINPLUS:
191: case OP_POSPLUS:
192: case OP_NOTPLUS:
193: case OP_NOTMINPLUS:
194: case OP_NOTPOSPLUS:
195: branchlength++;
196: cc += 2;
197: #ifdef SUPPORT_UTF8
198: if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
199: #endif
200: break;
201:
202: case OP_TYPEPLUS:
203: case OP_TYPEMINPLUS:
204: case OP_TYPEPOSPLUS:
205: branchlength++;
206: cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
207: break;
208:
209: /* Handle exact repetitions. The count is already in characters, but we
210: need to skip over a multibyte character in UTF8 mode. */
211:
212: case OP_EXACT:
213: case OP_NOTEXACT:
214: branchlength += GET2(cc,1);
215: cc += 4;
216: #ifdef SUPPORT_UTF8
217: if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
218: #endif
219: break;
220:
221: case OP_TYPEEXACT:
222: branchlength += GET2(cc,1);
223: cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
224: break;
225:
226: /* Handle single-char non-literal matchers */
227:
228: case OP_PROP:
229: case OP_NOTPROP:
230: cc += 2;
231: /* Fall through */
232:
233: case OP_NOT_DIGIT:
234: case OP_DIGIT:
235: case OP_NOT_WHITESPACE:
236: case OP_WHITESPACE:
237: case OP_NOT_WORDCHAR:
238: case OP_WORDCHAR:
239: case OP_ANY:
240: case OP_ALLANY:
241: case OP_EXTUNI:
242: case OP_HSPACE:
243: case OP_NOT_HSPACE:
244: case OP_VSPACE:
245: case OP_NOT_VSPACE:
246: branchlength++;
247: cc++;
248: break;
249:
250: /* "Any newline" might match two characters */
251:
252: case OP_ANYNL:
253: branchlength += 2;
254: cc++;
255: break;
256:
257: /* The single-byte matcher means we can't proceed in UTF-8 mode */
258:
259: case OP_ANYBYTE:
260: #ifdef SUPPORT_UTF8
261: if (utf8) return -1;
262: #endif
263: branchlength++;
264: cc++;
265: break;
266:
267: /* For repeated character types, we have to test for \p and \P, which have
268: an extra two bytes of parameters. */
269:
270: case OP_TYPESTAR:
271: case OP_TYPEMINSTAR:
272: case OP_TYPEQUERY:
273: case OP_TYPEMINQUERY:
274: case OP_TYPEPOSSTAR:
275: case OP_TYPEPOSQUERY:
276: if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
277: cc += _pcre_OP_lengths[op];
278: break;
279:
280: case OP_TYPEUPTO:
281: case OP_TYPEMINUPTO:
282: case OP_TYPEPOSUPTO:
283: if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
284: cc += _pcre_OP_lengths[op];
285: break;
286:
287: /* Check a class for variable quantification */
288:
289: #ifdef SUPPORT_UTF8
290: case OP_XCLASS:
291: cc += GET(cc, 1) - 33;
292: /* Fall through */
293: #endif
294:
295: case OP_CLASS:
296: case OP_NCLASS:
297: cc += 33;
298:
299: switch (*cc)
300: {
301: case OP_CRPLUS:
302: case OP_CRMINPLUS:
303: branchlength++;
304: /* Fall through */
305:
306: case OP_CRSTAR:
307: case OP_CRMINSTAR:
308: case OP_CRQUERY:
309: case OP_CRMINQUERY:
310: cc++;
311: break;
312:
313: case OP_CRRANGE:
314: case OP_CRMINRANGE:
315: branchlength += GET2(cc,1);
316: cc += 5;
317: break;
318:
319: default:
320: branchlength++;
321: break;
322: }
323: break;
324:
325: /* Backreferences and subroutine calls are treated in the same way: we find
326: the minimum length for the subpattern. A recursion, however, causes an
327: a flag to be set that causes the length of this branch to be ignored. The
328: logic is that a recursion can only make sense if there is another
329: alternation that stops the recursing. That will provide the minimum length
330: (when no recursion happens). A backreference within the group that it is
331: referencing behaves in the same way.
332:
333: If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
334: matches an empty string (by default it causes a matching failure), so in
335: that case we must set the minimum length to zero. */
336:
337: case OP_REF:
338: if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
339: {
340: ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
341: if (cs == NULL) return -2;
342: do ce += GET(ce, 1); while (*ce == OP_ALT);
343: if (cc > cs && cc < ce)
344: {
345: d = 0;
346: had_recurse = TRUE;
347: }
348: else d = find_minlength(cs, startcode, options);
349: }
350: else d = 0;
351: cc += 3;
352:
353: /* Handle repeated back references */
354:
355: switch (*cc)
356: {
357: case OP_CRSTAR:
358: case OP_CRMINSTAR:
359: case OP_CRQUERY:
360: case OP_CRMINQUERY:
361: min = 0;
362: cc++;
363: break;
364:
365: case OP_CRRANGE:
366: case OP_CRMINRANGE:
367: min = GET2(cc, 1);
368: cc += 5;
369: break;
370:
371: default:
372: min = 1;
373: break;
374: }
375:
376: branchlength += min * d;
377: break;
378:
379: case OP_RECURSE:
380: cs = ce = (uschar *)startcode + GET(cc, 1);
381: if (cs == NULL) return -2;
382: do ce += GET(ce, 1); while (*ce == OP_ALT);
383: if (cc > cs && cc < ce)
384: had_recurse = TRUE;
385: else
386: branchlength += find_minlength(cs, startcode, options);
387: cc += 1 + LINK_SIZE;
388: break;
389:
390: /* Anything else does not or need not match a character. We can get the
391: item's length from the table, but for those that can match zero occurrences
392: of a character, we must take special action for UTF-8 characters. */
393:
394: case OP_UPTO:
395: case OP_NOTUPTO:
396: case OP_MINUPTO:
397: case OP_NOTMINUPTO:
398: case OP_POSUPTO:
399: case OP_STAR:
400: case OP_MINSTAR:
401: case OP_NOTMINSTAR:
402: case OP_POSSTAR:
403: case OP_NOTPOSSTAR:
404: case OP_QUERY:
405: case OP_MINQUERY:
406: case OP_NOTMINQUERY:
407: case OP_POSQUERY:
408: case OP_NOTPOSQUERY:
409: cc += _pcre_OP_lengths[op];
410: #ifdef SUPPORT_UTF8
411: if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
412: #endif
413: break;
414:
415: /* Skip these, but we need to add in the name length. */
416:
417: case OP_MARK:
418: case OP_PRUNE_ARG:
419: case OP_SKIP_ARG:
420: cc += _pcre_OP_lengths[op] + cc[1];
421: break;
422:
423: case OP_THEN_ARG:
424: cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
425: break;
426:
427: /* For the record, these are the opcodes that are matched by "default":
428: OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
429: OP_THEN. */
430:
431: default:
432: cc += _pcre_OP_lengths[op];
433: break;
434: }
435: }
436: /* Control never gets here */
437: }
438:
439:
440:
441: /*************************************************
442: * Set a bit and maybe its alternate case *
443: *************************************************/
444:
445: /* Given a character, set its first byte's bit in the table, and also the
446: corresponding bit for the other version of a letter if we are caseless. In
447: UTF-8 mode, for characters greater than 127, we can only do the caseless thing
448: when Unicode property support is available.
449:
450: Arguments:
451: start_bits points to the bit map
452: p points to the character
453: caseless the caseless flag
454: cd the block with char table pointers
455: utf8 TRUE for UTF-8 mode
456:
457: Returns: pointer after the character
458: */
459:
460: static const uschar *
461: set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
462: compile_data *cd, BOOL utf8)
463: {
464: unsigned int c = *p;
465:
466: SET_BIT(c);
467:
468: #ifdef SUPPORT_UTF8
469: if (utf8 && c > 127)
470: {
471: GETCHARINC(c, p);
472: #ifdef SUPPORT_UCP
473: if (caseless)
474: {
475: uschar buff[8];
476: c = UCD_OTHERCASE(c);
477: (void)_pcre_ord2utf8(c, buff);
478: SET_BIT(buff[0]);
479: }
480: #endif
481: return p;
482: }
483: #endif
484:
485: /* Not UTF-8 mode, or character is less than 127. */
486:
487: if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
488: return p + 1;
489: }
490:
491:
492:
493: /*************************************************
494: * Set bits for a positive character type *
495: *************************************************/
496:
497: /* This function sets starting bits for a character type. In UTF-8 mode, we can
498: only do a direct setting for bytes less than 128, as otherwise there can be
499: confusion with bytes in the middle of UTF-8 characters. In a "traditional"
500: environment, the tables will only recognize ASCII characters anyway, but in at
501: least one Windows environment, some higher bytes bits were set in the tables.
502: So we deal with that case by considering the UTF-8 encoding.
503:
504: Arguments:
505: start_bits the starting bitmap
506: cbit type the type of character wanted
507: table_limit 32 for non-UTF-8; 16 for UTF-8
508: cd the block with char table pointers
509:
510: Returns: nothing
511: */
512:
513: static void
514: set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
515: compile_data *cd)
516: {
517: register int c;
518: for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
519: if (table_limit == 32) return;
520: for (c = 128; c < 256; c++)
521: {
522: if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
523: {
524: uschar buff[8];
525: (void)_pcre_ord2utf8(c, buff);
526: SET_BIT(buff[0]);
527: }
528: }
529: }
530:
531:
532: /*************************************************
533: * Set bits for a negative character type *
534: *************************************************/
535:
536: /* This function sets starting bits for a negative character type such as \D.
537: In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
538: otherwise there can be confusion with bytes in the middle of UTF-8 characters.
539: Unlike in the positive case, where we can set appropriate starting bits for
540: specific high-valued UTF-8 characters, in this case we have to set the bits for
541: all high-valued characters. The lowest is 0xc2, but we overkill by starting at
542: 0xc0 (192) for simplicity.
543:
544: Arguments:
545: start_bits the starting bitmap
546: cbit type the type of character wanted
547: table_limit 32 for non-UTF-8; 16 for UTF-8
548: cd the block with char table pointers
549:
550: Returns: nothing
551: */
552:
553: static void
554: set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
555: compile_data *cd)
556: {
557: register int c;
558: for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
559: if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
560: }
561:
562:
563:
564: /*************************************************
565: * Create bitmap of starting bytes *
566: *************************************************/
567:
568: /* This function scans a compiled unanchored expression recursively and
569: attempts to build a bitmap of the set of possible starting bytes. As time goes
570: by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
571: useful for parenthesized groups in patterns such as (a*)b where the group
572: provides some optional starting bytes but scanning must continue at the outer
573: level to find at least one mandatory byte. At the outermost level, this
574: function fails unless the result is SSB_DONE.
575:
576: Arguments:
577: code points to an expression
578: start_bits points to a 32-byte table, initialized to 0
579: caseless the current state of the caseless flag
580: utf8 TRUE if in UTF-8 mode
581: cd the block with char table pointers
582:
583: Returns: SSB_FAIL => Failed to find any starting bytes
584: SSB_DONE => Found mandatory starting bytes
585: SSB_CONTINUE => Found optional starting bytes
586: */
587:
588: static int
589: set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
590: BOOL utf8, compile_data *cd)
591: {
592: register int c;
593: int yield = SSB_DONE;
594: int table_limit = utf8? 16:32;
595:
596: #if 0
597: /* ========================================================================= */
598: /* The following comment and code was inserted in January 1999. In May 2006,
599: when it was observed to cause compiler warnings about unused values, I took it
600: out again. If anybody is still using OS/2, they will have to put it back
601: manually. */
602:
603: /* This next statement and the later reference to dummy are here in order to
604: trick the optimizer of the IBM C compiler for OS/2 into generating correct
605: code. Apparently IBM isn't going to fix the problem, and we would rather not
606: disable optimization (in this module it actually makes a big difference, and
607: the pcre module can use all the optimization it can get). */
608:
609: volatile int dummy;
610: /* ========================================================================= */
611: #endif
612:
613: do
614: {
615: const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
616: BOOL try_next = TRUE;
617:
618: while (try_next) /* Loop for items in this branch */
619: {
620: int rc;
621: switch(*tcode)
622: {
623: /* Fail if we reach something we don't understand */
624:
625: default:
626: return SSB_FAIL;
627:
628: /* If we hit a bracket or a positive lookahead assertion, recurse to set
629: bits from within the subpattern. If it can't find anything, we have to
630: give up. If it finds some mandatory character(s), we are done for this
631: branch. Otherwise, carry on scanning after the subpattern. */
632:
633: case OP_BRA:
634: case OP_SBRA:
635: case OP_CBRA:
636: case OP_SCBRA:
637: case OP_ONCE:
638: case OP_ASSERT:
639: rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
640: if (rc == SSB_FAIL) return SSB_FAIL;
641: if (rc == SSB_DONE) try_next = FALSE; else
642: {
643: do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
644: tcode += 1 + LINK_SIZE;
645: }
646: break;
647:
648: /* If we hit ALT or KET, it means we haven't found anything mandatory in
649: this branch, though we might have found something optional. For ALT, we
650: continue with the next alternative, but we have to arrange that the final
651: result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
652: return SSB_CONTINUE: if this is the top level, that indicates failure,
653: but after a nested subpattern, it causes scanning to continue. */
654:
655: case OP_ALT:
656: yield = SSB_CONTINUE;
657: try_next = FALSE;
658: break;
659:
660: case OP_KET:
661: case OP_KETRMAX:
662: case OP_KETRMIN:
663: return SSB_CONTINUE;
664:
665: /* Skip over callout */
666:
667: case OP_CALLOUT:
668: tcode += 2 + 2*LINK_SIZE;
669: break;
670:
671: /* Skip over lookbehind and negative lookahead assertions */
672:
673: case OP_ASSERT_NOT:
674: case OP_ASSERTBACK:
675: case OP_ASSERTBACK_NOT:
676: do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
677: tcode += 1 + LINK_SIZE;
678: break;
679:
680: /* Skip over an option setting, changing the caseless flag */
681:
682: case OP_OPT:
683: caseless = (tcode[1] & PCRE_CASELESS) != 0;
684: tcode += 2;
685: break;
686:
687: /* BRAZERO does the bracket, but carries on. */
688:
689: case OP_BRAZERO:
690: case OP_BRAMINZERO:
691: if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
692: return SSB_FAIL;
693: /* =========================================================================
694: See the comment at the head of this function concerning the next line,
695: which was an old fudge for the benefit of OS/2.
696: dummy = 1;
697: ========================================================================= */
698: do tcode += GET(tcode,1); while (*tcode == OP_ALT);
699: tcode += 1 + LINK_SIZE;
700: break;
701:
702: /* SKIPZERO skips the bracket. */
703:
704: case OP_SKIPZERO:
705: tcode++;
706: do tcode += GET(tcode,1); while (*tcode == OP_ALT);
707: tcode += 1 + LINK_SIZE;
708: break;
709:
710: /* Single-char * or ? sets the bit and tries the next item */
711:
712: case OP_STAR:
713: case OP_MINSTAR:
714: case OP_POSSTAR:
715: case OP_QUERY:
716: case OP_MINQUERY:
717: case OP_POSQUERY:
718: tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
719: break;
720:
721: /* Single-char upto sets the bit and tries the next */
722:
723: case OP_UPTO:
724: case OP_MINUPTO:
725: case OP_POSUPTO:
726: tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
727: break;
728:
729: /* At least one single char sets the bit and stops */
730:
731: case OP_EXACT: /* Fall through */
732: tcode += 2;
733:
734: case OP_CHAR:
735: case OP_CHARNC:
736: case OP_PLUS:
737: case OP_MINPLUS:
738: case OP_POSPLUS:
739: (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
740: try_next = FALSE;
741: break;
742:
743: /* Special spacing and line-terminating items. These recognize specific
744: lists of characters. The difference between VSPACE and ANYNL is that the
745: latter can match the two-character CRLF sequence, but that is not
746: relevant for finding the first character, so their code here is
747: identical. */
748:
749: case OP_HSPACE:
750: SET_BIT(0x09);
751: SET_BIT(0x20);
752: if (utf8)
753: {
754: SET_BIT(0xC2); /* For U+00A0 */
755: SET_BIT(0xE1); /* For U+1680, U+180E */
756: SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
757: SET_BIT(0xE3); /* For U+3000 */
758: }
759: else SET_BIT(0xA0);
760: try_next = FALSE;
761: break;
762:
763: case OP_ANYNL:
764: case OP_VSPACE:
765: SET_BIT(0x0A);
766: SET_BIT(0x0B);
767: SET_BIT(0x0C);
768: SET_BIT(0x0D);
769: if (utf8)
770: {
771: SET_BIT(0xC2); /* For U+0085 */
772: SET_BIT(0xE2); /* For U+2028, U+2029 */
773: }
774: else SET_BIT(0x85);
775: try_next = FALSE;
776: break;
777:
778: /* Single character types set the bits and stop. Note that if PCRE_UCP
779: is set, we do not see these op codes because \d etc are converted to
780: properties. Therefore, these apply in the case when only characters less
781: than 256 are recognized to match the types. */
782:
783: case OP_NOT_DIGIT:
784: set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
785: try_next = FALSE;
786: break;
787:
788: case OP_DIGIT:
789: set_type_bits(start_bits, cbit_digit, table_limit, cd);
790: try_next = FALSE;
791: break;
792:
793: /* The cbit_space table has vertical tab as whitespace; we have to
794: ensure it is set as not whitespace. */
795:
796: case OP_NOT_WHITESPACE:
797: set_nottype_bits(start_bits, cbit_space, table_limit, cd);
798: start_bits[1] |= 0x08;
799: try_next = FALSE;
800: break;
801:
802: /* The cbit_space table has vertical tab as whitespace; we have to
803: not set it from the table. */
804:
805: case OP_WHITESPACE:
806: c = start_bits[1]; /* Save in case it was already set */
807: set_type_bits(start_bits, cbit_space, table_limit, cd);
808: start_bits[1] = (start_bits[1] & ~0x08) | c;
809: try_next = FALSE;
810: break;
811:
812: case OP_NOT_WORDCHAR:
813: set_nottype_bits(start_bits, cbit_word, table_limit, cd);
814: try_next = FALSE;
815: break;
816:
817: case OP_WORDCHAR:
818: set_type_bits(start_bits, cbit_word, table_limit, cd);
819: try_next = FALSE;
820: break;
821:
822: /* One or more character type fudges the pointer and restarts, knowing
823: it will hit a single character type and stop there. */
824:
825: case OP_TYPEPLUS:
826: case OP_TYPEMINPLUS:
827: case OP_TYPEPOSPLUS:
828: tcode++;
829: break;
830:
831: case OP_TYPEEXACT:
832: tcode += 3;
833: break;
834:
835: /* Zero or more repeats of character types set the bits and then
836: try again. */
837:
838: case OP_TYPEUPTO:
839: case OP_TYPEMINUPTO:
840: case OP_TYPEPOSUPTO:
841: tcode += 2; /* Fall through */
842:
843: case OP_TYPESTAR:
844: case OP_TYPEMINSTAR:
845: case OP_TYPEPOSSTAR:
846: case OP_TYPEQUERY:
847: case OP_TYPEMINQUERY:
848: case OP_TYPEPOSQUERY:
849: switch(tcode[1])
850: {
851: default:
852: case OP_ANY:
853: case OP_ALLANY:
854: return SSB_FAIL;
855:
856: case OP_HSPACE:
857: SET_BIT(0x09);
858: SET_BIT(0x20);
859: if (utf8)
860: {
861: SET_BIT(0xC2); /* For U+00A0 */
862: SET_BIT(0xE1); /* For U+1680, U+180E */
863: SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
864: SET_BIT(0xE3); /* For U+3000 */
865: }
866: else SET_BIT(0xA0);
867: break;
868:
869: case OP_ANYNL:
870: case OP_VSPACE:
871: SET_BIT(0x0A);
872: SET_BIT(0x0B);
873: SET_BIT(0x0C);
874: SET_BIT(0x0D);
875: if (utf8)
876: {
877: SET_BIT(0xC2); /* For U+0085 */
878: SET_BIT(0xE2); /* For U+2028, U+2029 */
879: }
880: else SET_BIT(0x85);
881: break;
882:
883: case OP_NOT_DIGIT:
884: set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
885: break;
886:
887: case OP_DIGIT:
888: set_type_bits(start_bits, cbit_digit, table_limit, cd);
889: break;
890:
891: /* The cbit_space table has vertical tab as whitespace; we have to
892: ensure it gets set as not whitespace. */
893:
894: case OP_NOT_WHITESPACE:
895: set_nottype_bits(start_bits, cbit_space, table_limit, cd);
896: start_bits[1] |= 0x08;
897: break;
898:
899: /* The cbit_space table has vertical tab as whitespace; we have to
900: avoid setting it. */
901:
902: case OP_WHITESPACE:
903: c = start_bits[1]; /* Save in case it was already set */
904: set_type_bits(start_bits, cbit_space, table_limit, cd);
905: start_bits[1] = (start_bits[1] & ~0x08) | c;
906: break;
907:
908: case OP_NOT_WORDCHAR:
909: set_nottype_bits(start_bits, cbit_word, table_limit, cd);
910: break;
911:
912: case OP_WORDCHAR:
913: set_type_bits(start_bits, cbit_word, table_limit, cd);
914: break;
915: }
916:
917: tcode += 2;
918: break;
919:
920: /* Character class where all the information is in a bit map: set the
921: bits and either carry on or not, according to the repeat count. If it was
922: a negative class, and we are operating with UTF-8 characters, any byte
923: with a value >= 0xc4 is a potentially valid starter because it starts a
924: character with a value > 255. */
925:
926: case OP_NCLASS:
927: #ifdef SUPPORT_UTF8
928: if (utf8)
929: {
930: start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
931: memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
932: }
933: #endif
934: /* Fall through */
935:
936: case OP_CLASS:
937: {
938: tcode++;
939:
940: /* In UTF-8 mode, the bits in a bit map correspond to character
941: values, not to byte values. However, the bit map we are constructing is
942: for byte values. So we have to do a conversion for characters whose
943: value is > 127. In fact, there are only two possible starting bytes for
944: characters in the range 128 - 255. */
945:
946: #ifdef SUPPORT_UTF8
947: if (utf8)
948: {
949: for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
950: for (c = 128; c < 256; c++)
951: {
952: if ((tcode[c/8] && (1 << (c&7))) != 0)
953: {
954: int d = (c >> 6) | 0xc0; /* Set bit for this starter */
955: start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
956: c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
957: }
958: }
959: }
960:
961: /* In non-UTF-8 mode, the two bit maps are completely compatible. */
962:
963: else
964: #endif
965: {
966: for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
967: }
968:
969: /* Advance past the bit map, and act on what follows */
970:
971: tcode += 32;
972: switch (*tcode)
973: {
974: case OP_CRSTAR:
975: case OP_CRMINSTAR:
976: case OP_CRQUERY:
977: case OP_CRMINQUERY:
978: tcode++;
979: break;
980:
981: case OP_CRRANGE:
982: case OP_CRMINRANGE:
983: if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
984: else try_next = FALSE;
985: break;
986:
987: default:
988: try_next = FALSE;
989: break;
990: }
991: }
992: break; /* End of bitmap class handling */
993:
994: } /* End of switch */
995: } /* End of try_next loop */
996:
997: code += GET(code, 1); /* Advance to next branch */
998: }
999: while (*code == OP_ALT);
1000: return yield;
1001: }
1002:
1003:
1004:
1005: /*************************************************
1006: * Study a compiled expression *
1007: *************************************************/
1008:
1009: /* This function is handed a compiled expression that it must study to produce
1010: information that will speed up the matching. It returns a pcre_extra block
1011: which then gets handed back to pcre_exec().
1012:
1013: Arguments:
1014: re points to the compiled expression
1015: options contains option bits
1016: errorptr points to where to place error messages;
1017: set NULL unless error
1018:
1019: Returns: pointer to a pcre_extra block, with study_data filled in and the
1020: appropriate flags set;
1021: NULL on error or if no optimization possible
1022: */
1023:
1024: PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1025: pcre_study(const pcre *external_re, int options, const char **errorptr)
1026: {
1027: int min;
1028: BOOL bits_set = FALSE;
1029: uschar start_bits[32];
1030: pcre_extra *extra;
1031: pcre_study_data *study;
1032: const uschar *tables;
1033: uschar *code;
1034: compile_data compile_block;
1035: const real_pcre *re = (const real_pcre *)external_re;
1036:
1037: *errorptr = NULL;
1038:
1039: if (re == NULL || re->magic_number != MAGIC_NUMBER)
1040: {
1041: *errorptr = "argument is not a compiled regular expression";
1042: return NULL;
1043: }
1044:
1045: if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1046: {
1047: *errorptr = "unknown or incorrect option bit(s) set";
1048: return NULL;
1049: }
1050:
1051: code = (uschar *)re + re->name_table_offset +
1052: (re->name_count * re->name_entry_size);
1053:
1054: /* For an anchored pattern, or an unanchored pattern that has a first char, or
1055: a multiline pattern that matches only at "line starts", there is no point in
1056: seeking a list of starting bytes. */
1057:
1058: if ((re->options & PCRE_ANCHORED) == 0 &&
1059: (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1060: {
1061: /* Set the character tables in the block that is passed around */
1062:
1063: tables = re->tables;
1064: if (tables == NULL)
1065: (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1066: (void *)(&tables));
1067:
1068: compile_block.lcc = tables + lcc_offset;
1069: compile_block.fcc = tables + fcc_offset;
1070: compile_block.cbits = tables + cbits_offset;
1071: compile_block.ctypes = tables + ctypes_offset;
1072:
1073: /* See if we can find a fixed set of initial characters for the pattern. */
1074:
1075: memset(start_bits, 0, 32 * sizeof(uschar));
1076: bits_set = set_start_bits(code, start_bits,
1077: (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
1078: &compile_block) == SSB_DONE;
1079: }
1080:
1081: /* Find the minimum length of subject string. */
1082:
1083: min = find_minlength(code, code, re->options);
1084:
1085: /* Return NULL if no optimization is possible. */
1086:
1087: if (!bits_set && min < 0) return NULL;
1088:
1089: /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
1090: the latter, which is pointed to by the former, which may also get additional
1091: data set later by the calling program. At the moment, the size of
1092: pcre_study_data is fixed. We nevertheless save it in a field for returning via
1093: the pcre_fullinfo() function so that if it becomes variable in the future, we
1094: don't have to change that code. */
1095:
1096: extra = (pcre_extra *)(pcre_malloc)
1097: (sizeof(pcre_extra) + sizeof(pcre_study_data));
1098:
1099: if (extra == NULL)
1100: {
1101: *errorptr = "failed to get memory";
1102: return NULL;
1103: }
1104:
1105: study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
1106: extra->flags = PCRE_EXTRA_STUDY_DATA;
1107: extra->study_data = study;
1108:
1109: study->size = sizeof(pcre_study_data);
1110: study->flags = 0;
1111:
1112: if (bits_set)
1113: {
1114: study->flags |= PCRE_STUDY_MAPPED;
1115: memcpy(study->start_bits, start_bits, sizeof(start_bits));
1116: }
1117:
1118: if (min >= 0)
1119: {
1120: study->flags |= PCRE_STUDY_MINLEN;
1121: study->minlength = min;
1122: }
1123:
1124: return extra;
1125: }
1126:
1127: /* End of pcre_study.c */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>