1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /* PCRE is a library of functions to support regular expressions whose syntax
6: and semantics are as close as possible to those of the Perl 5 language.
7:
8: Written by Philip Hazel
9: Copyright (c) 1997-2012 University of Cambridge
10:
11: -----------------------------------------------------------------------------
12: Redistribution and use in source and binary forms, with or without
13: modification, are permitted provided that the following conditions are met:
14:
15: * Redistributions of source code must retain the above copyright notice,
16: this list of conditions and the following disclaimer.
17:
18: * Redistributions in binary form must reproduce the above copyright
19: notice, this list of conditions and the following disclaimer in the
20: documentation and/or other materials provided with the distribution.
21:
22: * Neither the name of the University of Cambridge nor the names of its
23: contributors may be used to endorse or promote products derived from
24: this software without specific prior written permission.
25:
26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36: POSSIBILITY OF SUCH DAMAGE.
37: -----------------------------------------------------------------------------
38: */
39:
40:
41: /* This module contains the external function pcre_study(), along with local
42: supporting functions. */
43:
44:
45: #ifdef HAVE_CONFIG_H
46: #include "config.h"
47: #endif
48:
49: #include "pcre_internal.h"
50:
51: #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52:
53: /* Returns from set_start_bits() */
54:
55: enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
56:
57:
58:
59: /*************************************************
60: * Find the minimum subject length for a group *
61: *************************************************/
62:
63: /* Scan a parenthesized group and compute the minimum length of subject that
64: is needed to match it. This is a lower bound; it does not mean there is a
65: string of that length that matches. In UTF8 mode, the result is in characters
66: rather than bytes.
67:
68: Arguments:
69: re compiled pattern block
70: code pointer to start of group (the bracket)
71: startcode pointer to start of the whole pattern's code
72: options the compiling options
73: int RECURSE depth
74:
75: Returns: the minimum length
76: -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
77: -2 internal error (missing capturing bracket)
78: -3 internal error (opcode not listed)
79: */
80:
81: static int
82: find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
83: const pcre_uchar *startcode, int options, int recurse_depth)
84: {
85: int length = -1;
86: /* PCRE_UTF16 has the same value as PCRE_UTF8. */
87: BOOL utf = (options & PCRE_UTF8) != 0;
88: BOOL had_recurse = FALSE;
89: register int branchlength = 0;
90: register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
91:
92: if (*code == OP_CBRA || *code == OP_SCBRA ||
93: *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
94:
95: /* Scan along the opcodes for this branch. If we get to the end of the
96: branch, check the length against that of the other branches. */
97:
98: for (;;)
99: {
100: int d, min;
101: pcre_uchar *cs, *ce;
102: register pcre_uchar op = *cc;
103:
104: switch (op)
105: {
106: case OP_COND:
107: case OP_SCOND:
108:
109: /* If there is only one branch in a condition, the implied branch has zero
110: length, so we don't add anything. This covers the DEFINE "condition"
111: automatically. */
112:
113: cs = cc + GET(cc, 1);
114: if (*cs != OP_ALT)
115: {
116: cc = cs + 1 + LINK_SIZE;
117: break;
118: }
119:
120: /* Otherwise we can fall through and treat it the same as any other
121: subpattern. */
122:
123: case OP_CBRA:
124: case OP_SCBRA:
125: case OP_BRA:
126: case OP_SBRA:
127: case OP_CBRAPOS:
128: case OP_SCBRAPOS:
129: case OP_BRAPOS:
130: case OP_SBRAPOS:
131: case OP_ONCE:
132: case OP_ONCE_NC:
133: d = find_minlength(re, cc, startcode, options, recurse_depth);
134: if (d < 0) return d;
135: branchlength += d;
136: do cc += GET(cc, 1); while (*cc == OP_ALT);
137: cc += 1 + LINK_SIZE;
138: break;
139:
140: /* ACCEPT makes things far too complicated; we have to give up. */
141:
142: case OP_ACCEPT:
143: case OP_ASSERT_ACCEPT:
144: return -1;
145:
146: /* Reached end of a branch; if it's a ket it is the end of a nested
147: call. If it's ALT it is an alternation in a nested call. If it is END it's
148: the end of the outer call. All can be handled by the same code. If an
149: ACCEPT was previously encountered, use the length that was in force at that
150: time, and pass back the shortest ACCEPT length. */
151:
152: case OP_ALT:
153: case OP_KET:
154: case OP_KETRMAX:
155: case OP_KETRMIN:
156: case OP_KETRPOS:
157: case OP_END:
158: if (length < 0 || (!had_recurse && branchlength < length))
159: length = branchlength;
160: if (op != OP_ALT) return length;
161: cc += 1 + LINK_SIZE;
162: branchlength = 0;
163: had_recurse = FALSE;
164: break;
165:
166: /* Skip over assertive subpatterns */
167:
168: case OP_ASSERT:
169: case OP_ASSERT_NOT:
170: case OP_ASSERTBACK:
171: case OP_ASSERTBACK_NOT:
172: do cc += GET(cc, 1); while (*cc == OP_ALT);
173: /* Fall through */
174:
175: /* Skip over things that don't match chars */
176:
177: case OP_REVERSE:
178: case OP_CREF:
179: case OP_DNCREF:
180: case OP_RREF:
181: case OP_DNRREF:
182: case OP_DEF:
183: case OP_CALLOUT:
184: case OP_SOD:
185: case OP_SOM:
186: case OP_EOD:
187: case OP_EODN:
188: case OP_CIRC:
189: case OP_CIRCM:
190: case OP_DOLL:
191: case OP_DOLLM:
192: case OP_NOT_WORD_BOUNDARY:
193: case OP_WORD_BOUNDARY:
194: cc += PRIV(OP_lengths)[*cc];
195: break;
196:
197: /* Skip over a subpattern that has a {0} or {0,x} quantifier */
198:
199: case OP_BRAZERO:
200: case OP_BRAMINZERO:
201: case OP_BRAPOSZERO:
202: case OP_SKIPZERO:
203: cc += PRIV(OP_lengths)[*cc];
204: do cc += GET(cc, 1); while (*cc == OP_ALT);
205: cc += 1 + LINK_SIZE;
206: break;
207:
208: /* Handle literal characters and + repetitions */
209:
210: case OP_CHAR:
211: case OP_CHARI:
212: case OP_NOT:
213: case OP_NOTI:
214: case OP_PLUS:
215: case OP_PLUSI:
216: case OP_MINPLUS:
217: case OP_MINPLUSI:
218: case OP_POSPLUS:
219: case OP_POSPLUSI:
220: case OP_NOTPLUS:
221: case OP_NOTPLUSI:
222: case OP_NOTMINPLUS:
223: case OP_NOTMINPLUSI:
224: case OP_NOTPOSPLUS:
225: case OP_NOTPOSPLUSI:
226: branchlength++;
227: cc += 2;
228: #ifdef SUPPORT_UTF
229: if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
230: #endif
231: break;
232:
233: case OP_TYPEPLUS:
234: case OP_TYPEMINPLUS:
235: case OP_TYPEPOSPLUS:
236: branchlength++;
237: cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
238: break;
239:
240: /* Handle exact repetitions. The count is already in characters, but we
241: need to skip over a multibyte character in UTF8 mode. */
242:
243: case OP_EXACT:
244: case OP_EXACTI:
245: case OP_NOTEXACT:
246: case OP_NOTEXACTI:
247: branchlength += GET2(cc,1);
248: cc += 2 + IMM2_SIZE;
249: #ifdef SUPPORT_UTF
250: if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
251: #endif
252: break;
253:
254: case OP_TYPEEXACT:
255: branchlength += GET2(cc,1);
256: cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
257: || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
258: break;
259:
260: /* Handle single-char non-literal matchers */
261:
262: case OP_PROP:
263: case OP_NOTPROP:
264: cc += 2;
265: /* Fall through */
266:
267: case OP_NOT_DIGIT:
268: case OP_DIGIT:
269: case OP_NOT_WHITESPACE:
270: case OP_WHITESPACE:
271: case OP_NOT_WORDCHAR:
272: case OP_WORDCHAR:
273: case OP_ANY:
274: case OP_ALLANY:
275: case OP_EXTUNI:
276: case OP_HSPACE:
277: case OP_NOT_HSPACE:
278: case OP_VSPACE:
279: case OP_NOT_VSPACE:
280: branchlength++;
281: cc++;
282: break;
283:
284: /* "Any newline" might match two characters, but it also might match just
285: one. */
286:
287: case OP_ANYNL:
288: branchlength += 1;
289: cc++;
290: break;
291:
292: /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
293: non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
294: appear, but leave the code, just in case.) */
295:
296: case OP_ANYBYTE:
297: #ifdef SUPPORT_UTF
298: if (utf) return -1;
299: #endif
300: branchlength++;
301: cc++;
302: break;
303:
304: /* For repeated character types, we have to test for \p and \P, which have
305: an extra two bytes of parameters. */
306:
307: case OP_TYPESTAR:
308: case OP_TYPEMINSTAR:
309: case OP_TYPEQUERY:
310: case OP_TYPEMINQUERY:
311: case OP_TYPEPOSSTAR:
312: case OP_TYPEPOSQUERY:
313: if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
314: cc += PRIV(OP_lengths)[op];
315: break;
316:
317: case OP_TYPEUPTO:
318: case OP_TYPEMINUPTO:
319: case OP_TYPEPOSUPTO:
320: if (cc[1 + IMM2_SIZE] == OP_PROP
321: || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
322: cc += PRIV(OP_lengths)[op];
323: break;
324:
325: /* Check a class for variable quantification */
326:
327: case OP_CLASS:
328: case OP_NCLASS:
329: #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
330: case OP_XCLASS:
331: /* The original code caused an unsigned overflow in 64 bit systems,
332: so now we use a conditional statement. */
333: if (op == OP_XCLASS)
334: cc += GET(cc, 1);
335: else
336: cc += PRIV(OP_lengths)[OP_CLASS];
337: #else
338: cc += PRIV(OP_lengths)[OP_CLASS];
339: #endif
340:
341: switch (*cc)
342: {
343: case OP_CRPLUS:
344: case OP_CRMINPLUS:
345: case OP_CRPOSPLUS:
346: branchlength++;
347: /* Fall through */
348:
349: case OP_CRSTAR:
350: case OP_CRMINSTAR:
351: case OP_CRQUERY:
352: case OP_CRMINQUERY:
353: case OP_CRPOSSTAR:
354: case OP_CRPOSQUERY:
355: cc++;
356: break;
357:
358: case OP_CRRANGE:
359: case OP_CRMINRANGE:
360: case OP_CRPOSRANGE:
361: branchlength += GET2(cc,1);
362: cc += 1 + 2 * IMM2_SIZE;
363: break;
364:
365: default:
366: branchlength++;
367: break;
368: }
369: break;
370:
371: /* Backreferences and subroutine calls are treated in the same way: we find
372: the minimum length for the subpattern. A recursion, however, causes an
373: a flag to be set that causes the length of this branch to be ignored. The
374: logic is that a recursion can only make sense if there is another
375: alternation that stops the recursing. That will provide the minimum length
376: (when no recursion happens). A backreference within the group that it is
377: referencing behaves in the same way.
378:
379: If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
380: matches an empty string (by default it causes a matching failure), so in
381: that case we must set the minimum length to zero. */
382:
383: case OP_DNREF: /* Duplicate named pattern back reference */
384: case OP_DNREFI:
385: if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
386: {
387: int count = GET2(cc, 1+IMM2_SIZE);
388: pcre_uchar *slot = (pcre_uchar *)re +
389: re->name_table_offset + GET2(cc, 1) * re->name_entry_size;
390: d = INT_MAX;
391: while (count-- > 0)
392: {
393: ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
394: if (cs == NULL) return -2;
395: do ce += GET(ce, 1); while (*ce == OP_ALT);
396: if (cc > cs && cc < ce)
397: {
398: d = 0;
399: had_recurse = TRUE;
400: break;
401: }
402: else
403: {
404: int dd = find_minlength(re, cs, startcode, options, recurse_depth);
405: if (dd < d) d = dd;
406: }
407: slot += re->name_entry_size;
408: }
409: }
410: else d = 0;
411: cc += 1 + 2*IMM2_SIZE;
412: goto REPEAT_BACK_REFERENCE;
413:
414: case OP_REF: /* Single back reference */
415: case OP_REFI:
416: if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
417: {
418: ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
419: if (cs == NULL) return -2;
420: do ce += GET(ce, 1); while (*ce == OP_ALT);
421: if (cc > cs && cc < ce)
422: {
423: d = 0;
424: had_recurse = TRUE;
425: }
426: else
427: {
428: d = find_minlength(re, cs, startcode, options, recurse_depth);
429: }
430: }
431: else d = 0;
432: cc += 1 + IMM2_SIZE;
433:
434: /* Handle repeated back references */
435:
436: REPEAT_BACK_REFERENCE:
437: switch (*cc)
438: {
439: case OP_CRSTAR:
440: case OP_CRMINSTAR:
441: case OP_CRQUERY:
442: case OP_CRMINQUERY:
443: case OP_CRPOSSTAR:
444: case OP_CRPOSQUERY:
445: min = 0;
446: cc++;
447: break;
448:
449: case OP_CRPLUS:
450: case OP_CRMINPLUS:
451: case OP_CRPOSPLUS:
452: min = 1;
453: cc++;
454: break;
455:
456: case OP_CRRANGE:
457: case OP_CRMINRANGE:
458: case OP_CRPOSRANGE:
459: min = GET2(cc, 1);
460: cc += 1 + 2 * IMM2_SIZE;
461: break;
462:
463: default:
464: min = 1;
465: break;
466: }
467:
468: branchlength += min * d;
469: break;
470:
471: /* We can easily detect direct recursion, but not mutual recursion. This is
472: caught by a recursion depth count. */
473:
474: case OP_RECURSE:
475: cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
476: do ce += GET(ce, 1); while (*ce == OP_ALT);
477: if ((cc > cs && cc < ce) || recurse_depth > 10)
478: had_recurse = TRUE;
479: else
480: {
481: branchlength += find_minlength(re, cs, startcode, options,
482: recurse_depth + 1);
483: }
484: cc += 1 + LINK_SIZE;
485: break;
486:
487: /* Anything else does not or need not match a character. We can get the
488: item's length from the table, but for those that can match zero occurrences
489: of a character, we must take special action for UTF-8 characters. As it
490: happens, the "NOT" versions of these opcodes are used at present only for
491: ASCII characters, so they could be omitted from this list. However, in
492: future that may change, so we include them here so as not to leave a
493: gotcha for a future maintainer. */
494:
495: case OP_UPTO:
496: case OP_UPTOI:
497: case OP_NOTUPTO:
498: case OP_NOTUPTOI:
499: case OP_MINUPTO:
500: case OP_MINUPTOI:
501: case OP_NOTMINUPTO:
502: case OP_NOTMINUPTOI:
503: case OP_POSUPTO:
504: case OP_POSUPTOI:
505: case OP_NOTPOSUPTO:
506: case OP_NOTPOSUPTOI:
507:
508: case OP_STAR:
509: case OP_STARI:
510: case OP_NOTSTAR:
511: case OP_NOTSTARI:
512: case OP_MINSTAR:
513: case OP_MINSTARI:
514: case OP_NOTMINSTAR:
515: case OP_NOTMINSTARI:
516: case OP_POSSTAR:
517: case OP_POSSTARI:
518: case OP_NOTPOSSTAR:
519: case OP_NOTPOSSTARI:
520:
521: case OP_QUERY:
522: case OP_QUERYI:
523: case OP_NOTQUERY:
524: case OP_NOTQUERYI:
525: case OP_MINQUERY:
526: case OP_MINQUERYI:
527: case OP_NOTMINQUERY:
528: case OP_NOTMINQUERYI:
529: case OP_POSQUERY:
530: case OP_POSQUERYI:
531: case OP_NOTPOSQUERY:
532: case OP_NOTPOSQUERYI:
533:
534: cc += PRIV(OP_lengths)[op];
535: #ifdef SUPPORT_UTF
536: if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
537: #endif
538: break;
539:
540: /* Skip these, but we need to add in the name length. */
541:
542: case OP_MARK:
543: case OP_PRUNE_ARG:
544: case OP_SKIP_ARG:
545: case OP_THEN_ARG:
546: cc += PRIV(OP_lengths)[op] + cc[1];
547: break;
548:
549: /* The remaining opcodes are just skipped over. */
550:
551: case OP_CLOSE:
552: case OP_COMMIT:
553: case OP_FAIL:
554: case OP_PRUNE:
555: case OP_SET_SOM:
556: case OP_SKIP:
557: case OP_THEN:
558: cc += PRIV(OP_lengths)[op];
559: break;
560:
561: /* This should not occur: we list all opcodes explicitly so that when
562: new ones get added they are properly considered. */
563:
564: default:
565: return -3;
566: }
567: }
568: /* Control never gets here */
569: }
570:
571:
572:
573: /*************************************************
574: * Set a bit and maybe its alternate case *
575: *************************************************/
576:
577: /* Given a character, set its first byte's bit in the table, and also the
578: corresponding bit for the other version of a letter if we are caseless. In
579: UTF-8 mode, for characters greater than 127, we can only do the caseless thing
580: when Unicode property support is available.
581:
582: Arguments:
583: start_bits points to the bit map
584: p points to the character
585: caseless the caseless flag
586: cd the block with char table pointers
587: utf TRUE for UTF-8 / UTF-16 / UTF-32 mode
588:
589: Returns: pointer after the character
590: */
591:
592: static const pcre_uchar *
593: set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
594: compile_data *cd, BOOL utf)
595: {
596: pcre_uint32 c = *p;
597:
598: #ifdef COMPILE_PCRE8
599: SET_BIT(c);
600:
601: #ifdef SUPPORT_UTF
602: if (utf && c > 127)
603: {
604: GETCHARINC(c, p);
605: #ifdef SUPPORT_UCP
606: if (caseless)
607: {
608: pcre_uchar buff[6];
609: c = UCD_OTHERCASE(c);
610: (void)PRIV(ord2utf)(c, buff);
611: SET_BIT(buff[0]);
612: }
613: #endif /* Not SUPPORT_UCP */
614: return p;
615: }
616: #else /* Not SUPPORT_UTF */
617: (void)(utf); /* Stops warning for unused parameter */
618: #endif /* SUPPORT_UTF */
619:
620: /* Not UTF-8 mode, or character is less than 127. */
621:
622: if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
623: return p + 1;
624: #endif /* COMPILE_PCRE8 */
625:
626: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
627: if (c > 0xff)
628: {
629: c = 0xff;
630: caseless = FALSE;
631: }
632: SET_BIT(c);
633:
634: #ifdef SUPPORT_UTF
635: if (utf && c > 127)
636: {
637: GETCHARINC(c, p);
638: #ifdef SUPPORT_UCP
639: if (caseless)
640: {
641: c = UCD_OTHERCASE(c);
642: if (c > 0xff)
643: c = 0xff;
644: SET_BIT(c);
645: }
646: #endif /* SUPPORT_UCP */
647: return p;
648: }
649: #else /* Not SUPPORT_UTF */
650: (void)(utf); /* Stops warning for unused parameter */
651: #endif /* SUPPORT_UTF */
652:
653: if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
654: return p + 1;
655: #endif
656: }
657:
658:
659:
660: /*************************************************
661: * Set bits for a positive character type *
662: *************************************************/
663:
664: /* This function sets starting bits for a character type. In UTF-8 mode, we can
665: only do a direct setting for bytes less than 128, as otherwise there can be
666: confusion with bytes in the middle of UTF-8 characters. In a "traditional"
667: environment, the tables will only recognize ASCII characters anyway, but in at
668: least one Windows environment, some higher bytes bits were set in the tables.
669: So we deal with that case by considering the UTF-8 encoding.
670:
671: Arguments:
672: start_bits the starting bitmap
673: cbit type the type of character wanted
674: table_limit 32 for non-UTF-8; 16 for UTF-8
675: cd the block with char table pointers
676:
677: Returns: nothing
678: */
679:
680: static void
681: set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
682: compile_data *cd)
683: {
684: register pcre_uint32 c;
685: for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
686: #if defined SUPPORT_UTF && defined COMPILE_PCRE8
687: if (table_limit == 32) return;
688: for (c = 128; c < 256; c++)
689: {
690: if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
691: {
692: pcre_uchar buff[6];
693: (void)PRIV(ord2utf)(c, buff);
694: SET_BIT(buff[0]);
695: }
696: }
697: #endif
698: }
699:
700:
701: /*************************************************
702: * Set bits for a negative character type *
703: *************************************************/
704:
705: /* This function sets starting bits for a negative character type such as \D.
706: In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
707: otherwise there can be confusion with bytes in the middle of UTF-8 characters.
708: Unlike in the positive case, where we can set appropriate starting bits for
709: specific high-valued UTF-8 characters, in this case we have to set the bits for
710: all high-valued characters. The lowest is 0xc2, but we overkill by starting at
711: 0xc0 (192) for simplicity.
712:
713: Arguments:
714: start_bits the starting bitmap
715: cbit type the type of character wanted
716: table_limit 32 for non-UTF-8; 16 for UTF-8
717: cd the block with char table pointers
718:
719: Returns: nothing
720: */
721:
722: static void
723: set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
724: compile_data *cd)
725: {
726: register pcre_uint32 c;
727: for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
728: #if defined SUPPORT_UTF && defined COMPILE_PCRE8
729: if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
730: #endif
731: }
732:
733:
734:
735: /*************************************************
736: * Create bitmap of starting bytes *
737: *************************************************/
738:
739: /* This function scans a compiled unanchored expression recursively and
740: attempts to build a bitmap of the set of possible starting bytes. As time goes
741: by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
742: useful for parenthesized groups in patterns such as (a*)b where the group
743: provides some optional starting bytes but scanning must continue at the outer
744: level to find at least one mandatory byte. At the outermost level, this
745: function fails unless the result is SSB_DONE.
746:
747: Arguments:
748: code points to an expression
749: start_bits points to a 32-byte table, initialized to 0
750: utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
751: cd the block with char table pointers
752:
753: Returns: SSB_FAIL => Failed to find any starting bytes
754: SSB_DONE => Found mandatory starting bytes
755: SSB_CONTINUE => Found optional starting bytes
756: SSB_UNKNOWN => Hit an unrecognized opcode
757: */
758:
759: static int
760: set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
761: compile_data *cd)
762: {
763: register pcre_uint32 c;
764: int yield = SSB_DONE;
765: #if defined SUPPORT_UTF && defined COMPILE_PCRE8
766: int table_limit = utf? 16:32;
767: #else
768: int table_limit = 32;
769: #endif
770:
771: #if 0
772: /* ========================================================================= */
773: /* The following comment and code was inserted in January 1999. In May 2006,
774: when it was observed to cause compiler warnings about unused values, I took it
775: out again. If anybody is still using OS/2, they will have to put it back
776: manually. */
777:
778: /* This next statement and the later reference to dummy are here in order to
779: trick the optimizer of the IBM C compiler for OS/2 into generating correct
780: code. Apparently IBM isn't going to fix the problem, and we would rather not
781: disable optimization (in this module it actually makes a big difference, and
782: the pcre module can use all the optimization it can get). */
783:
784: volatile int dummy;
785: /* ========================================================================= */
786: #endif
787:
788: do
789: {
790: BOOL try_next = TRUE;
791: const pcre_uchar *tcode = code + 1 + LINK_SIZE;
792:
793: if (*code == OP_CBRA || *code == OP_SCBRA ||
794: *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
795:
796: while (try_next) /* Loop for items in this branch */
797: {
798: int rc;
799:
800: switch(*tcode)
801: {
802: /* If we reach something we don't understand, it means a new opcode has
803: been created that hasn't been added to this code. Hopefully this problem
804: will be discovered during testing. */
805:
806: default:
807: return SSB_UNKNOWN;
808:
809: /* Fail for a valid opcode that implies no starting bits. */
810:
811: case OP_ACCEPT:
812: case OP_ASSERT_ACCEPT:
813: case OP_ALLANY:
814: case OP_ANY:
815: case OP_ANYBYTE:
816: case OP_CIRC:
817: case OP_CIRCM:
818: case OP_CLOSE:
819: case OP_COMMIT:
820: case OP_COND:
821: case OP_CREF:
822: case OP_DEF:
823: case OP_DNCREF:
824: case OP_DNREF:
825: case OP_DNREFI:
826: case OP_DNRREF:
827: case OP_DOLL:
828: case OP_DOLLM:
829: case OP_END:
830: case OP_EOD:
831: case OP_EODN:
832: case OP_EXTUNI:
833: case OP_FAIL:
834: case OP_MARK:
835: case OP_NOT:
836: case OP_NOTEXACT:
837: case OP_NOTEXACTI:
838: case OP_NOTI:
839: case OP_NOTMINPLUS:
840: case OP_NOTMINPLUSI:
841: case OP_NOTMINQUERY:
842: case OP_NOTMINQUERYI:
843: case OP_NOTMINSTAR:
844: case OP_NOTMINSTARI:
845: case OP_NOTMINUPTO:
846: case OP_NOTMINUPTOI:
847: case OP_NOTPLUS:
848: case OP_NOTPLUSI:
849: case OP_NOTPOSPLUS:
850: case OP_NOTPOSPLUSI:
851: case OP_NOTPOSQUERY:
852: case OP_NOTPOSQUERYI:
853: case OP_NOTPOSSTAR:
854: case OP_NOTPOSSTARI:
855: case OP_NOTPOSUPTO:
856: case OP_NOTPOSUPTOI:
857: case OP_NOTPROP:
858: case OP_NOTQUERY:
859: case OP_NOTQUERYI:
860: case OP_NOTSTAR:
861: case OP_NOTSTARI:
862: case OP_NOTUPTO:
863: case OP_NOTUPTOI:
864: case OP_NOT_HSPACE:
865: case OP_NOT_VSPACE:
866: case OP_PROP:
867: case OP_PRUNE:
868: case OP_PRUNE_ARG:
869: case OP_RECURSE:
870: case OP_REF:
871: case OP_REFI:
872: case OP_REVERSE:
873: case OP_RREF:
874: case OP_SCOND:
875: case OP_SET_SOM:
876: case OP_SKIP:
877: case OP_SKIP_ARG:
878: case OP_SOD:
879: case OP_SOM:
880: case OP_THEN:
881: case OP_THEN_ARG:
882: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
883: case OP_XCLASS:
884: #endif
885: return SSB_FAIL;
886:
887: /* We can ignore word boundary tests. */
888:
889: case OP_WORD_BOUNDARY:
890: case OP_NOT_WORD_BOUNDARY:
891: tcode++;
892: break;
893:
894: /* If we hit a bracket or a positive lookahead assertion, recurse to set
895: bits from within the subpattern. If it can't find anything, we have to
896: give up. If it finds some mandatory character(s), we are done for this
897: branch. Otherwise, carry on scanning after the subpattern. */
898:
899: case OP_BRA:
900: case OP_SBRA:
901: case OP_CBRA:
902: case OP_SCBRA:
903: case OP_BRAPOS:
904: case OP_SBRAPOS:
905: case OP_CBRAPOS:
906: case OP_SCBRAPOS:
907: case OP_ONCE:
908: case OP_ONCE_NC:
909: case OP_ASSERT:
910: rc = set_start_bits(tcode, start_bits, utf, cd);
911: if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
912: if (rc == SSB_DONE) try_next = FALSE; else
913: {
914: do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
915: tcode += 1 + LINK_SIZE;
916: }
917: break;
918:
919: /* If we hit ALT or KET, it means we haven't found anything mandatory in
920: this branch, though we might have found something optional. For ALT, we
921: continue with the next alternative, but we have to arrange that the final
922: result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
923: return SSB_CONTINUE: if this is the top level, that indicates failure,
924: but after a nested subpattern, it causes scanning to continue. */
925:
926: case OP_ALT:
927: yield = SSB_CONTINUE;
928: try_next = FALSE;
929: break;
930:
931: case OP_KET:
932: case OP_KETRMAX:
933: case OP_KETRMIN:
934: case OP_KETRPOS:
935: return SSB_CONTINUE;
936:
937: /* Skip over callout */
938:
939: case OP_CALLOUT:
940: tcode += 2 + 2*LINK_SIZE;
941: break;
942:
943: /* Skip over lookbehind and negative lookahead assertions */
944:
945: case OP_ASSERT_NOT:
946: case OP_ASSERTBACK:
947: case OP_ASSERTBACK_NOT:
948: do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
949: tcode += 1 + LINK_SIZE;
950: break;
951:
952: /* BRAZERO does the bracket, but carries on. */
953:
954: case OP_BRAZERO:
955: case OP_BRAMINZERO:
956: case OP_BRAPOSZERO:
957: rc = set_start_bits(++tcode, start_bits, utf, cd);
958: if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
959: /* =========================================================================
960: See the comment at the head of this function concerning the next line,
961: which was an old fudge for the benefit of OS/2.
962: dummy = 1;
963: ========================================================================= */
964: do tcode += GET(tcode,1); while (*tcode == OP_ALT);
965: tcode += 1 + LINK_SIZE;
966: break;
967:
968: /* SKIPZERO skips the bracket. */
969:
970: case OP_SKIPZERO:
971: tcode++;
972: do tcode += GET(tcode,1); while (*tcode == OP_ALT);
973: tcode += 1 + LINK_SIZE;
974: break;
975:
976: /* Single-char * or ? sets the bit and tries the next item */
977:
978: case OP_STAR:
979: case OP_MINSTAR:
980: case OP_POSSTAR:
981: case OP_QUERY:
982: case OP_MINQUERY:
983: case OP_POSQUERY:
984: tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
985: break;
986:
987: case OP_STARI:
988: case OP_MINSTARI:
989: case OP_POSSTARI:
990: case OP_QUERYI:
991: case OP_MINQUERYI:
992: case OP_POSQUERYI:
993: tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
994: break;
995:
996: /* Single-char upto sets the bit and tries the next */
997:
998: case OP_UPTO:
999: case OP_MINUPTO:
1000: case OP_POSUPTO:
1001: tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
1002: break;
1003:
1004: case OP_UPTOI:
1005: case OP_MINUPTOI:
1006: case OP_POSUPTOI:
1007: tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
1008: break;
1009:
1010: /* At least one single char sets the bit and stops */
1011:
1012: case OP_EXACT:
1013: tcode += IMM2_SIZE;
1014: /* Fall through */
1015: case OP_CHAR:
1016: case OP_PLUS:
1017: case OP_MINPLUS:
1018: case OP_POSPLUS:
1019: (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1020: try_next = FALSE;
1021: break;
1022:
1023: case OP_EXACTI:
1024: tcode += IMM2_SIZE;
1025: /* Fall through */
1026: case OP_CHARI:
1027: case OP_PLUSI:
1028: case OP_MINPLUSI:
1029: case OP_POSPLUSI:
1030: (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1031: try_next = FALSE;
1032: break;
1033:
1034: /* Special spacing and line-terminating items. These recognize specific
1035: lists of characters. The difference between VSPACE and ANYNL is that the
1036: latter can match the two-character CRLF sequence, but that is not
1037: relevant for finding the first character, so their code here is
1038: identical. */
1039:
1040: case OP_HSPACE:
1041: SET_BIT(CHAR_HT);
1042: SET_BIT(CHAR_SPACE);
1043: #ifdef SUPPORT_UTF
1044: if (utf)
1045: {
1046: #ifdef COMPILE_PCRE8
1047: SET_BIT(0xC2); /* For U+00A0 */
1048: SET_BIT(0xE1); /* For U+1680, U+180E */
1049: SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1050: SET_BIT(0xE3); /* For U+3000 */
1051: #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1052: SET_BIT(0xA0);
1053: SET_BIT(0xFF); /* For characters > 255 */
1054: #endif /* COMPILE_PCRE[8|16|32] */
1055: }
1056: else
1057: #endif /* SUPPORT_UTF */
1058: {
1059: #ifndef EBCDIC
1060: SET_BIT(0xA0);
1061: #endif /* Not EBCDIC */
1062: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1063: SET_BIT(0xFF); /* For characters > 255 */
1064: #endif /* COMPILE_PCRE[16|32] */
1065: }
1066: try_next = FALSE;
1067: break;
1068:
1069: case OP_ANYNL:
1070: case OP_VSPACE:
1071: SET_BIT(CHAR_LF);
1072: SET_BIT(CHAR_VT);
1073: SET_BIT(CHAR_FF);
1074: SET_BIT(CHAR_CR);
1075: #ifdef SUPPORT_UTF
1076: if (utf)
1077: {
1078: #ifdef COMPILE_PCRE8
1079: SET_BIT(0xC2); /* For U+0085 */
1080: SET_BIT(0xE2); /* For U+2028, U+2029 */
1081: #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1082: SET_BIT(CHAR_NEL);
1083: SET_BIT(0xFF); /* For characters > 255 */
1084: #endif /* COMPILE_PCRE[8|16|32] */
1085: }
1086: else
1087: #endif /* SUPPORT_UTF */
1088: {
1089: SET_BIT(CHAR_NEL);
1090: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1091: SET_BIT(0xFF); /* For characters > 255 */
1092: #endif
1093: }
1094: try_next = FALSE;
1095: break;
1096:
1097: /* Single character types set the bits and stop. Note that if PCRE_UCP
1098: is set, we do not see these op codes because \d etc are converted to
1099: properties. Therefore, these apply in the case when only characters less
1100: than 256 are recognized to match the types. */
1101:
1102: case OP_NOT_DIGIT:
1103: set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1104: try_next = FALSE;
1105: break;
1106:
1107: case OP_DIGIT:
1108: set_type_bits(start_bits, cbit_digit, table_limit, cd);
1109: try_next = FALSE;
1110: break;
1111:
1112: /* The cbit_space table has vertical tab as whitespace; we have to
1113: ensure it is set as not whitespace. Luckily, the code value is the same
1114: (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate bit. */
1115:
1116: case OP_NOT_WHITESPACE:
1117: set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1118: start_bits[1] |= 0x08;
1119: try_next = FALSE;
1120: break;
1121:
1122: /* The cbit_space table has vertical tab as whitespace; we have to not
1123: set it from the table. Luckily, the code value is the same (0x0b) in
1124: ASCII and EBCDIC, so we can just adjust the appropriate bit. */
1125:
1126: case OP_WHITESPACE:
1127: c = start_bits[1]; /* Save in case it was already set */
1128: set_type_bits(start_bits, cbit_space, table_limit, cd);
1129: start_bits[1] = (start_bits[1] & ~0x08) | c;
1130: try_next = FALSE;
1131: break;
1132:
1133: case OP_NOT_WORDCHAR:
1134: set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1135: try_next = FALSE;
1136: break;
1137:
1138: case OP_WORDCHAR:
1139: set_type_bits(start_bits, cbit_word, table_limit, cd);
1140: try_next = FALSE;
1141: break;
1142:
1143: /* One or more character type fudges the pointer and restarts, knowing
1144: it will hit a single character type and stop there. */
1145:
1146: case OP_TYPEPLUS:
1147: case OP_TYPEMINPLUS:
1148: case OP_TYPEPOSPLUS:
1149: tcode++;
1150: break;
1151:
1152: case OP_TYPEEXACT:
1153: tcode += 1 + IMM2_SIZE;
1154: break;
1155:
1156: /* Zero or more repeats of character types set the bits and then
1157: try again. */
1158:
1159: case OP_TYPEUPTO:
1160: case OP_TYPEMINUPTO:
1161: case OP_TYPEPOSUPTO:
1162: tcode += IMM2_SIZE; /* Fall through */
1163:
1164: case OP_TYPESTAR:
1165: case OP_TYPEMINSTAR:
1166: case OP_TYPEPOSSTAR:
1167: case OP_TYPEQUERY:
1168: case OP_TYPEMINQUERY:
1169: case OP_TYPEPOSQUERY:
1170: switch(tcode[1])
1171: {
1172: default:
1173: case OP_ANY:
1174: case OP_ALLANY:
1175: return SSB_FAIL;
1176:
1177: case OP_HSPACE:
1178: SET_BIT(CHAR_HT);
1179: SET_BIT(CHAR_SPACE);
1180: #ifdef SUPPORT_UTF
1181: if (utf)
1182: {
1183: #ifdef COMPILE_PCRE8
1184: SET_BIT(0xC2); /* For U+00A0 */
1185: SET_BIT(0xE1); /* For U+1680, U+180E */
1186: SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1187: SET_BIT(0xE3); /* For U+3000 */
1188: #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1189: SET_BIT(0xA0);
1190: SET_BIT(0xFF); /* For characters > 255 */
1191: #endif /* COMPILE_PCRE[8|16|32] */
1192: }
1193: else
1194: #endif /* SUPPORT_UTF */
1195: #ifndef EBCDIC
1196: SET_BIT(0xA0);
1197: #endif /* Not EBCDIC */
1198: break;
1199:
1200: case OP_ANYNL:
1201: case OP_VSPACE:
1202: SET_BIT(CHAR_LF);
1203: SET_BIT(CHAR_VT);
1204: SET_BIT(CHAR_FF);
1205: SET_BIT(CHAR_CR);
1206: #ifdef SUPPORT_UTF
1207: if (utf)
1208: {
1209: #ifdef COMPILE_PCRE8
1210: SET_BIT(0xC2); /* For U+0085 */
1211: SET_BIT(0xE2); /* For U+2028, U+2029 */
1212: #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1213: SET_BIT(CHAR_NEL);
1214: SET_BIT(0xFF); /* For characters > 255 */
1215: #endif /* COMPILE_PCRE16 */
1216: }
1217: else
1218: #endif /* SUPPORT_UTF */
1219: SET_BIT(CHAR_NEL);
1220: break;
1221:
1222: case OP_NOT_DIGIT:
1223: set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1224: break;
1225:
1226: case OP_DIGIT:
1227: set_type_bits(start_bits, cbit_digit, table_limit, cd);
1228: break;
1229:
1230: /* The cbit_space table has vertical tab as whitespace; we no longer
1231: have to play fancy tricks because Perl added VT to its whitespace at
1232: release 5.18. PCRE added it at release 8.34. */
1233:
1234: case OP_NOT_WHITESPACE:
1235: set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1236: break;
1237:
1238: case OP_WHITESPACE:
1239: set_type_bits(start_bits, cbit_space, table_limit, cd);
1240: break;
1241:
1242: case OP_NOT_WORDCHAR:
1243: set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1244: break;
1245:
1246: case OP_WORDCHAR:
1247: set_type_bits(start_bits, cbit_word, table_limit, cd);
1248: break;
1249: }
1250:
1251: tcode += 2;
1252: break;
1253:
1254: /* Character class where all the information is in a bit map: set the
1255: bits and either carry on or not, according to the repeat count. If it was
1256: a negative class, and we are operating with UTF-8 characters, any byte
1257: with a value >= 0xc4 is a potentially valid starter because it starts a
1258: character with a value > 255. */
1259:
1260: case OP_NCLASS:
1261: #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1262: if (utf)
1263: {
1264: start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1265: memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1266: }
1267: #endif
1268: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1269: SET_BIT(0xFF); /* For characters > 255 */
1270: #endif
1271: /* Fall through */
1272:
1273: case OP_CLASS:
1274: {
1275: pcre_uint8 *map;
1276: tcode++;
1277: map = (pcre_uint8 *)tcode;
1278:
1279: /* In UTF-8 mode, the bits in a bit map correspond to character
1280: values, not to byte values. However, the bit map we are constructing is
1281: for byte values. So we have to do a conversion for characters whose
1282: value is > 127. In fact, there are only two possible starting bytes for
1283: characters in the range 128 - 255. */
1284:
1285: #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1286: if (utf)
1287: {
1288: for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1289: for (c = 128; c < 256; c++)
1290: {
1291: if ((map[c/8] && (1 << (c&7))) != 0)
1292: {
1293: int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1294: start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
1295: c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1296: }
1297: }
1298: }
1299: else
1300: #endif
1301: {
1302: /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1303: for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1304: }
1305:
1306: /* Advance past the bit map, and act on what follows. For a zero
1307: minimum repeat, continue; otherwise stop processing. */
1308:
1309: tcode += 32 / sizeof(pcre_uchar);
1310: switch (*tcode)
1311: {
1312: case OP_CRSTAR:
1313: case OP_CRMINSTAR:
1314: case OP_CRQUERY:
1315: case OP_CRMINQUERY:
1316: case OP_CRPOSSTAR:
1317: case OP_CRPOSQUERY:
1318: tcode++;
1319: break;
1320:
1321: case OP_CRRANGE:
1322: case OP_CRMINRANGE:
1323: case OP_CRPOSRANGE:
1324: if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1325: else try_next = FALSE;
1326: break;
1327:
1328: default:
1329: try_next = FALSE;
1330: break;
1331: }
1332: }
1333: break; /* End of bitmap class handling */
1334:
1335: } /* End of switch */
1336: } /* End of try_next loop */
1337:
1338: code += GET(code, 1); /* Advance to next branch */
1339: }
1340: while (*code == OP_ALT);
1341: return yield;
1342: }
1343:
1344:
1345:
1346:
1347:
1348: /*************************************************
1349: * Study a compiled expression *
1350: *************************************************/
1351:
1352: /* This function is handed a compiled expression that it must study to produce
1353: information that will speed up the matching. It returns a pcre[16]_extra block
1354: which then gets handed back to pcre_exec().
1355:
1356: Arguments:
1357: re points to the compiled expression
1358: options contains option bits
1359: errorptr points to where to place error messages;
1360: set NULL unless error
1361:
1362: Returns: pointer to a pcre[16]_extra block, with study_data filled in and
1363: the appropriate flags set;
1364: NULL on error or if no optimization possible
1365: */
1366:
1367: #if defined COMPILE_PCRE8
1368: PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1369: pcre_study(const pcre *external_re, int options, const char **errorptr)
1370: #elif defined COMPILE_PCRE16
1371: PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1372: pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1373: #elif defined COMPILE_PCRE32
1374: PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
1375: pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
1376: #endif
1377: {
1378: int min;
1379: BOOL bits_set = FALSE;
1380: pcre_uint8 start_bits[32];
1381: PUBL(extra) *extra = NULL;
1382: pcre_study_data *study;
1383: const pcre_uint8 *tables;
1384: pcre_uchar *code;
1385: compile_data compile_block;
1386: const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1387:
1388:
1389: *errorptr = NULL;
1390:
1391: if (re == NULL || re->magic_number != MAGIC_NUMBER)
1392: {
1393: *errorptr = "argument is not a compiled regular expression";
1394: return NULL;
1395: }
1396:
1397: if ((re->flags & PCRE_MODE) == 0)
1398: {
1399: #if defined COMPILE_PCRE8
1400: *errorptr = "argument not compiled in 8 bit mode";
1401: #elif defined COMPILE_PCRE16
1402: *errorptr = "argument not compiled in 16 bit mode";
1403: #elif defined COMPILE_PCRE32
1404: *errorptr = "argument not compiled in 32 bit mode";
1405: #endif
1406: return NULL;
1407: }
1408:
1409: if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1410: {
1411: *errorptr = "unknown or incorrect option bit(s) set";
1412: return NULL;
1413: }
1414:
1415: code = (pcre_uchar *)re + re->name_table_offset +
1416: (re->name_count * re->name_entry_size);
1417:
1418: /* For an anchored pattern, or an unanchored pattern that has a first char, or
1419: a multiline pattern that matches only at "line starts", there is no point in
1420: seeking a list of starting bytes. */
1421:
1422: if ((re->options & PCRE_ANCHORED) == 0 &&
1423: (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1424: {
1425: int rc;
1426:
1427: /* Set the character tables in the block that is passed around */
1428:
1429: tables = re->tables;
1430:
1431: #if defined COMPILE_PCRE8
1432: if (tables == NULL)
1433: (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1434: (void *)(&tables));
1435: #elif defined COMPILE_PCRE16
1436: if (tables == NULL)
1437: (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1438: (void *)(&tables));
1439: #elif defined COMPILE_PCRE32
1440: if (tables == NULL)
1441: (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1442: (void *)(&tables));
1443: #endif
1444:
1445: compile_block.lcc = tables + lcc_offset;
1446: compile_block.fcc = tables + fcc_offset;
1447: compile_block.cbits = tables + cbits_offset;
1448: compile_block.ctypes = tables + ctypes_offset;
1449:
1450: /* See if we can find a fixed set of initial characters for the pattern. */
1451:
1452: memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1453: rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1454: &compile_block);
1455: bits_set = rc == SSB_DONE;
1456: if (rc == SSB_UNKNOWN)
1457: {
1458: *errorptr = "internal error: opcode not recognized";
1459: return NULL;
1460: }
1461: }
1462:
1463: /* Find the minimum length of subject string. */
1464:
1465: switch(min = find_minlength(re, code, code, re->options, 0))
1466: {
1467: case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1468: case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1469: default: break;
1470: }
1471:
1472: /* If a set of starting bytes has been identified, or if the minimum length is
1473: greater than zero, or if JIT optimization has been requested, or if
1474: PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1475: pcre_study_data block. The study data is put in the latter, which is pointed to
1476: by the former, which may also get additional data set later by the calling
1477: program. At the moment, the size of pcre_study_data is fixed. We nevertheless
1478: save it in a field for returning via the pcre_fullinfo() function so that if it
1479: becomes variable in the future, we don't have to change that code. */
1480:
1481: if (bits_set || min > 0 || (options & (
1482: #ifdef SUPPORT_JIT
1483: PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
1484: PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
1485: #endif
1486: PCRE_STUDY_EXTRA_NEEDED)) != 0)
1487: {
1488: extra = (PUBL(extra) *)(PUBL(malloc))
1489: (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1490: if (extra == NULL)
1491: {
1492: *errorptr = "failed to get memory";
1493: return NULL;
1494: }
1495:
1496: study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1497: extra->flags = PCRE_EXTRA_STUDY_DATA;
1498: extra->study_data = study;
1499:
1500: study->size = sizeof(pcre_study_data);
1501: study->flags = 0;
1502:
1503: /* Set the start bits always, to avoid unset memory errors if the
1504: study data is written to a file, but set the flag only if any of the bits
1505: are set, to save time looking when none are. */
1506:
1507: if (bits_set)
1508: {
1509: study->flags |= PCRE_STUDY_MAPPED;
1510: memcpy(study->start_bits, start_bits, sizeof(start_bits));
1511: }
1512: else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1513:
1514: #ifdef PCRE_DEBUG
1515: if (bits_set)
1516: {
1517: pcre_uint8 *ptr = start_bits;
1518: int i;
1519:
1520: printf("Start bits:\n");
1521: for (i = 0; i < 32; i++)
1522: printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1523: }
1524: #endif
1525:
1526: /* Always set the minlength value in the block, because the JIT compiler
1527: makes use of it. However, don't set the bit unless the length is greater than
1528: zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1529: checking the zero case. */
1530:
1531: if (min > 0)
1532: {
1533: study->flags |= PCRE_STUDY_MINLEN;
1534: study->minlength = min;
1535: }
1536: else study->minlength = 0;
1537:
1538: /* If JIT support was compiled and requested, attempt the JIT compilation.
1539: If no starting bytes were found, and the minimum length is zero, and JIT
1540: compilation fails, abandon the extra block and return NULL, unless
1541: PCRE_STUDY_EXTRA_NEEDED is set. */
1542:
1543: #ifdef SUPPORT_JIT
1544: extra->executable_jit = NULL;
1545: if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1546: PRIV(jit_compile)(re, extra, JIT_COMPILE);
1547: if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1548: PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1549: if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1550: PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1551:
1552: if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
1553: (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
1554: {
1555: #if defined COMPILE_PCRE8
1556: pcre_free_study(extra);
1557: #elif defined COMPILE_PCRE16
1558: pcre16_free_study(extra);
1559: #elif defined COMPILE_PCRE32
1560: pcre32_free_study(extra);
1561: #endif
1562: extra = NULL;
1563: }
1564: #endif
1565: }
1566:
1567: return extra;
1568: }
1569:
1570:
1571: /*************************************************
1572: * Free the study data *
1573: *************************************************/
1574:
1575: /* This function frees the memory that was obtained by pcre_study().
1576:
1577: Argument: a pointer to the pcre[16]_extra block
1578: Returns: nothing
1579: */
1580:
1581: #if defined COMPILE_PCRE8
1582: PCRE_EXP_DEFN void
1583: pcre_free_study(pcre_extra *extra)
1584: #elif defined COMPILE_PCRE16
1585: PCRE_EXP_DEFN void
1586: pcre16_free_study(pcre16_extra *extra)
1587: #elif defined COMPILE_PCRE32
1588: PCRE_EXP_DEFN void
1589: pcre32_free_study(pcre32_extra *extra)
1590: #endif
1591: {
1592: if (extra == NULL)
1593: return;
1594: #ifdef SUPPORT_JIT
1595: if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1596: extra->executable_jit != NULL)
1597: PRIV(jit_free)(extra->executable_jit);
1598: #endif
1599: PUBL(free)(extra);
1600: }
1601:
1602: /* End of pcre_study.c */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>