--- embedaddon/pcre/pcre_study.c 2012/02/21 23:05:51 1.1.1.1 +++ embedaddon/pcre/pcre_study.c 2014/06/15 19:46:03 1.1.1.5 @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2010 University of Cambridge + Copyright (c) 1997-2012 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -66,8 +66,9 @@ string of that length that matches. In UTF8 mode, the rather than bytes. Arguments: + re compiled pattern block code pointer to start of group (the bracket) - startcode pointer to start of the whole pattern + startcode pointer to start of the whole pattern's code options the compiling options int RECURSE depth @@ -78,17 +79,18 @@ Returns: the minimum length */ static int -find_minlength(const uschar *code, const uschar *startcode, int options, - int recurse_depth) +find_minlength(const REAL_PCRE *re, const pcre_uchar *code, + const pcre_uchar *startcode, int options, int recurse_depth) { int length = -1; -BOOL utf8 = (options & PCRE_UTF8) != 0; +/* PCRE_UTF16 has the same value as PCRE_UTF8. */ +BOOL utf = (options & PCRE_UTF8) != 0; BOOL had_recurse = FALSE; register int branchlength = 0; -register uschar *cc = (uschar *)code + 1 + LINK_SIZE; +register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE; if (*code == OP_CBRA || *code == OP_SCBRA || - *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += 2; + *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE; /* Scan along the opcodes for this branch. If we get to the end of the branch, check the length against that of the other branches. */ @@ -96,8 +98,8 @@ branch, check the length against that of the other bra for (;;) { int d, min; - uschar *cs, *ce; - register int op = *cc; + pcre_uchar *cs, *ce; + register pcre_uchar op = *cc; switch (op) { @@ -128,7 +130,7 @@ for (;;) case OP_SBRAPOS: case OP_ONCE: case OP_ONCE_NC: - d = find_minlength(cc, startcode, options, recurse_depth); + d = find_minlength(re, cc, startcode, options, recurse_depth); if (d < 0) return d; branchlength += d; do cc += GET(cc, 1); while (*cc == OP_ALT); @@ -174,9 +176,9 @@ for (;;) case OP_REVERSE: case OP_CREF: - case OP_NCREF: + case OP_DNCREF: case OP_RREF: - case OP_NRREF: + case OP_DNRREF: case OP_DEF: case OP_CALLOUT: case OP_SOD: @@ -189,7 +191,7 @@ for (;;) case OP_DOLLM: case OP_NOT_WORD_BOUNDARY: case OP_WORD_BOUNDARY: - cc += _pcre_OP_lengths[*cc]; + cc += PRIV(OP_lengths)[*cc]; break; /* Skip over a subpattern that has a {0} or {0,x} quantifier */ @@ -198,7 +200,7 @@ for (;;) case OP_BRAMINZERO: case OP_BRAPOSZERO: case OP_SKIPZERO: - cc += _pcre_OP_lengths[*cc]; + cc += PRIV(OP_lengths)[*cc]; do cc += GET(cc, 1); while (*cc == OP_ALT); cc += 1 + LINK_SIZE; break; @@ -223,8 +225,8 @@ for (;;) case OP_NOTPOSPLUSI: branchlength++; cc += 2; -#ifdef SUPPORT_UTF8 - if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f]; +#ifdef SUPPORT_UTF + if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif break; @@ -243,15 +245,16 @@ for (;;) case OP_NOTEXACT: case OP_NOTEXACTI: branchlength += GET2(cc,1); - cc += 4; -#ifdef SUPPORT_UTF8 - if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f]; + cc += 2 + IMM2_SIZE; +#ifdef SUPPORT_UTF + if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif break; case OP_TYPEEXACT: branchlength += GET2(cc,1); - cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4; + cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP + || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0); break; /* Handle single-char non-literal matchers */ @@ -291,8 +294,8 @@ for (;;) appear, but leave the code, just in case.) */ case OP_ANYBYTE: -#ifdef SUPPORT_UTF8 - if (utf8) return -1; +#ifdef SUPPORT_UTF + if (utf) return -1; #endif branchlength++; cc++; @@ -308,32 +311,38 @@ for (;;) case OP_TYPEPOSSTAR: case OP_TYPEPOSQUERY: if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2; - cc += _pcre_OP_lengths[op]; + cc += PRIV(OP_lengths)[op]; break; case OP_TYPEUPTO: case OP_TYPEMINUPTO: case OP_TYPEPOSUPTO: - if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2; - cc += _pcre_OP_lengths[op]; + if (cc[1 + IMM2_SIZE] == OP_PROP + || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2; + cc += PRIV(OP_lengths)[op]; break; /* Check a class for variable quantification */ -#ifdef SUPPORT_UTF8 + case OP_CLASS: + case OP_NCLASS: +#if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32 case OP_XCLASS: - cc += GET(cc, 1) - 33; - /* Fall through */ + /* The original code caused an unsigned overflow in 64 bit systems, + so now we use a conditional statement. */ + if (op == OP_XCLASS) + cc += GET(cc, 1); + else + cc += PRIV(OP_lengths)[OP_CLASS]; +#else + cc += PRIV(OP_lengths)[OP_CLASS]; #endif - case OP_CLASS: - case OP_NCLASS: - cc += 33; - switch (*cc) { case OP_CRPLUS: case OP_CRMINPLUS: + case OP_CRPOSPLUS: branchlength++; /* Fall through */ @@ -341,13 +350,16 @@ for (;;) case OP_CRMINSTAR: case OP_CRQUERY: case OP_CRMINQUERY: + case OP_CRPOSSTAR: + case OP_CRPOSQUERY: cc++; break; case OP_CRRANGE: case OP_CRMINRANGE: + case OP_CRPOSRANGE: branchlength += GET2(cc,1); - cc += 5; + cc += 1 + 2 * IMM2_SIZE; break; default: @@ -368,11 +380,42 @@ for (;;) matches an empty string (by default it causes a matching failure), so in that case we must set the minimum length to zero. */ - case OP_REF: + case OP_DNREF: /* Duplicate named pattern back reference */ + case OP_DNREFI: + if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) + { + int count = GET2(cc, 1+IMM2_SIZE); + pcre_uchar *slot = (pcre_uchar *)re + + re->name_table_offset + GET2(cc, 1) * re->name_entry_size; + d = INT_MAX; + while (count-- > 0) + { + ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0)); + if (cs == NULL) return -2; + do ce += GET(ce, 1); while (*ce == OP_ALT); + if (cc > cs && cc < ce) + { + d = 0; + had_recurse = TRUE; + break; + } + else + { + int dd = find_minlength(re, cs, startcode, options, recurse_depth); + if (dd < d) d = dd; + } + slot += re->name_entry_size; + } + } + else d = 0; + cc += 1 + 2*IMM2_SIZE; + goto REPEAT_BACK_REFERENCE; + + case OP_REF: /* Single back reference */ case OP_REFI: if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) { - ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1)); + ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1)); if (cs == NULL) return -2; do ce += GET(ce, 1); while (*ce == OP_ALT); if (cc > cs && cc < ce) @@ -382,34 +425,39 @@ for (;;) } else { - d = find_minlength(cs, startcode, options, recurse_depth); + d = find_minlength(re, cs, startcode, options, recurse_depth); } } else d = 0; - cc += 3; + cc += 1 + IMM2_SIZE; /* Handle repeated back references */ + REPEAT_BACK_REFERENCE: switch (*cc) { case OP_CRSTAR: case OP_CRMINSTAR: case OP_CRQUERY: case OP_CRMINQUERY: + case OP_CRPOSSTAR: + case OP_CRPOSQUERY: min = 0; cc++; break; case OP_CRPLUS: case OP_CRMINPLUS: + case OP_CRPOSPLUS: min = 1; cc++; break; case OP_CRRANGE: case OP_CRMINRANGE: + case OP_CRPOSRANGE: min = GET2(cc, 1); - cc += 5; + cc += 1 + 2 * IMM2_SIZE; break; default: @@ -424,13 +472,14 @@ for (;;) caught by a recursion depth count. */ case OP_RECURSE: - cs = ce = (uschar *)startcode + GET(cc, 1); + cs = ce = (pcre_uchar *)startcode + GET(cc, 1); do ce += GET(ce, 1); while (*ce == OP_ALT); if ((cc > cs && cc < ce) || recurse_depth > 10) had_recurse = TRUE; else { - branchlength += find_minlength(cs, startcode, options, recurse_depth + 1); + branchlength += find_minlength(re, cs, startcode, options, + recurse_depth + 1); } cc += 1 + LINK_SIZE; break; @@ -482,9 +531,9 @@ for (;;) case OP_NOTPOSQUERY: case OP_NOTPOSQUERYI: - cc += _pcre_OP_lengths[op]; -#ifdef SUPPORT_UTF8 - if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f]; + cc += PRIV(OP_lengths)[op]; +#ifdef SUPPORT_UTF + if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif break; @@ -494,7 +543,7 @@ for (;;) case OP_PRUNE_ARG: case OP_SKIP_ARG: case OP_THEN_ARG: - cc += _pcre_OP_lengths[op] + cc[1]; + cc += PRIV(OP_lengths)[op] + cc[1]; break; /* The remaining opcodes are just skipped over. */ @@ -506,7 +555,7 @@ for (;;) case OP_SET_SOM: case OP_SKIP: case OP_THEN: - cc += _pcre_OP_lengths[op]; + cc += PRIV(OP_lengths)[op]; break; /* This should not occur: we list all opcodes explicitly so that when @@ -535,40 +584,75 @@ Arguments: p points to the character caseless the caseless flag cd the block with char table pointers - utf8 TRUE for UTF-8 mode + utf TRUE for UTF-8 / UTF-16 / UTF-32 mode Returns: pointer after the character */ -static const uschar * -set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless, - compile_data *cd, BOOL utf8) +static const pcre_uchar * +set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless, + compile_data *cd, BOOL utf) { -unsigned int c = *p; +pcre_uint32 c = *p; +#ifdef COMPILE_PCRE8 SET_BIT(c); -#ifdef SUPPORT_UTF8 -if (utf8 && c > 127) +#ifdef SUPPORT_UTF +if (utf && c > 127) { GETCHARINC(c, p); #ifdef SUPPORT_UCP if (caseless) { - uschar buff[8]; + pcre_uchar buff[6]; c = UCD_OTHERCASE(c); - (void)_pcre_ord2utf8(c, buff); + (void)PRIV(ord2utf)(c, buff); SET_BIT(buff[0]); } -#endif +#endif /* Not SUPPORT_UCP */ return p; } -#endif +#else /* Not SUPPORT_UTF */ +(void)(utf); /* Stops warning for unused parameter */ +#endif /* SUPPORT_UTF */ /* Not UTF-8 mode, or character is less than 127. */ if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]); return p + 1; +#endif /* COMPILE_PCRE8 */ + +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 +if (c > 0xff) + { + c = 0xff; + caseless = FALSE; + } +SET_BIT(c); + +#ifdef SUPPORT_UTF +if (utf && c > 127) + { + GETCHARINC(c, p); +#ifdef SUPPORT_UCP + if (caseless) + { + c = UCD_OTHERCASE(c); + if (c > 0xff) + c = 0xff; + SET_BIT(c); + } +#endif /* SUPPORT_UCP */ + return p; + } +#else /* Not SUPPORT_UTF */ +(void)(utf); /* Stops warning for unused parameter */ +#endif /* SUPPORT_UTF */ + +if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]); +return p + 1; +#endif } @@ -594,21 +678,23 @@ Returns: nothing */ static void -set_type_bits(uschar *start_bits, int cbit_type, int table_limit, +set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit, compile_data *cd) { -register int c; +register pcre_uint32 c; for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type]; +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 if (table_limit == 32) return; for (c = 128; c < 256; c++) { if ((cd->cbits[c/8] & (1 << (c&7))) != 0) { - uschar buff[8]; - (void)_pcre_ord2utf8(c, buff); + pcre_uchar buff[6]; + (void)PRIV(ord2utf)(c, buff); SET_BIT(buff[0]); } } +#endif } @@ -634,12 +720,14 @@ Returns: nothing */ static void -set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit, +set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit, compile_data *cd) { -register int c; +register pcre_uint32 c; for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type]; +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff; +#endif } @@ -659,7 +747,7 @@ function fails unless the result is SSB_DONE. Arguments: code points to an expression start_bits points to a 32-byte table, initialized to 0 - utf8 TRUE if in UTF-8 mode + utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode cd the block with char table pointers Returns: SSB_FAIL => Failed to find any starting bytes @@ -669,12 +757,16 @@ Returns: SSB_FAIL => Failed to find any star */ static int -set_start_bits(const uschar *code, uschar *start_bits, BOOL utf8, +set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf, compile_data *cd) { -register int c; +register pcre_uint32 c; int yield = SSB_DONE; -int table_limit = utf8? 16:32; +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 +int table_limit = utf? 16:32; +#else +int table_limit = 32; +#endif #if 0 /* ========================================================================= */ @@ -696,10 +788,10 @@ volatile int dummy; do { BOOL try_next = TRUE; - const uschar *tcode = code + 1 + LINK_SIZE; + const pcre_uchar *tcode = code + 1 + LINK_SIZE; if (*code == OP_CBRA || *code == OP_SCBRA || - *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += 2; + *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE; while (try_next) /* Loop for items in this branch */ { @@ -728,6 +820,10 @@ do case OP_COND: case OP_CREF: case OP_DEF: + case OP_DNCREF: + case OP_DNREF: + case OP_DNREFI: + case OP_DNRREF: case OP_DOLL: case OP_DOLLM: case OP_END: @@ -736,7 +832,6 @@ do case OP_EXTUNI: case OP_FAIL: case OP_MARK: - case OP_NCREF: case OP_NOT: case OP_NOTEXACT: case OP_NOTEXACTI: @@ -768,7 +863,6 @@ do case OP_NOTUPTOI: case OP_NOT_HSPACE: case OP_NOT_VSPACE: - case OP_NRREF: case OP_PROP: case OP_PRUNE: case OP_PRUNE_ARG: @@ -785,7 +879,9 @@ do case OP_SOM: case OP_THEN: case OP_THEN_ARG: +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 case OP_XCLASS: +#endif return SSB_FAIL; /* We can ignore word boundary tests. */ @@ -811,7 +907,7 @@ do case OP_ONCE: case OP_ONCE_NC: case OP_ASSERT: - rc = set_start_bits(tcode, start_bits, utf8, cd); + rc = set_start_bits(tcode, start_bits, utf, cd); if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; if (rc == SSB_DONE) try_next = FALSE; else { @@ -858,7 +954,7 @@ do case OP_BRAZERO: case OP_BRAMINZERO: case OP_BRAPOSZERO: - rc = set_start_bits(++tcode, start_bits, utf8, cd); + rc = set_start_bits(++tcode, start_bits, utf, cd); if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; /* ========================================================================= See the comment at the head of this function concerning the next line, @@ -885,7 +981,7 @@ do case OP_QUERY: case OP_MINQUERY: case OP_POSQUERY: - tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8); + tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf); break; case OP_STARI: @@ -894,7 +990,7 @@ do case OP_QUERYI: case OP_MINQUERYI: case OP_POSQUERYI: - tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8); + tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf); break; /* Single-char upto sets the bit and tries the next */ @@ -902,36 +998,36 @@ do case OP_UPTO: case OP_MINUPTO: case OP_POSUPTO: - tcode = set_table_bit(start_bits, tcode + 3, FALSE, cd, utf8); + tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf); break; case OP_UPTOI: case OP_MINUPTOI: case OP_POSUPTOI: - tcode = set_table_bit(start_bits, tcode + 3, TRUE, cd, utf8); + tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf); break; /* At least one single char sets the bit and stops */ case OP_EXACT: - tcode += 2; + tcode += IMM2_SIZE; /* Fall through */ case OP_CHAR: case OP_PLUS: case OP_MINPLUS: case OP_POSPLUS: - (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8); + (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf); try_next = FALSE; break; case OP_EXACTI: - tcode += 2; + tcode += IMM2_SIZE; /* Fall through */ case OP_CHARI: case OP_PLUSI: case OP_MINPLUSI: case OP_POSPLUSI: - (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8); + (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf); try_next = FALSE; break; @@ -942,31 +1038,59 @@ do identical. */ case OP_HSPACE: - SET_BIT(0x09); - SET_BIT(0x20); - if (utf8) + SET_BIT(CHAR_HT); + SET_BIT(CHAR_SPACE); +#ifdef SUPPORT_UTF + if (utf) { +#ifdef COMPILE_PCRE8 SET_BIT(0xC2); /* For U+00A0 */ SET_BIT(0xE1); /* For U+1680, U+180E */ SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ SET_BIT(0xE3); /* For U+3000 */ +#elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + SET_BIT(0xA0); + SET_BIT(0xFF); /* For characters > 255 */ +#endif /* COMPILE_PCRE[8|16|32] */ } - else SET_BIT(0xA0); + else +#endif /* SUPPORT_UTF */ + { +#ifndef EBCDIC + SET_BIT(0xA0); +#endif /* Not EBCDIC */ +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + SET_BIT(0xFF); /* For characters > 255 */ +#endif /* COMPILE_PCRE[16|32] */ + } try_next = FALSE; break; case OP_ANYNL: case OP_VSPACE: - SET_BIT(0x0A); - SET_BIT(0x0B); - SET_BIT(0x0C); - SET_BIT(0x0D); - if (utf8) + SET_BIT(CHAR_LF); + SET_BIT(CHAR_VT); + SET_BIT(CHAR_FF); + SET_BIT(CHAR_CR); +#ifdef SUPPORT_UTF + if (utf) { +#ifdef COMPILE_PCRE8 SET_BIT(0xC2); /* For U+0085 */ SET_BIT(0xE2); /* For U+2028, U+2029 */ +#elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + SET_BIT(CHAR_NEL); + SET_BIT(0xFF); /* For characters > 255 */ +#endif /* COMPILE_PCRE[8|16|32] */ } - else SET_BIT(0x85); + else +#endif /* SUPPORT_UTF */ + { + SET_BIT(CHAR_NEL); +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + SET_BIT(0xFF); /* For characters > 255 */ +#endif + } try_next = FALSE; break; @@ -986,7 +1110,8 @@ do break; /* The cbit_space table has vertical tab as whitespace; we have to - ensure it is set as not whitespace. */ + ensure it is set as not whitespace. Luckily, the code value is the same + (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate bit. */ case OP_NOT_WHITESPACE: set_nottype_bits(start_bits, cbit_space, table_limit, cd); @@ -994,8 +1119,9 @@ do try_next = FALSE; break; - /* The cbit_space table has vertical tab as whitespace; we have to - not set it from the table. */ + /* The cbit_space table has vertical tab as whitespace; we have to not + set it from the table. Luckily, the code value is the same (0x0b) in + ASCII and EBCDIC, so we can just adjust the appropriate bit. */ case OP_WHITESPACE: c = start_bits[1]; /* Save in case it was already set */ @@ -1024,7 +1150,7 @@ do break; case OP_TYPEEXACT: - tcode += 3; + tcode += 1 + IMM2_SIZE; break; /* Zero or more repeats of character types set the bits and then @@ -1033,7 +1159,7 @@ do case OP_TYPEUPTO: case OP_TYPEMINUPTO: case OP_TYPEPOSUPTO: - tcode += 2; /* Fall through */ + tcode += IMM2_SIZE; /* Fall through */ case OP_TYPESTAR: case OP_TYPEMINSTAR: @@ -1049,30 +1175,48 @@ do return SSB_FAIL; case OP_HSPACE: - SET_BIT(0x09); - SET_BIT(0x20); - if (utf8) + SET_BIT(CHAR_HT); + SET_BIT(CHAR_SPACE); +#ifdef SUPPORT_UTF + if (utf) { +#ifdef COMPILE_PCRE8 SET_BIT(0xC2); /* For U+00A0 */ SET_BIT(0xE1); /* For U+1680, U+180E */ SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ SET_BIT(0xE3); /* For U+3000 */ +#elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + SET_BIT(0xA0); + SET_BIT(0xFF); /* For characters > 255 */ +#endif /* COMPILE_PCRE[8|16|32] */ } - else SET_BIT(0xA0); + else +#endif /* SUPPORT_UTF */ +#ifndef EBCDIC + SET_BIT(0xA0); +#endif /* Not EBCDIC */ break; case OP_ANYNL: case OP_VSPACE: - SET_BIT(0x0A); - SET_BIT(0x0B); - SET_BIT(0x0C); - SET_BIT(0x0D); - if (utf8) + SET_BIT(CHAR_LF); + SET_BIT(CHAR_VT); + SET_BIT(CHAR_FF); + SET_BIT(CHAR_CR); +#ifdef SUPPORT_UTF + if (utf) { +#ifdef COMPILE_PCRE8 SET_BIT(0xC2); /* For U+0085 */ SET_BIT(0xE2); /* For U+2028, U+2029 */ +#elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + SET_BIT(CHAR_NEL); + SET_BIT(0xFF); /* For characters > 255 */ +#endif /* COMPILE_PCRE16 */ } - else SET_BIT(0x85); + else +#endif /* SUPPORT_UTF */ + SET_BIT(CHAR_NEL); break; case OP_NOT_DIGIT: @@ -1083,21 +1227,16 @@ do set_type_bits(start_bits, cbit_digit, table_limit, cd); break; - /* The cbit_space table has vertical tab as whitespace; we have to - ensure it gets set as not whitespace. */ + /* The cbit_space table has vertical tab as whitespace; we no longer + have to play fancy tricks because Perl added VT to its whitespace at + release 5.18. PCRE added it at release 8.34. */ case OP_NOT_WHITESPACE: set_nottype_bits(start_bits, cbit_space, table_limit, cd); - start_bits[1] |= 0x08; break; - /* The cbit_space table has vertical tab as whitespace; we have to - avoid setting it. */ - case OP_WHITESPACE: - c = start_bits[1]; /* Save in case it was already set */ set_type_bits(start_bits, cbit_space, table_limit, cd); - start_bits[1] = (start_bits[1] & ~0x08) | c; break; case OP_NOT_WORDCHAR: @@ -1119,18 +1258,23 @@ do character with a value > 255. */ case OP_NCLASS: -#ifdef SUPPORT_UTF8 - if (utf8) +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 + if (utf) { start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */ memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */ } #endif +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + SET_BIT(0xFF); /* For characters > 255 */ +#endif /* Fall through */ case OP_CLASS: { + pcre_uint8 *map; tcode++; + map = (pcre_uint8 *)tcode; /* In UTF-8 mode, the bits in a bit map correspond to character values, not to byte values. However, the bit map we are constructing is @@ -1138,13 +1282,13 @@ do value is > 127. In fact, there are only two possible starting bytes for characters in the range 128 - 255. */ -#ifdef SUPPORT_UTF8 - if (utf8) +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 + if (utf) { - for (c = 0; c < 16; c++) start_bits[c] |= tcode[c]; + for (c = 0; c < 16; c++) start_bits[c] |= map[c]; for (c = 128; c < 256; c++) { - if ((tcode[c/8] && (1 << (c&7))) != 0) + if ((map[c/8] && (1 << (c&7))) != 0) { int d = (c >> 6) | 0xc0; /* Set bit for this starter */ start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */ @@ -1152,31 +1296,32 @@ do } } } - - /* In non-UTF-8 mode, the two bit maps are completely compatible. */ - else #endif { - for (c = 0; c < 32; c++) start_bits[c] |= tcode[c]; + /* In non-UTF-8 mode, the two bit maps are completely compatible. */ + for (c = 0; c < 32; c++) start_bits[c] |= map[c]; } /* Advance past the bit map, and act on what follows. For a zero minimum repeat, continue; otherwise stop processing. */ - tcode += 32; + tcode += 32 / sizeof(pcre_uchar); switch (*tcode) { case OP_CRSTAR: case OP_CRMINSTAR: case OP_CRQUERY: case OP_CRMINQUERY: + case OP_CRPOSSTAR: + case OP_CRPOSQUERY: tcode++; break; case OP_CRRANGE: case OP_CRMINRANGE: - if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5; + case OP_CRPOSRANGE: + if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE; else try_next = FALSE; break; @@ -1205,7 +1350,7 @@ return yield; *************************************************/ /* This function is handed a compiled expression that it must study to produce -information that will speed up the matching. It returns a pcre_extra block +information that will speed up the matching. It returns a pcre[16]_extra block which then gets handed back to pcre_exec(). Arguments: @@ -1214,24 +1359,33 @@ Arguments: errorptr points to where to place error messages; set NULL unless error -Returns: pointer to a pcre_extra block, with study_data filled in and the - appropriate flags set; +Returns: pointer to a pcre[16]_extra block, with study_data filled in and + the appropriate flags set; NULL on error or if no optimization possible */ +#if defined COMPILE_PCRE8 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION pcre_study(const pcre *external_re, int options, const char **errorptr) +#elif defined COMPILE_PCRE16 +PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION +pcre16_study(const pcre16 *external_re, int options, const char **errorptr) +#elif defined COMPILE_PCRE32 +PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION +pcre32_study(const pcre32 *external_re, int options, const char **errorptr) +#endif { int min; BOOL bits_set = FALSE; -uschar start_bits[32]; -pcre_extra *extra = NULL; +pcre_uint8 start_bits[32]; +PUBL(extra) *extra = NULL; pcre_study_data *study; -const uschar *tables; -uschar *code; +const pcre_uint8 *tables; +pcre_uchar *code; compile_data compile_block; -const real_pcre *re = (const real_pcre *)external_re; +const REAL_PCRE *re = (const REAL_PCRE *)external_re; + *errorptr = NULL; if (re == NULL || re->magic_number != MAGIC_NUMBER) @@ -1240,13 +1394,25 @@ if (re == NULL || re->magic_number != MAGIC_NUMBER) return NULL; } +if ((re->flags & PCRE_MODE) == 0) + { +#if defined COMPILE_PCRE8 + *errorptr = "argument not compiled in 8 bit mode"; +#elif defined COMPILE_PCRE16 + *errorptr = "argument not compiled in 16 bit mode"; +#elif defined COMPILE_PCRE32 + *errorptr = "argument not compiled in 32 bit mode"; +#endif + return NULL; + } + if ((options & ~PUBLIC_STUDY_OPTIONS) != 0) { *errorptr = "unknown or incorrect option bit(s) set"; return NULL; } -code = (uschar *)re + re->name_table_offset + +code = (pcre_uchar *)re + re->name_table_offset + (re->name_count * re->name_entry_size); /* For an anchored pattern, or an unanchored pattern that has a first char, or @@ -1261,9 +1427,20 @@ if ((re->options & PCRE_ANCHORED) == 0 && /* Set the character tables in the block that is passed around */ tables = re->tables; + +#if defined COMPILE_PCRE8 if (tables == NULL) (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, (void *)(&tables)); +#elif defined COMPILE_PCRE16 + if (tables == NULL) + (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, + (void *)(&tables)); +#elif defined COMPILE_PCRE32 + if (tables == NULL) + (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, + (void *)(&tables)); +#endif compile_block.lcc = tables + lcc_offset; compile_block.fcc = tables + fcc_offset; @@ -1272,7 +1449,7 @@ if ((re->options & PCRE_ANCHORED) == 0 && /* See if we can find a fixed set of initial characters for the pattern. */ - memset(start_bits, 0, 32 * sizeof(uschar)); + memset(start_bits, 0, 32 * sizeof(pcre_uint8)); rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0, &compile_block); bits_set = rc == SSB_DONE; @@ -1285,7 +1462,7 @@ if ((re->options & PCRE_ANCHORED) == 0 && /* Find the minimum length of subject string. */ -switch(min = find_minlength(code, code, re->options, 0)) +switch(min = find_minlength(re, code, code, re->options, 0)) { case -2: *errorptr = "internal error: missing capturing bracket"; return NULL; case -3: *errorptr = "internal error: opcode not recognized"; return NULL; @@ -1293,29 +1470,30 @@ switch(min = find_minlength(code, code, re->options, 0 } /* If a set of starting bytes has been identified, or if the minimum length is -greater than zero, or if JIT optimization has been requested, get a pcre_extra -block and a pcre_study_data block. The study data is put in the latter, which -is pointed to by the former, which may also get additional data set later by -the calling program. At the moment, the size of pcre_study_data is fixed. We -nevertheless save it in a field for returning via the pcre_fullinfo() function -so that if it becomes variable in the future, we don't have to change that -code. */ +greater than zero, or if JIT optimization has been requested, or if +PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a +pcre_study_data block. The study data is put in the latter, which is pointed to +by the former, which may also get additional data set later by the calling +program. At the moment, the size of pcre_study_data is fixed. We nevertheless +save it in a field for returning via the pcre_fullinfo() function so that if it +becomes variable in the future, we don't have to change that code. */ -if (bits_set || min > 0 +if (bits_set || min > 0 || (options & ( #ifdef SUPPORT_JIT - || (options & PCRE_STUDY_JIT_COMPILE) != 0 + PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE | + PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE | #endif - ) + PCRE_STUDY_EXTRA_NEEDED)) != 0) { - extra = (pcre_extra *)(pcre_malloc) - (sizeof(pcre_extra) + sizeof(pcre_study_data)); + extra = (PUBL(extra) *)(PUBL(malloc)) + (sizeof(PUBL(extra)) + sizeof(pcre_study_data)); if (extra == NULL) { *errorptr = "failed to get memory"; return NULL; } - study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra)); + study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra))); extra->flags = PCRE_EXTRA_STUDY_DATA; extra->study_data = study; @@ -1331,8 +1509,20 @@ if (bits_set || min > 0 study->flags |= PCRE_STUDY_MAPPED; memcpy(study->start_bits, start_bits, sizeof(start_bits)); } - else memset(study->start_bits, 0, 32 * sizeof(uschar)); + else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8)); +#ifdef PCRE_DEBUG + if (bits_set) + { + pcre_uint8 *ptr = start_bits; + int i; + + printf("Start bits:\n"); + for (i = 0; i < 32; i++) + printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n"); + } +#endif + /* Always set the minlength value in the block, because the JIT compiler makes use of it. However, don't set the bit unless the length is greater than zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time @@ -1347,14 +1537,28 @@ if (bits_set || min > 0 /* If JIT support was compiled and requested, attempt the JIT compilation. If no starting bytes were found, and the minimum length is zero, and JIT - compilation fails, abandon the extra block and return NULL. */ + compilation fails, abandon the extra block and return NULL, unless + PCRE_STUDY_EXTRA_NEEDED is set. */ #ifdef SUPPORT_JIT extra->executable_jit = NULL; - if ((options & PCRE_STUDY_JIT_COMPILE) != 0) _pcre_jit_compile(re, extra); - if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0) + if ((options & PCRE_STUDY_JIT_COMPILE) != 0) + PRIV(jit_compile)(re, extra, JIT_COMPILE); + if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0) + PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE); + if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0) + PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE); + + if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 && + (options & PCRE_STUDY_EXTRA_NEEDED) == 0) { +#if defined COMPILE_PCRE8 pcre_free_study(extra); +#elif defined COMPILE_PCRE16 + pcre16_free_study(extra); +#elif defined COMPILE_PCRE32 + pcre32_free_study(extra); +#endif extra = NULL; } #endif @@ -1370,19 +1574,29 @@ return extra; /* This function frees the memory that was obtained by pcre_study(). -Argument: a pointer to the pcre_extra block +Argument: a pointer to the pcre[16]_extra block Returns: nothing */ +#if defined COMPILE_PCRE8 PCRE_EXP_DEFN void pcre_free_study(pcre_extra *extra) +#elif defined COMPILE_PCRE16 +PCRE_EXP_DEFN void +pcre16_free_study(pcre16_extra *extra) +#elif defined COMPILE_PCRE32 +PCRE_EXP_DEFN void +pcre32_free_study(pcre32_extra *extra) +#endif { +if (extra == NULL) + return; #ifdef SUPPORT_JIT if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 && extra->executable_jit != NULL) - _pcre_jit_free(extra->executable_jit); + PRIV(jit_free)(extra->executable_jit); #endif -pcre_free(extra); +PUBL(free)(extra); } /* End of pcre_study.c */