--- embedaddon/pcre/pcre_compile.c 2012/02/21 23:50:25 1.1.1.2 +++ embedaddon/pcre/pcre_compile.c 2012/10/09 09:19:17 1.1.1.3 @@ -489,6 +489,9 @@ static const char error_texts[] = "too many forward references\0" "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0" "invalid UTF-16 string\0" + /* 75 */ + "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0" + "character value in \\u.... sequence is too large\0" ; /* Table to identify digits and hex digits. This is used when compiling @@ -829,6 +832,18 @@ else c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); #endif } + +#ifdef COMPILE_PCRE8 + if (c > (utf ? 0x10ffff : 0xff)) +#else +#ifdef COMPILE_PCRE16 + if (c > (utf ? 0x10ffff : 0xffff)) +#endif +#endif + { + *errorcodeptr = ERR76; + } + else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; } } else @@ -2225,32 +2240,60 @@ for (;;) { case OP_CHAR: case OP_CHARI: + case OP_NOT: + case OP_NOTI: case OP_EXACT: case OP_EXACTI: + case OP_NOTEXACT: + case OP_NOTEXACTI: case OP_UPTO: case OP_UPTOI: + case OP_NOTUPTO: + case OP_NOTUPTOI: case OP_MINUPTO: case OP_MINUPTOI: + case OP_NOTMINUPTO: + case OP_NOTMINUPTOI: case OP_POSUPTO: case OP_POSUPTOI: + case OP_NOTPOSUPTO: + case OP_NOTPOSUPTOI: case OP_STAR: case OP_STARI: + case OP_NOTSTAR: + case OP_NOTSTARI: case OP_MINSTAR: case OP_MINSTARI: + case OP_NOTMINSTAR: + case OP_NOTMINSTARI: case OP_POSSTAR: case OP_POSSTARI: + case OP_NOTPOSSTAR: + case OP_NOTPOSSTARI: case OP_PLUS: case OP_PLUSI: + case OP_NOTPLUS: + case OP_NOTPLUSI: case OP_MINPLUS: case OP_MINPLUSI: + case OP_NOTMINPLUS: + case OP_NOTMINPLUSI: case OP_POSPLUS: case OP_POSPLUSI: + case OP_NOTPOSPLUS: + case OP_NOTPOSPLUSI: case OP_QUERY: case OP_QUERYI: + case OP_NOTQUERY: + case OP_NOTQUERYI: case OP_MINQUERY: case OP_MINQUERYI: + case OP_NOTMINQUERY: + case OP_NOTMINQUERYI: case OP_POSQUERY: case OP_POSQUERYI: + case OP_NOTPOSQUERY: + case OP_NOTPOSQUERYI: if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); break; } @@ -3069,22 +3112,28 @@ if (next >= 0) switch(op_code) #endif /* SUPPORT_UTF */ return (c != TABLE_GET((unsigned int)next, cd->fcc, next)); /* Non-UTF-8 mode */ - /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These - opcodes are not used for multi-byte characters, because they are coded using - an XCLASS instead. */ - case OP_NOT: - return (c = *previous) == next; +#ifdef SUPPORT_UTF + GETCHARTEST(c, previous); +#else + c = *previous; +#endif + return c == next; case OP_NOTI: - if ((c = *previous) == next) return TRUE; #ifdef SUPPORT_UTF + GETCHARTEST(c, previous); +#else + c = *previous; +#endif + if (c == next) return TRUE; +#ifdef SUPPORT_UTF if (utf) { unsigned int othercase; if (next < 128) othercase = cd->fcc[next]; else #ifdef SUPPORT_UCP - othercase = UCD_OTHERCASE(next); + othercase = UCD_OTHERCASE((unsigned int)next); #else othercase = NOTACHAR; #endif @@ -3092,28 +3141,28 @@ if (next >= 0) switch(op_code) } else #endif /* SUPPORT_UTF */ - return (c == (int)(TABLE_GET((unsigned int)next, cd->fcc, next))); /* Non-UTF-8 mode */ + return (c == TABLE_GET((unsigned int)next, cd->fcc, next)); /* Non-UTF-8 mode */ /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ case OP_DIGIT: - return next > 127 || (cd->ctypes[next] & ctype_digit) == 0; + return next > 255 || (cd->ctypes[next] & ctype_digit) == 0; case OP_NOT_DIGIT: - return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0; + return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0; case OP_WHITESPACE: - return next > 127 || (cd->ctypes[next] & ctype_space) == 0; + return next > 255 || (cd->ctypes[next] & ctype_space) == 0; case OP_NOT_WHITESPACE: - return next <= 127 && (cd->ctypes[next] & ctype_space) != 0; + return next <= 255 && (cd->ctypes[next] & ctype_space) != 0; case OP_WORDCHAR: - return next > 127 || (cd->ctypes[next] & ctype_word) == 0; + return next > 255 || (cd->ctypes[next] & ctype_word) == 0; case OP_NOT_WORDCHAR: - return next <= 127 && (cd->ctypes[next] & ctype_word) != 0; + return next <= 255 && (cd->ctypes[next] & ctype_word) != 0; case OP_HSPACE: case OP_NOT_HSPACE: @@ -3191,22 +3240,22 @@ switch(op_code) switch(-next) { case ESC_d: - return c > 127 || (cd->ctypes[c] & ctype_digit) == 0; + return c > 255 || (cd->ctypes[c] & ctype_digit) == 0; case ESC_D: - return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0; + return c <= 255 && (cd->ctypes[c] & ctype_digit) != 0; case ESC_s: - return c > 127 || (cd->ctypes[c] & ctype_space) == 0; + return c > 255 || (cd->ctypes[c] & ctype_space) == 0; case ESC_S: - return c <= 127 && (cd->ctypes[c] & ctype_space) != 0; + return c <= 255 && (cd->ctypes[c] & ctype_space) != 0; case ESC_w: - return c > 127 || (cd->ctypes[c] & ctype_word) == 0; + return c > 255 || (cd->ctypes[c] & ctype_word) == 0; case ESC_W: - return c <= 127 && (cd->ctypes[c] & ctype_word) != 0; + return c <= 255 && (cd->ctypes[c] & ctype_word) != 0; case ESC_h: case ESC_H: @@ -3315,10 +3364,10 @@ switch(op_code) return next == -ESC_d; case OP_WHITESPACE: - return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R; + return next == -ESC_S || next == -ESC_d || next == -ESC_w; case OP_NOT_WHITESPACE: - return next == -ESC_s || next == -ESC_h || next == -ESC_v; + return next == -ESC_s || next == -ESC_h || next == -ESC_v || next == -ESC_R; case OP_HSPACE: return next == -ESC_S || next == -ESC_H || next == -ESC_d || @@ -4482,42 +4531,35 @@ for (;; ptr++) LONE_SINGLE_CHARACTER: /* Only the value of 1 matters for class_single_char. */ + if (class_single_char < 2) class_single_char++; /* If class_charcount is 1, we saw precisely one character. As long as - there were no negated characters >= 128 and there was no use of \p or \P, - in other words, no use of any XCLASS features, we can optimize. + there was no use of \p or \P, in other words, no use of any XCLASS + features, we can optimize. - In UTF-8 mode, we can optimize the negative case only if there were no - characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR - operate on single-bytes characters only. This is an historical hangover. - Maybe one day we can tidy these opcodes to handle multi-byte characters. - The optimization throws away the bit map. We turn the item into a 1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative. - Note that OP_NOT[I] does not support multibyte characters. In the positive - case, it can cause firstchar to be set. Otherwise, there can be no first - char if this item is first, whatever repeat count may follow. In the case - of reqchar, save the previous value for reinstating. */ + In the positive case, it can cause firstchar to be set. Otherwise, there + can be no first char if this item is first, whatever repeat count may + follow. In the case of reqchar, save the previous value for reinstating. */ -#ifdef SUPPORT_UTF - if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET - && (!utf || !negate_class || c < (MAX_VALUE_FOR_SINGLE_CHAR + 1))) -#else if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) -#endif { ptr++; zeroreqchar = reqchar; - /* The OP_NOT[I] opcodes work on single characters only. */ - if (negate_class) { if (firstchar == REQ_UNSET) firstchar = REQ_NONE; zerofirstchar = firstchar; *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT; - *code++ = c; +#ifdef SUPPORT_UTF + if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR) + code += PRIV(ord2utf)(c, code); + else +#endif + *code++ = c; goto NOT_CHAR; } @@ -4775,15 +4817,23 @@ for (;; ptr++) /* Now handle repetition for the different types of item. */ - /* If previous was a character match, abolish the item and generate a - repeat item instead. If a char item has a minumum of more than one, ensure - that it is set in reqchar - it might not be if a sequence such as x{3} is - the first thing in a branch because the x will have gone into firstchar - instead. */ + /* If previous was a character or negated character match, abolish the item + and generate a repeat item instead. If a char item has a minimum of more + than one, ensure that it is set in reqchar - it might not be if a sequence + such as x{3} is the first thing in a branch because the x will have gone + into firstchar instead. */ - if (*previous == OP_CHAR || *previous == OP_CHARI) + if (*previous == OP_CHAR || *previous == OP_CHARI + || *previous == OP_NOT || *previous == OP_NOTI) { - op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR; + switch (*previous) + { + default: /* Make compiler happy. */ + case OP_CHAR: op_type = OP_STAR - OP_STAR; break; + case OP_CHARI: op_type = OP_STARI - OP_STAR; break; + case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break; + case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break; + } /* Deal with UTF characters that take up more than one character. It's easier to write this out separately than try to macrify it. Use c to @@ -4806,7 +4856,8 @@ for (;; ptr++) with UTF disabled, or for a single character UTF character. */ { c = code[-1]; - if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt; + if (*previous <= OP_CHARI && repeat_min > 1) + reqchar = c | req_caseopt | cd->req_varyopt; } /* If the repetition is unlimited, it pays to see if the next thing on @@ -4825,26 +4876,6 @@ for (;; ptr++) goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ } - /* If previous was a single negated character ([^a] or similar), we use - one of the special opcodes, replacing it. The code is shared with single- - character repeats by setting opt_type to add a suitable offset into - repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI - are currently used only for single-byte chars. */ - - else if (*previous == OP_NOT || *previous == OP_NOTI) - { - op_type = ((*previous == OP_NOT)? OP_NOTSTAR : OP_NOTSTARI) - OP_STAR; - c = previous[1]; - if (!possessive_quantifier && - repeat_max < 0 && - check_auto_possessive(previous, utf, ptr + 1, options, cd)) - { - repeat_type = 0; /* Force greedy */ - possessive_quantifier = TRUE; - } - goto OUTPUT_SINGLE_REPEAT; - } - /* If previous was a character type match (\d or similar), abolish it and create a suitable repeat item. The code is shared with single-character repeats by setting op_type to add a suitable offset into repeat_type. Note @@ -5585,6 +5616,11 @@ for (;; ptr++) arg = ++ptr; while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; arglen = (int)(ptr - arg); + if (arglen > (int)MAX_MARK) + { + *errorcodeptr = ERR75; + goto FAILED; + } } if (*ptr != CHAR_RIGHT_PARENTHESIS) @@ -6836,10 +6872,13 @@ for (;; ptr++) /* For the rest (including \X when Unicode properties are supported), we can obtain the OP value by negating the escape value in the default situation when PCRE_UCP is not set. When it *is* set, we substitute - Unicode property tests. */ + Unicode property tests. Note that \b and \B do a one-character + lookbehind. */ else { + if ((-c == ESC_b || -c == ESC_B) && cd->max_lookbehind == 0) + cd->max_lookbehind = 1; #ifdef SUPPORT_UCP if (-c >= ESC_DU && -c <= ESC_wu) { @@ -7147,7 +7186,12 @@ for (;;) *ptrptr = ptr; return FALSE; } - else { PUT(reverse_count, 0, fixed_length); } + else + { + if (fixed_length > cd->max_lookbehind) + cd->max_lookbehind = fixed_length; + PUT(reverse_count, 0, fixed_length); + } } } @@ -7817,6 +7861,7 @@ cd->start_pattern = (const pcre_uchar *)pattern; cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern)); cd->req_varyopt = 0; cd->assert_depth = 0; +cd->max_lookbehind = 0; cd->external_options = options; cd->external_flags = 0; cd->open_caps = NULL; @@ -7867,7 +7912,6 @@ re->magic_number = MAGIC_NUMBER; re->size = (int)size; re->options = cd->external_options; re->flags = cd->external_flags; -re->dummy1 = 0; re->first_char = 0; re->req_char = 0; re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar); @@ -7887,6 +7931,7 @@ field; this time it's used for remembering forward ref cd->final_bracount = cd->bracount; /* Save for checking forward references */ cd->assert_depth = 0; cd->bracount = 0; +cd->max_lookbehind = 0; cd->names_found = 0; cd->name_table = (pcre_uchar *)re + re->name_table_offset; codestart = cd->name_table + re->name_entry_size * re->name_count; @@ -7908,6 +7953,7 @@ code = (pcre_uchar *)codestart; &firstchar, &reqchar, NULL, cd, NULL); re->top_bracket = cd->bracount; re->top_backref = cd->top_backref; +re->max_lookbehind = cd->max_lookbehind; re->flags = cd->external_flags | PCRE_MODE; if (cd->had_accept) reqchar = REQ_NONE; /* Must disable after (*ACCEPT) */ @@ -7995,6 +8041,7 @@ if (cd->check_lookbehind) (fixed_length == -4)? ERR70 : ERR25; break; } + if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length; PUT(cc, 1, fixed_length); } cc += 1 + LINK_SIZE;