version 1.1.1.2, 2012/02/21 23:50:25
|
version 1.1.1.3, 2012/10/09 09:19:17
|
Line 489 static const char error_texts[] =
|
Line 489 static const char error_texts[] =
|
"too many forward references\0" |
"too many forward references\0" |
"disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0" |
"disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0" |
"invalid UTF-16 string\0" |
"invalid UTF-16 string\0" |
|
/* 75 */ |
|
"name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0" |
|
"character value in \\u.... sequence is too large\0" |
; |
; |
|
|
/* Table to identify digits and hex digits. This is used when compiling |
/* Table to identify digits and hex digits. This is used when compiling |
Line 829 else
|
Line 832 else
|
c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); |
c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); |
#endif |
#endif |
} |
} |
|
|
|
#ifdef COMPILE_PCRE8 |
|
if (c > (utf ? 0x10ffff : 0xff)) |
|
#else |
|
#ifdef COMPILE_PCRE16 |
|
if (c > (utf ? 0x10ffff : 0xffff)) |
|
#endif |
|
#endif |
|
{ |
|
*errorcodeptr = ERR76; |
|
} |
|
else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; |
} |
} |
} |
} |
else |
else |
Line 2225 for (;;)
|
Line 2240 for (;;)
|
{ |
{ |
case OP_CHAR: |
case OP_CHAR: |
case OP_CHARI: |
case OP_CHARI: |
|
case OP_NOT: |
|
case OP_NOTI: |
case OP_EXACT: |
case OP_EXACT: |
case OP_EXACTI: |
case OP_EXACTI: |
|
case OP_NOTEXACT: |
|
case OP_NOTEXACTI: |
case OP_UPTO: |
case OP_UPTO: |
case OP_UPTOI: |
case OP_UPTOI: |
|
case OP_NOTUPTO: |
|
case OP_NOTUPTOI: |
case OP_MINUPTO: |
case OP_MINUPTO: |
case OP_MINUPTOI: |
case OP_MINUPTOI: |
|
case OP_NOTMINUPTO: |
|
case OP_NOTMINUPTOI: |
case OP_POSUPTO: |
case OP_POSUPTO: |
case OP_POSUPTOI: |
case OP_POSUPTOI: |
|
case OP_NOTPOSUPTO: |
|
case OP_NOTPOSUPTOI: |
case OP_STAR: |
case OP_STAR: |
case OP_STARI: |
case OP_STARI: |
|
case OP_NOTSTAR: |
|
case OP_NOTSTARI: |
case OP_MINSTAR: |
case OP_MINSTAR: |
case OP_MINSTARI: |
case OP_MINSTARI: |
|
case OP_NOTMINSTAR: |
|
case OP_NOTMINSTARI: |
case OP_POSSTAR: |
case OP_POSSTAR: |
case OP_POSSTARI: |
case OP_POSSTARI: |
|
case OP_NOTPOSSTAR: |
|
case OP_NOTPOSSTARI: |
case OP_PLUS: |
case OP_PLUS: |
case OP_PLUSI: |
case OP_PLUSI: |
|
case OP_NOTPLUS: |
|
case OP_NOTPLUSI: |
case OP_MINPLUS: |
case OP_MINPLUS: |
case OP_MINPLUSI: |
case OP_MINPLUSI: |
|
case OP_NOTMINPLUS: |
|
case OP_NOTMINPLUSI: |
case OP_POSPLUS: |
case OP_POSPLUS: |
case OP_POSPLUSI: |
case OP_POSPLUSI: |
|
case OP_NOTPOSPLUS: |
|
case OP_NOTPOSPLUSI: |
case OP_QUERY: |
case OP_QUERY: |
case OP_QUERYI: |
case OP_QUERYI: |
|
case OP_NOTQUERY: |
|
case OP_NOTQUERYI: |
case OP_MINQUERY: |
case OP_MINQUERY: |
case OP_MINQUERYI: |
case OP_MINQUERYI: |
|
case OP_NOTMINQUERY: |
|
case OP_NOTMINQUERYI: |
case OP_POSQUERY: |
case OP_POSQUERY: |
case OP_POSQUERYI: |
case OP_POSQUERYI: |
|
case OP_NOTPOSQUERY: |
|
case OP_NOTPOSQUERYI: |
if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); |
if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); |
break; |
break; |
} |
} |
Line 3069 if (next >= 0) switch(op_code)
|
Line 3112 if (next >= 0) switch(op_code)
|
#endif /* SUPPORT_UTF */ |
#endif /* SUPPORT_UTF */ |
return (c != TABLE_GET((unsigned int)next, cd->fcc, next)); /* Non-UTF-8 mode */ |
return (c != TABLE_GET((unsigned int)next, cd->fcc, next)); /* Non-UTF-8 mode */ |
|
|
/* For OP_NOT and OP_NOTI, the data is always a single-byte character. These |
|
opcodes are not used for multi-byte characters, because they are coded using |
|
an XCLASS instead. */ |
|
|
|
case OP_NOT: |
case OP_NOT: |
return (c = *previous) == next; | #ifdef SUPPORT_UTF |
| GETCHARTEST(c, previous); |
| #else |
| c = *previous; |
| #endif |
| return c == next; |
|
|
case OP_NOTI: |
case OP_NOTI: |
if ((c = *previous) == next) return TRUE; |
|
#ifdef SUPPORT_UTF |
#ifdef SUPPORT_UTF |
|
GETCHARTEST(c, previous); |
|
#else |
|
c = *previous; |
|
#endif |
|
if (c == next) return TRUE; |
|
#ifdef SUPPORT_UTF |
if (utf) |
if (utf) |
{ |
{ |
unsigned int othercase; |
unsigned int othercase; |
if (next < 128) othercase = cd->fcc[next]; else |
if (next < 128) othercase = cd->fcc[next]; else |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
othercase = UCD_OTHERCASE(next); | othercase = UCD_OTHERCASE((unsigned int)next); |
#else |
#else |
othercase = NOTACHAR; |
othercase = NOTACHAR; |
#endif |
#endif |
Line 3092 if (next >= 0) switch(op_code)
|
Line 3141 if (next >= 0) switch(op_code)
|
} |
} |
else |
else |
#endif /* SUPPORT_UTF */ |
#endif /* SUPPORT_UTF */ |
return (c == (int)(TABLE_GET((unsigned int)next, cd->fcc, next))); /* Non-UTF-8 mode */ | return (c == TABLE_GET((unsigned int)next, cd->fcc, next)); /* Non-UTF-8 mode */ |
|
|
/* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set. |
/* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set. |
When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ |
When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ |
|
|
case OP_DIGIT: |
case OP_DIGIT: |
return next > 127 || (cd->ctypes[next] & ctype_digit) == 0; | return next > 255 || (cd->ctypes[next] & ctype_digit) == 0; |
|
|
case OP_NOT_DIGIT: |
case OP_NOT_DIGIT: |
return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0; | return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0; |
|
|
case OP_WHITESPACE: |
case OP_WHITESPACE: |
return next > 127 || (cd->ctypes[next] & ctype_space) == 0; | return next > 255 || (cd->ctypes[next] & ctype_space) == 0; |
|
|
case OP_NOT_WHITESPACE: |
case OP_NOT_WHITESPACE: |
return next <= 127 && (cd->ctypes[next] & ctype_space) != 0; | return next <= 255 && (cd->ctypes[next] & ctype_space) != 0; |
|
|
case OP_WORDCHAR: |
case OP_WORDCHAR: |
return next > 127 || (cd->ctypes[next] & ctype_word) == 0; | return next > 255 || (cd->ctypes[next] & ctype_word) == 0; |
|
|
case OP_NOT_WORDCHAR: |
case OP_NOT_WORDCHAR: |
return next <= 127 && (cd->ctypes[next] & ctype_word) != 0; | return next <= 255 && (cd->ctypes[next] & ctype_word) != 0; |
|
|
case OP_HSPACE: |
case OP_HSPACE: |
case OP_NOT_HSPACE: |
case OP_NOT_HSPACE: |
Line 3191 switch(op_code)
|
Line 3240 switch(op_code)
|
switch(-next) |
switch(-next) |
{ |
{ |
case ESC_d: |
case ESC_d: |
return c > 127 || (cd->ctypes[c] & ctype_digit) == 0; | return c > 255 || (cd->ctypes[c] & ctype_digit) == 0; |
|
|
case ESC_D: |
case ESC_D: |
return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0; | return c <= 255 && (cd->ctypes[c] & ctype_digit) != 0; |
|
|
case ESC_s: |
case ESC_s: |
return c > 127 || (cd->ctypes[c] & ctype_space) == 0; | return c > 255 || (cd->ctypes[c] & ctype_space) == 0; |
|
|
case ESC_S: |
case ESC_S: |
return c <= 127 && (cd->ctypes[c] & ctype_space) != 0; | return c <= 255 && (cd->ctypes[c] & ctype_space) != 0; |
|
|
case ESC_w: |
case ESC_w: |
return c > 127 || (cd->ctypes[c] & ctype_word) == 0; | return c > 255 || (cd->ctypes[c] & ctype_word) == 0; |
|
|
case ESC_W: |
case ESC_W: |
return c <= 127 && (cd->ctypes[c] & ctype_word) != 0; | return c <= 255 && (cd->ctypes[c] & ctype_word) != 0; |
|
|
case ESC_h: |
case ESC_h: |
case ESC_H: |
case ESC_H: |
Line 3315 switch(op_code)
|
Line 3364 switch(op_code)
|
return next == -ESC_d; |
return next == -ESC_d; |
|
|
case OP_WHITESPACE: |
case OP_WHITESPACE: |
return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R; | return next == -ESC_S || next == -ESC_d || next == -ESC_w; |
|
|
case OP_NOT_WHITESPACE: |
case OP_NOT_WHITESPACE: |
return next == -ESC_s || next == -ESC_h || next == -ESC_v; | return next == -ESC_s || next == -ESC_h || next == -ESC_v || next == -ESC_R; |
|
|
case OP_HSPACE: |
case OP_HSPACE: |
return next == -ESC_S || next == -ESC_H || next == -ESC_d || |
return next == -ESC_S || next == -ESC_H || next == -ESC_d || |
Line 4482 for (;; ptr++)
|
Line 4531 for (;; ptr++)
|
LONE_SINGLE_CHARACTER: |
LONE_SINGLE_CHARACTER: |
|
|
/* Only the value of 1 matters for class_single_char. */ |
/* Only the value of 1 matters for class_single_char. */ |
|
|
if (class_single_char < 2) class_single_char++; |
if (class_single_char < 2) class_single_char++; |
|
|
/* If class_charcount is 1, we saw precisely one character. As long as |
/* If class_charcount is 1, we saw precisely one character. As long as |
there were no negated characters >= 128 and there was no use of \p or \P, | there was no use of \p or \P, in other words, no use of any XCLASS |
in other words, no use of any XCLASS features, we can optimize. | features, we can optimize. |
|
|
In UTF-8 mode, we can optimize the negative case only if there were no |
|
characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR |
|
operate on single-bytes characters only. This is an historical hangover. |
|
Maybe one day we can tidy these opcodes to handle multi-byte characters. |
|
|
|
The optimization throws away the bit map. We turn the item into a |
The optimization throws away the bit map. We turn the item into a |
1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative. |
1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative. |
Note that OP_NOT[I] does not support multibyte characters. In the positive | In the positive case, it can cause firstchar to be set. Otherwise, there |
case, it can cause firstchar to be set. Otherwise, there can be no first | can be no first char if this item is first, whatever repeat count may |
char if this item is first, whatever repeat count may follow. In the case | follow. In the case of reqchar, save the previous value for reinstating. */ |
of reqchar, save the previous value for reinstating. */ | |
|
|
#ifdef SUPPORT_UTF |
|
if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET |
|
&& (!utf || !negate_class || c < (MAX_VALUE_FOR_SINGLE_CHAR + 1))) |
|
#else |
|
if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) |
if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) |
#endif |
|
{ |
{ |
ptr++; |
ptr++; |
zeroreqchar = reqchar; |
zeroreqchar = reqchar; |
|
|
/* The OP_NOT[I] opcodes work on single characters only. */ |
|
|
|
if (negate_class) |
if (negate_class) |
{ |
{ |
if (firstchar == REQ_UNSET) firstchar = REQ_NONE; |
if (firstchar == REQ_UNSET) firstchar = REQ_NONE; |
zerofirstchar = firstchar; |
zerofirstchar = firstchar; |
*code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT; |
*code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT; |
*code++ = c; | #ifdef SUPPORT_UTF |
| if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR) |
| code += PRIV(ord2utf)(c, code); |
| else |
| #endif |
| *code++ = c; |
goto NOT_CHAR; |
goto NOT_CHAR; |
} |
} |
|
|
Line 4775 for (;; ptr++)
|
Line 4817 for (;; ptr++)
|
|
|
/* Now handle repetition for the different types of item. */ |
/* Now handle repetition for the different types of item. */ |
|
|
/* If previous was a character match, abolish the item and generate a | /* If previous was a character or negated character match, abolish the item |
repeat item instead. If a char item has a minumum of more than one, ensure | and generate a repeat item instead. If a char item has a minimum of more |
that it is set in reqchar - it might not be if a sequence such as x{3} is | than one, ensure that it is set in reqchar - it might not be if a sequence |
the first thing in a branch because the x will have gone into firstchar | such as x{3} is the first thing in a branch because the x will have gone |
instead. */ | into firstchar instead. */ |
|
|
if (*previous == OP_CHAR || *previous == OP_CHARI) | if (*previous == OP_CHAR || *previous == OP_CHARI |
| || *previous == OP_NOT || *previous == OP_NOTI) |
{ |
{ |
op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR; | switch (*previous) |
| { |
| default: /* Make compiler happy. */ |
| case OP_CHAR: op_type = OP_STAR - OP_STAR; break; |
| case OP_CHARI: op_type = OP_STARI - OP_STAR; break; |
| case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break; |
| case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break; |
| } |
|
|
/* Deal with UTF characters that take up more than one character. It's |
/* Deal with UTF characters that take up more than one character. It's |
easier to write this out separately than try to macrify it. Use c to |
easier to write this out separately than try to macrify it. Use c to |
Line 4806 for (;; ptr++)
|
Line 4856 for (;; ptr++)
|
with UTF disabled, or for a single character UTF character. */ |
with UTF disabled, or for a single character UTF character. */ |
{ |
{ |
c = code[-1]; |
c = code[-1]; |
if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt; | if (*previous <= OP_CHARI && repeat_min > 1) |
| reqchar = c | req_caseopt | cd->req_varyopt; |
} |
} |
|
|
/* If the repetition is unlimited, it pays to see if the next thing on |
/* If the repetition is unlimited, it pays to see if the next thing on |
Line 4825 for (;; ptr++)
|
Line 4876 for (;; ptr++)
|
goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ |
goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ |
} |
} |
|
|
/* If previous was a single negated character ([^a] or similar), we use |
|
one of the special opcodes, replacing it. The code is shared with single- |
|
character repeats by setting opt_type to add a suitable offset into |
|
repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI |
|
are currently used only for single-byte chars. */ |
|
|
|
else if (*previous == OP_NOT || *previous == OP_NOTI) |
|
{ |
|
op_type = ((*previous == OP_NOT)? OP_NOTSTAR : OP_NOTSTARI) - OP_STAR; |
|
c = previous[1]; |
|
if (!possessive_quantifier && |
|
repeat_max < 0 && |
|
check_auto_possessive(previous, utf, ptr + 1, options, cd)) |
|
{ |
|
repeat_type = 0; /* Force greedy */ |
|
possessive_quantifier = TRUE; |
|
} |
|
goto OUTPUT_SINGLE_REPEAT; |
|
} |
|
|
|
/* If previous was a character type match (\d or similar), abolish it and |
/* If previous was a character type match (\d or similar), abolish it and |
create a suitable repeat item. The code is shared with single-character |
create a suitable repeat item. The code is shared with single-character |
repeats by setting op_type to add a suitable offset into repeat_type. Note |
repeats by setting op_type to add a suitable offset into repeat_type. Note |
Line 5585 for (;; ptr++)
|
Line 5616 for (;; ptr++)
|
arg = ++ptr; |
arg = ++ptr; |
while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; |
while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; |
arglen = (int)(ptr - arg); |
arglen = (int)(ptr - arg); |
|
if (arglen > (int)MAX_MARK) |
|
{ |
|
*errorcodeptr = ERR75; |
|
goto FAILED; |
|
} |
} |
} |
|
|
if (*ptr != CHAR_RIGHT_PARENTHESIS) |
if (*ptr != CHAR_RIGHT_PARENTHESIS) |
Line 6836 for (;; ptr++)
|
Line 6872 for (;; ptr++)
|
/* For the rest (including \X when Unicode properties are supported), we |
/* For the rest (including \X when Unicode properties are supported), we |
can obtain the OP value by negating the escape value in the default |
can obtain the OP value by negating the escape value in the default |
situation when PCRE_UCP is not set. When it *is* set, we substitute |
situation when PCRE_UCP is not set. When it *is* set, we substitute |
Unicode property tests. */ | Unicode property tests. Note that \b and \B do a one-character |
| lookbehind. */ |
|
|
else |
else |
{ |
{ |
|
if ((-c == ESC_b || -c == ESC_B) && cd->max_lookbehind == 0) |
|
cd->max_lookbehind = 1; |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
if (-c >= ESC_DU && -c <= ESC_wu) |
if (-c >= ESC_DU && -c <= ESC_wu) |
{ |
{ |
Line 7147 for (;;)
|
Line 7186 for (;;)
|
*ptrptr = ptr; |
*ptrptr = ptr; |
return FALSE; |
return FALSE; |
} |
} |
else { PUT(reverse_count, 0, fixed_length); } | else |
| { |
| if (fixed_length > cd->max_lookbehind) |
| cd->max_lookbehind = fixed_length; |
| PUT(reverse_count, 0, fixed_length); |
| } |
} |
} |
} |
} |
|
|
Line 7817 cd->start_pattern = (const pcre_uchar *)pattern;
|
Line 7861 cd->start_pattern = (const pcre_uchar *)pattern;
|
cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern)); |
cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern)); |
cd->req_varyopt = 0; |
cd->req_varyopt = 0; |
cd->assert_depth = 0; |
cd->assert_depth = 0; |
|
cd->max_lookbehind = 0; |
cd->external_options = options; |
cd->external_options = options; |
cd->external_flags = 0; |
cd->external_flags = 0; |
cd->open_caps = NULL; |
cd->open_caps = NULL; |
Line 7867 re->magic_number = MAGIC_NUMBER;
|
Line 7912 re->magic_number = MAGIC_NUMBER;
|
re->size = (int)size; |
re->size = (int)size; |
re->options = cd->external_options; |
re->options = cd->external_options; |
re->flags = cd->external_flags; |
re->flags = cd->external_flags; |
re->dummy1 = 0; |
|
re->first_char = 0; |
re->first_char = 0; |
re->req_char = 0; |
re->req_char = 0; |
re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar); |
re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar); |
Line 7887 field; this time it's used for remembering forward ref
|
Line 7931 field; this time it's used for remembering forward ref
|
cd->final_bracount = cd->bracount; /* Save for checking forward references */ |
cd->final_bracount = cd->bracount; /* Save for checking forward references */ |
cd->assert_depth = 0; |
cd->assert_depth = 0; |
cd->bracount = 0; |
cd->bracount = 0; |
|
cd->max_lookbehind = 0; |
cd->names_found = 0; |
cd->names_found = 0; |
cd->name_table = (pcre_uchar *)re + re->name_table_offset; |
cd->name_table = (pcre_uchar *)re + re->name_table_offset; |
codestart = cd->name_table + re->name_entry_size * re->name_count; |
codestart = cd->name_table + re->name_entry_size * re->name_count; |
Line 7908 code = (pcre_uchar *)codestart;
|
Line 7953 code = (pcre_uchar *)codestart;
|
&firstchar, &reqchar, NULL, cd, NULL); |
&firstchar, &reqchar, NULL, cd, NULL); |
re->top_bracket = cd->bracount; |
re->top_bracket = cd->bracount; |
re->top_backref = cd->top_backref; |
re->top_backref = cd->top_backref; |
|
re->max_lookbehind = cd->max_lookbehind; |
re->flags = cd->external_flags | PCRE_MODE; |
re->flags = cd->external_flags | PCRE_MODE; |
|
|
if (cd->had_accept) reqchar = REQ_NONE; /* Must disable after (*ACCEPT) */ |
if (cd->had_accept) reqchar = REQ_NONE; /* Must disable after (*ACCEPT) */ |
Line 7995 if (cd->check_lookbehind)
|
Line 8041 if (cd->check_lookbehind)
|
(fixed_length == -4)? ERR70 : ERR25; |
(fixed_length == -4)? ERR70 : ERR25; |
break; |
break; |
} |
} |
|
if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length; |
PUT(cc, 1, fixed_length); |
PUT(cc, 1, fixed_length); |
} |
} |
cc += 1 + LINK_SIZE; |
cc += 1 + LINK_SIZE; |