embedaddon/pcre/pcre_compile.c - diff

Return to pcre_compile.c CVS log

Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / pcre

Diff for /embedaddon/pcre/pcre_compile.c between versions 1.1.1.2 and 1.1.1.3

version 1.1.1.2, 2012/02/21 23:50:25	version 1.1.1.3, 2012/10/09 09:19:17
Line 489 static const char error_texts[] =	Line 489 static const char error_texts[] =
"too many forward references\0"	"too many forward references\0"
"disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"	"disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
"invalid UTF-16 string\0"	"invalid UTF-16 string\0"
	/* 75 */
	"name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
	"character value in \\u.... sequence is too large\0"
;	;

/* Table to identify digits and hex digits. This is used when compiling	/* Table to identify digits and hex digits. This is used when compiling
Line 829 else	Line 832 else
c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));	c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
#endif	#endif
}	}

	#ifdef COMPILE_PCRE8
	if (c > (utf ? 0x10ffff : 0xff))
	#else
	#ifdef COMPILE_PCRE16
	if (c > (utf ? 0x10ffff : 0xffff))
	#endif
	#endif
	{
	*errorcodeptr = ERR76;
	}
	else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
}	}
}	}
else	else
Line 2225 for (;;)	Line 2240 for (;;)
{	{
case OP_CHAR:	case OP_CHAR:
case OP_CHARI:	case OP_CHARI:
	case OP_NOT:
	case OP_NOTI:
case OP_EXACT:	case OP_EXACT:
case OP_EXACTI:	case OP_EXACTI:
	case OP_NOTEXACT:
	case OP_NOTEXACTI:
case OP_UPTO:	case OP_UPTO:
case OP_UPTOI:	case OP_UPTOI:
	case OP_NOTUPTO:
	case OP_NOTUPTOI:
case OP_MINUPTO:	case OP_MINUPTO:
case OP_MINUPTOI:	case OP_MINUPTOI:
	case OP_NOTMINUPTO:
	case OP_NOTMINUPTOI:
case OP_POSUPTO:	case OP_POSUPTO:
case OP_POSUPTOI:	case OP_POSUPTOI:
	case OP_NOTPOSUPTO:
	case OP_NOTPOSUPTOI:
case OP_STAR:	case OP_STAR:
case OP_STARI:	case OP_STARI:
	case OP_NOTSTAR:
	case OP_NOTSTARI:
case OP_MINSTAR:	case OP_MINSTAR:
case OP_MINSTARI:	case OP_MINSTARI:
	case OP_NOTMINSTAR:
	case OP_NOTMINSTARI:
case OP_POSSTAR:	case OP_POSSTAR:
case OP_POSSTARI:	case OP_POSSTARI:
	case OP_NOTPOSSTAR:
	case OP_NOTPOSSTARI:
case OP_PLUS:	case OP_PLUS:
case OP_PLUSI:	case OP_PLUSI:
	case OP_NOTPLUS:
	case OP_NOTPLUSI:
case OP_MINPLUS:	case OP_MINPLUS:
case OP_MINPLUSI:	case OP_MINPLUSI:
	case OP_NOTMINPLUS:
	case OP_NOTMINPLUSI:
case OP_POSPLUS:	case OP_POSPLUS:
case OP_POSPLUSI:	case OP_POSPLUSI:
	case OP_NOTPOSPLUS:
	case OP_NOTPOSPLUSI:
case OP_QUERY:	case OP_QUERY:
case OP_QUERYI:	case OP_QUERYI:
	case OP_NOTQUERY:
	case OP_NOTQUERYI:
case OP_MINQUERY:	case OP_MINQUERY:
case OP_MINQUERYI:	case OP_MINQUERYI:
	case OP_NOTMINQUERY:
	case OP_NOTMINQUERYI:
case OP_POSQUERY:	case OP_POSQUERY:
case OP_POSQUERYI:	case OP_POSQUERYI:
	case OP_NOTPOSQUERY:
	case OP_NOTPOSQUERYI:
if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);	if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
break;	break;
}	}
Line 3069 if (next >= 0) switch(op_code)	Line 3112 if (next >= 0) switch(op_code)
#endif /* SUPPORT_UTF */	#endif /* SUPPORT_UTF */
return (c != TABLE_GET((unsigned int)next, cd->fcc, next)); /* Non-UTF-8 mode */	return (c != TABLE_GET((unsigned int)next, cd->fcc, next)); /* Non-UTF-8 mode */

/* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
opcodes are not used for multi-byte characters, because they are coded using
an XCLASS instead. */

case OP_NOT:	case OP_NOT:
return (c = *previous) == next;	#ifdef SUPPORT_UTF
	GETCHARTEST(c, previous);
	#else
	c = *previous;
	#endif
	return c == next;

case OP_NOTI:	case OP_NOTI:
if ((c = *previous) == next) return TRUE;
#ifdef SUPPORT_UTF	#ifdef SUPPORT_UTF
	GETCHARTEST(c, previous);
	#else
	c = *previous;
	#endif
	if (c == next) return TRUE;
	#ifdef SUPPORT_UTF
if (utf)	if (utf)
{	{
unsigned int othercase;	unsigned int othercase;
if (next < 128) othercase = cd->fcc[next]; else	if (next < 128) othercase = cd->fcc[next]; else
#ifdef SUPPORT_UCP	#ifdef SUPPORT_UCP
othercase = UCD_OTHERCASE(next);	othercase = UCD_OTHERCASE((unsigned int)next);
#else	#else
othercase = NOTACHAR;	othercase = NOTACHAR;
#endif	#endif
Line 3092 if (next >= 0) switch(op_code)	Line 3141 if (next >= 0) switch(op_code)
}	}
else	else
#endif /* SUPPORT_UTF */	#endif /* SUPPORT_UTF */
return (c == (int)(TABLE_GET((unsigned int)next, cd->fcc, next))); /* Non-UTF-8 mode */	return (c == TABLE_GET((unsigned int)next, cd->fcc, next)); /* Non-UTF-8 mode */

/* Note that OP_DIGIT etc. are generated only when PCRE_UCP is not set.	/* Note that OP_DIGIT etc. are generated only when PCRE_UCP is not set.
When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */	When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */

case OP_DIGIT:	case OP_DIGIT:
return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;	return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;

case OP_NOT_DIGIT:	case OP_NOT_DIGIT:
return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;	return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;

case OP_WHITESPACE:	case OP_WHITESPACE:
return next > 127 || (cd->ctypes[next] & ctype_space) == 0;	return next > 255 || (cd->ctypes[next] & ctype_space) == 0;

case OP_NOT_WHITESPACE:	case OP_NOT_WHITESPACE:
return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;	return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;

case OP_WORDCHAR:	case OP_WORDCHAR:
return next > 127 || (cd->ctypes[next] & ctype_word) == 0;	return next > 255 || (cd->ctypes[next] & ctype_word) == 0;

case OP_NOT_WORDCHAR:	case OP_NOT_WORDCHAR:
return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;	return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;

case OP_HSPACE:	case OP_HSPACE:
case OP_NOT_HSPACE:	case OP_NOT_HSPACE:
Line 3191 switch(op_code)	Line 3240 switch(op_code)
switch(-next)	switch(-next)
{	{
case ESC_d:	case ESC_d:
return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;	return c > 255 || (cd->ctypes[c] & ctype_digit) == 0;

case ESC_D:	case ESC_D:
return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;	return c <= 255 && (cd->ctypes[c] & ctype_digit) != 0;

case ESC_s:	case ESC_s:
return c > 127 || (cd->ctypes[c] & ctype_space) == 0;	return c > 255 || (cd->ctypes[c] & ctype_space) == 0;

case ESC_S:	case ESC_S:
return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;	return c <= 255 && (cd->ctypes[c] & ctype_space) != 0;

case ESC_w:	case ESC_w:
return c > 127 || (cd->ctypes[c] & ctype_word) == 0;	return c > 255 || (cd->ctypes[c] & ctype_word) == 0;

case ESC_W:	case ESC_W:
return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;	return c <= 255 && (cd->ctypes[c] & ctype_word) != 0;

case ESC_h:	case ESC_h:
case ESC_H:	case ESC_H:
Line 3315 switch(op_code)	Line 3364 switch(op_code)
return next == -ESC_d;	return next == -ESC_d;

case OP_WHITESPACE:	case OP_WHITESPACE:
return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;	return next == -ESC_S || next == -ESC_d || next == -ESC_w;

case OP_NOT_WHITESPACE:	case OP_NOT_WHITESPACE:
return next == -ESC_s || next == -ESC_h || next == -ESC_v;	return next == -ESC_s || next == -ESC_h || next == -ESC_v || next == -ESC_R;

case OP_HSPACE:	case OP_HSPACE:
return next == -ESC_S || next == -ESC_H || next == -ESC_d ||	return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
Line 4482 for (;; ptr++)	Line 4531 for (;; ptr++)
LONE_SINGLE_CHARACTER:	LONE_SINGLE_CHARACTER:

/* Only the value of 1 matters for class_single_char. */	/* Only the value of 1 matters for class_single_char. */

if (class_single_char < 2) class_single_char++;	if (class_single_char < 2) class_single_char++;

/* If class_charcount is 1, we saw precisely one character. As long as	/* If class_charcount is 1, we saw precisely one character. As long as
there were no negated characters >= 128 and there was no use of \p or \P,	there was no use of \p or \P, in other words, no use of any XCLASS
in other words, no use of any XCLASS features, we can optimize.	features, we can optimize.

In UTF-8 mode, we can optimize the negative case only if there were no
characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
operate on single-bytes characters only. This is an historical hangover.
Maybe one day we can tidy these opcodes to handle multi-byte characters.

The optimization throws away the bit map. We turn the item into a	The optimization throws away the bit map. We turn the item into a
1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.	1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
Note that OP_NOT[I] does not support multibyte characters. In the positive	In the positive case, it can cause firstchar to be set. Otherwise, there
case, it can cause firstchar to be set. Otherwise, there can be no first	can be no first char if this item is first, whatever repeat count may
char if this item is first, whatever repeat count may follow. In the case	follow. In the case of reqchar, save the previous value for reinstating. */
of reqchar, save the previous value for reinstating. */

#ifdef SUPPORT_UTF
if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET
&& (!utf || !negate_class || c < (MAX_VALUE_FOR_SINGLE_CHAR + 1)))
#else
if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)	if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
#endif
{	{
ptr++;	ptr++;
zeroreqchar = reqchar;	zeroreqchar = reqchar;

/* The OP_NOT[I] opcodes work on single characters only. */

if (negate_class)	if (negate_class)
{	{
if (firstchar == REQ_UNSET) firstchar = REQ_NONE;	if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
zerofirstchar = firstchar;	zerofirstchar = firstchar;
*code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;	*code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
*code++ = c;	#ifdef SUPPORT_UTF
	if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
	code += PRIV(ord2utf)(c, code);
	else
	#endif
	*code++ = c;
goto NOT_CHAR;	goto NOT_CHAR;
}	}

Line 4775 for (;; ptr++)	Line 4817 for (;; ptr++)

/* Now handle repetition for the different types of item. */	/* Now handle repetition for the different types of item. */

/* If previous was a character match, abolish the item and generate a	/* If previous was a character or negated character match, abolish the item
repeat item instead. If a char item has a minumum of more than one, ensure	and generate a repeat item instead. If a char item has a minimum of more
that it is set in reqchar - it might not be if a sequence such as x{3} is	than one, ensure that it is set in reqchar - it might not be if a sequence
the first thing in a branch because the x will have gone into firstchar	such as x{3} is the first thing in a branch because the x will have gone
instead. */	into firstchar instead. */

if (*previous == OP_CHAR || *previous == OP_CHARI)	if (*previous == OP_CHAR || *previous == OP_CHARI
	|| *previous == OP_NOT || *previous == OP_NOTI)
{	{
op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;	switch (*previous)
	{
	default: /* Make compiler happy. */
	case OP_CHAR: op_type = OP_STAR - OP_STAR; break;
	case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
	case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break;
	case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break;
	}

/* Deal with UTF characters that take up more than one character. It's	/* Deal with UTF characters that take up more than one character. It's
easier to write this out separately than try to macrify it. Use c to	easier to write this out separately than try to macrify it. Use c to
Line 4806 for (;; ptr++)	Line 4856 for (;; ptr++)
with UTF disabled, or for a single character UTF character. */	with UTF disabled, or for a single character UTF character. */
{	{
c = code[-1];	c = code[-1];
if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt;	if (*previous <= OP_CHARI && repeat_min > 1)
	reqchar = c | req_caseopt | cd->req_varyopt;
}	}

/* If the repetition is unlimited, it pays to see if the next thing on	/* If the repetition is unlimited, it pays to see if the next thing on
Line 4825 for (;; ptr++)	Line 4876 for (;; ptr++)
goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */	goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
}	}

/* If previous was a single negated character ([^a] or similar), we use
one of the special opcodes, replacing it. The code is shared with single-
character repeats by setting opt_type to add a suitable offset into
repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI
are currently used only for single-byte chars. */

else if (*previous == OP_NOT || *previous == OP_NOTI)
{
op_type = ((*previous == OP_NOT)? OP_NOTSTAR : OP_NOTSTARI) - OP_STAR;
c = previous[1];
if (!possessive_quantifier &&
repeat_max < 0 &&
check_auto_possessive(previous, utf, ptr + 1, options, cd))
{
repeat_type = 0; /* Force greedy */
possessive_quantifier = TRUE;
}
goto OUTPUT_SINGLE_REPEAT;
}

/* If previous was a character type match (\d or similar), abolish it and	/* If previous was a character type match (\d or similar), abolish it and
create a suitable repeat item. The code is shared with single-character	create a suitable repeat item. The code is shared with single-character
repeats by setting op_type to add a suitable offset into repeat_type. Note	repeats by setting op_type to add a suitable offset into repeat_type. Note
Line 5585 for (;; ptr++)	Line 5616 for (;; ptr++)
arg = ++ptr;	arg = ++ptr;
while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;	while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
arglen = (int)(ptr - arg);	arglen = (int)(ptr - arg);
	if (arglen > (int)MAX_MARK)
	{
	*errorcodeptr = ERR75;
	goto FAILED;
	}
}	}

if (*ptr != CHAR_RIGHT_PARENTHESIS)	if (*ptr != CHAR_RIGHT_PARENTHESIS)
Line 6836 for (;; ptr++)	Line 6872 for (;; ptr++)
/* For the rest (including \X when Unicode properties are supported), we	/* For the rest (including \X when Unicode properties are supported), we
can obtain the OP value by negating the escape value in the default	can obtain the OP value by negating the escape value in the default
situation when PCRE_UCP is not set. When it is set, we substitute	situation when PCRE_UCP is not set. When it is set, we substitute
Unicode property tests. */	Unicode property tests. Note that \b and \B do a one-character
	lookbehind. */

else	else
{	{
	if ((-c == ESC_b \|\| -c == ESC_B) && cd->max_lookbehind == 0)
	cd->max_lookbehind = 1;
#ifdef SUPPORT_UCP	#ifdef SUPPORT_UCP
if (-c >= ESC_DU && -c <= ESC_wu)	if (-c >= ESC_DU && -c <= ESC_wu)
{	{
Line 7147 for (;;)	Line 7186 for (;;)
*ptrptr = ptr;	*ptrptr = ptr;
return FALSE;	return FALSE;
}	}
else { PUT(reverse_count, 0, fixed_length); }	else
	{
	if (fixed_length > cd->max_lookbehind)
	cd->max_lookbehind = fixed_length;
	PUT(reverse_count, 0, fixed_length);
	}
}	}
}	}

Line 7817 cd->start_pattern = (const pcre_uchar *)pattern;	Line 7861 cd->start_pattern = (const pcre_uchar *)pattern;
cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));	cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
cd->req_varyopt = 0;	cd->req_varyopt = 0;
cd->assert_depth = 0;	cd->assert_depth = 0;
	cd->max_lookbehind = 0;
cd->external_options = options;	cd->external_options = options;
cd->external_flags = 0;	cd->external_flags = 0;
cd->open_caps = NULL;	cd->open_caps = NULL;
Line 7867 re->magic_number = MAGIC_NUMBER;	Line 7912 re->magic_number = MAGIC_NUMBER;
re->size = (int)size;	re->size = (int)size;
re->options = cd->external_options;	re->options = cd->external_options;
re->flags = cd->external_flags;	re->flags = cd->external_flags;
re->dummy1 = 0;
re->first_char = 0;	re->first_char = 0;
re->req_char = 0;	re->req_char = 0;
re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);	re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
Line 7887 field; this time it's used for remembering forward ref	Line 7931 field; this time it's used for remembering forward ref
cd->final_bracount = cd->bracount; /* Save for checking forward references */	cd->final_bracount = cd->bracount; /* Save for checking forward references */
cd->assert_depth = 0;	cd->assert_depth = 0;
cd->bracount = 0;	cd->bracount = 0;
	cd->max_lookbehind = 0;
cd->names_found = 0;	cd->names_found = 0;
cd->name_table = (pcre_uchar *)re + re->name_table_offset;	cd->name_table = (pcre_uchar *)re + re->name_table_offset;
codestart = cd->name_table + re->name_entry_size * re->name_count;	codestart = cd->name_table + re->name_entry_size * re->name_count;
Line 7908 code = (pcre_uchar *)codestart;	Line 7953 code = (pcre_uchar *)codestart;
&firstchar, &reqchar, NULL, cd, NULL);	&firstchar, &reqchar, NULL, cd, NULL);
re->top_bracket = cd->bracount;	re->top_bracket = cd->bracount;
re->top_backref = cd->top_backref;	re->top_backref = cd->top_backref;
	re->max_lookbehind = cd->max_lookbehind;
re->flags = cd->external_flags | PCRE_MODE;	re->flags = cd->external_flags | PCRE_MODE;

if (cd->had_accept) reqchar = REQ_NONE; /* Must disable after (*ACCEPT) */	if (cd->had_accept) reqchar = REQ_NONE; /* Must disable after (*ACCEPT) */
Line 7995 if (cd->check_lookbehind)	Line 8041 if (cd->check_lookbehind)
(fixed_length == -4)? ERR70 : ERR25;	(fixed_length == -4)? ERR70 : ERR25;
break;	break;
}	}
	if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
PUT(cc, 1, fixed_length);	PUT(cc, 1, fixed_length);
}	}
cc += 1 + LINK_SIZE;	cc += 1 + LINK_SIZE;

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>

Removed from v.1.1.1.2
changed lines
	Added in v.1.1.1.3