--- embedaddon/pcre/pcre_dfa_exec.c 2012/02/21 23:05:51 1.1.1.1 +++ embedaddon/pcre/pcre_dfa_exec.c 2013/07/22 08:25:55 1.1.1.4 @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the below for why this module is different). Written by Philip Hazel - Copyright (c) 1997-2011 University of Cambridge + Copyright (c) 1997-2013 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -38,10 +38,9 @@ POSSIBILITY OF SUCH DAMAGE. ----------------------------------------------------------------------------- */ - /* This module contains the external function pcre_dfa_exec(), which is an alternative matching function that uses a sort of DFA algorithm (not a true -FSM). This is NOT Perl- compatible, but it has advantages in certain +FSM). This is NOT Perl-compatible, but it has advantages in certain applications. */ @@ -113,7 +112,7 @@ small value. Non-zero values in the table are the offs the character is to be found. ***NOTE*** If the start of this table is modified, the three tables that follow must also be modified. */ -static const uschar coptable[] = { +static const pcre_uint8 coptable[] = { 0, /* End */ 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */ 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */ @@ -128,22 +127,27 @@ static const uschar coptable[] = { 1, /* noti */ /* Positive single-char repeats */ 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ - 3, 3, 3, /* upto, minupto, exact */ - 1, 1, 1, 3, /* *+, ++, ?+, upto+ */ + 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */ + 1+IMM2_SIZE, /* exact */ + 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */ 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */ - 3, 3, 3, /* upto I, minupto I, exact I */ - 1, 1, 1, 3, /* *+I, ++I, ?+I, upto+I */ + 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */ + 1+IMM2_SIZE, /* exact I */ + 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */ /* Negative single-char repeats - only for chars < 256 */ 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */ - 3, 3, 3, /* NOT upto, minupto, exact */ - 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */ + 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */ + 1+IMM2_SIZE, /* NOT exact */ + 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */ 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */ - 3, 3, 3, /* NOT upto I, minupto I, exact I */ - 1, 1, 1, 3, /* NOT *+I, ++I, ?+I, upto+I */ + 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */ + 1+IMM2_SIZE, /* NOT exact I */ + 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */ /* Positive type repeats */ 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */ - 3, 3, 3, /* Type upto, minupto, exact */ - 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */ + 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */ + 1+IMM2_SIZE, /* Type exact */ + 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */ /* Character class & ref repeats */ 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */ 0, 0, /* CRRANGE, CRMINRANGE */ @@ -182,7 +186,7 @@ remember the fact that a character could have been ins the subject is reached. ***NOTE*** If the start of this table is modified, the two tables that follow must also be modified. */ -static const uschar poptable[] = { +static const pcre_uint8 poptable[] = { 0, /* End */ 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */ 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ @@ -249,7 +253,7 @@ static const uschar poptable[] = { /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W, and \w */ -static const uschar toptable1[] = { +static const pcre_uint8 toptable1[] = { 0, 0, 0, 0, 0, 0, ctype_digit, ctype_digit, ctype_space, ctype_space, @@ -257,7 +261,7 @@ static const uschar toptable1[] = { 0, 0 /* OP_ANY, OP_ALLANY */ }; -static const uschar toptable2[] = { +static const pcre_uint8 toptable2[] = { 0, 0, 0, 0, 0, 0, ctype_digit, 0, ctype_space, 0, @@ -277,7 +281,7 @@ typedef struct stateblock { int data; /* Some use extra data */ } stateblock; -#define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int)) +#define INTS_PER_STATEBLOCK (int)(sizeof(stateblock)/sizeof(int)) #ifdef PCRE_DEBUG @@ -296,15 +300,15 @@ Returns: nothing */ static void -pchars(unsigned char *p, int length, FILE *f) +pchars(const pcre_uchar *p, int length, FILE *f) { -int c; +pcre_uint32 c; while (length-- > 0) { if (isprint(c = *(p++))) fprintf(f, "%c", c); else - fprintf(f, "\\x%02x", c); + fprintf(f, "\\x{%02x}", c); } } #endif @@ -377,7 +381,8 @@ for the current character, one for the following chara next_new_state->count = (y); \ next_new_state->data = (z); \ next_new_state++; \ - DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \ + DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \ + (x), (y), (z), __LINE__)); \ } \ else return PCRE_ERROR_DFA_WSSIZE @@ -386,8 +391,8 @@ for the current character, one for the following chara static int internal_dfa_exec( dfa_match_data *md, - const uschar *this_start_code, - const uschar *current_subject, + const pcre_uchar *this_start_code, + const pcre_uchar *current_subject, int start_offset, int *offsets, int offsetcount, @@ -398,9 +403,9 @@ internal_dfa_exec( stateblock *active_states, *new_states, *temp_states; stateblock *next_active_state, *next_new_state; -const uschar *ctypes, *lcc, *fcc; -const uschar *ptr; -const uschar *end_code, *first_op; +const pcre_uint8 *ctypes, *lcc, *fcc; +const pcre_uchar *ptr; +const pcre_uchar *end_code, *first_op; dfa_recursion_info new_recursive; @@ -409,16 +414,18 @@ int active_count, new_count, match_count; /* Some fields in the md block are frequently referenced, so we load them into independent variables in the hope that this will perform better. */ -const uschar *start_subject = md->start_subject; -const uschar *end_subject = md->end_subject; -const uschar *start_code = md->start_code; +const pcre_uchar *start_subject = md->start_subject; +const pcre_uchar *end_subject = md->end_subject; +const pcre_uchar *start_code = md->start_code; -#ifdef SUPPORT_UTF8 -BOOL utf8 = (md->poptions & PCRE_UTF8) != 0; +#ifdef SUPPORT_UTF +BOOL utf = (md->poptions & PCRE_UTF8) != 0; #else -BOOL utf8 = FALSE; +BOOL utf = FALSE; #endif +BOOL reset_could_continue = FALSE; + rlevel++; offsetcount &= (-2); @@ -442,7 +449,8 @@ new_count = 0; first_op = this_start_code + 1 + LINK_SIZE + ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA || - *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)? 2:0); + *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS) + ? IMM2_SIZE:0); /* The first thing in any (sub) pattern is a bracket of some sort. Push all the alternative states onto the list, and find out where the end is. This @@ -470,18 +478,16 @@ if (*first_op == OP_REVERSE) /* If we can't go back the amount required for the longest lookbehind pattern, go back as far as we can; some alternatives may still be viable. */ -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF /* In character mode we have to step back character by character */ - if (utf8) + if (utf) { for (gone_back = 0; gone_back < max_back; gone_back++) { if (current_subject <= start_subject) break; current_subject--; - while (current_subject > start_subject && - (*current_subject & 0xc0) == 0x80) - current_subject--; + ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--); } } else @@ -542,8 +548,8 @@ else { int length = 1 + LINK_SIZE + ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA || - *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)? - 2:0); + *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS) + ? IMM2_SIZE:0); do { ADD_NEW((int)(end_code - start_code + length), 0); @@ -556,7 +562,7 @@ else workspace[0] = 0; /* Bit indicating which vector is current */ -DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code)); +DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code))); /* Loop for scanning the subject */ @@ -565,9 +571,11 @@ for (;;) { int i, j; int clen, dlen; - unsigned int c, d; + pcre_uint32 c, d; int forced_fail = 0; - BOOL could_continue = FALSE; + BOOL partial_newline = FALSE; + BOOL could_continue = reset_could_continue; + reset_could_continue = FALSE; /* Make the new state list into the active state list and empty the new state list. */ @@ -583,7 +591,7 @@ for (;;) #ifdef PCRE_DEBUG printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP); - pchars((uschar *)ptr, strlen((char *)ptr), stdout); + pchars(ptr, STRLEN_UC(ptr), stdout); printf("\"\n"); printf("%.*sActive states: ", rlevel*2-2, SP); @@ -603,11 +611,12 @@ for (;;) if (ptr < end_subject) { - clen = 1; /* Number of bytes in the character */ -#ifdef SUPPORT_UTF8 - if (utf8) { GETCHARLEN(c, ptr, clen); } else -#endif /* SUPPORT_UTF8 */ + clen = 1; /* Number of data items in the character */ +#ifdef SUPPORT_UTF + GETCHARLENTEST(c, ptr, clen); +#else c = *ptr; +#endif /* SUPPORT_UTF */ } else { @@ -624,9 +633,10 @@ for (;;) { stateblock *current_state = active_states + i; BOOL caseless = FALSE; - const uschar *code; + const pcre_uchar *code; int state_offset = current_state->offset; - int count, codevalue, rrc; + int codevalue, rrc; + int count; #ifdef PCRE_DEBUG printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset); @@ -637,7 +647,8 @@ for (;;) /* A negative offset is a special case meaning "hold off going to this (negated) state until the number of characters in the data field have - been skipped". */ + been skipped". If the could_continue flag was passed over from a previous + state, arrange for it to passed on. */ if (state_offset < 0) { @@ -646,6 +657,7 @@ for (;;) DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP)); ADD_NEW_DATA(state_offset, current_state->count, current_state->data - 1); + if (could_continue) reset_could_continue = TRUE; continue; } else @@ -685,17 +697,17 @@ for (;;) permitted. We also use this mechanism for opcodes such as OP_TYPEPLUS that take an - argument that is not a data character - but is always one byte long. We - have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in - this case. To keep the other cases fast, convert these ones to new opcodes. - */ + argument that is not a data character - but is always one byte long because + the values are small. We have to take special action to deal with \P, \p, + \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert + these ones to new opcodes. */ if (coptable[codevalue] > 0) { dlen = 1; -#ifdef SUPPORT_UTF8 - if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else -#endif /* SUPPORT_UTF8 */ +#ifdef SUPPORT_UTF + if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else +#endif /* SUPPORT_UTF */ d = code[coptable[codevalue]]; if (codevalue >= OP_TYPESTAR) { @@ -779,7 +791,7 @@ for (;;) offsets[0] = (int)(current_subject - start_subject); offsets[1] = (int)(ptr - start_subject); DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP, - offsets[1] - offsets[0], current_subject)); + offsets[1] - offsets[0], (char *)current_subject)); } if ((md->moptions & PCRE_DFA_SHORTEST) != 0) { @@ -816,7 +828,7 @@ for (;;) /*-----------------------------------------------------------------*/ case OP_CBRA: case OP_SCBRA: - ADD_ACTIVE((int)(code - start_code + 3 + LINK_SIZE), 0); + ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0); code += GET(code, 1); while (*code == OP_ALT) { @@ -884,7 +896,20 @@ for (;;) /*-----------------------------------------------------------------*/ case OP_ANY: if (clen > 0 && !IS_NEWLINE(ptr)) - { ADD_NEW(state_offset + 1, 0); } + { + if (ptr + 1 >= md->end_subject && + (md->moptions & (PCRE_PARTIAL_HARD)) != 0 && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + c == NLBLOCK->nl[0]) + { + could_continue = partial_newline = TRUE; + } + else + { + ADD_NEW(state_offset + 1, 0); + } + } break; /*-----------------------------------------------------------------*/ @@ -912,6 +937,19 @@ for (;;) (ptr == end_subject - md->nllen) )) { ADD_ACTIVE(state_offset + 1, 0); } + else if (ptr + 1 >= md->end_subject && + (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + c == NLBLOCK->nl[0]) + { + if ((md->moptions & PCRE_PARTIAL_HARD) != 0) + { + reset_could_continue = TRUE; + ADD_NEW_DATA(-(state_offset + 1), 0, 1); + } + else could_continue = partial_newline = TRUE; + } } break; @@ -924,6 +962,19 @@ for (;;) else if (clen == 0 || ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr))) { ADD_ACTIVE(state_offset + 1, 0); } + else if (ptr + 1 >= md->end_subject && + (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + c == NLBLOCK->nl[0]) + { + if ((md->moptions & PCRE_PARTIAL_HARD) != 0) + { + reset_could_continue = TRUE; + ADD_NEW_DATA(-(state_offset + 1), 0, 1); + } + else could_continue = partial_newline = TRUE; + } } else if (IS_NEWLINE(ptr)) { ADD_ACTIVE(state_offset + 1, 0); } @@ -956,10 +1007,10 @@ for (;;) if (ptr > start_subject) { - const uschar *temp = ptr - 1; + const pcre_uchar *temp = ptr - 1; if (temp < md->start_used_ptr) md->start_used_ptr = temp; -#ifdef SUPPORT_UTF8 - if (utf8) BACKCHAR(temp); +#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 + if (utf) { BACKCHAR(temp); } #endif GETCHARTEST(d, temp); #ifdef SUPPORT_UCP @@ -1011,6 +1062,7 @@ for (;;) if (clen > 0) { BOOL OK; + const pcre_uint32 *cp; const ucd_record * prop = GET_UCD(c); switch(code[1]) { @@ -1024,7 +1076,7 @@ for (;;) break; case PT_GC: - OK = _pcre_ucp_gentype[prop->chartype] == code[2]; + OK = PRIV(ucp_gentype)[prop->chartype] == code[2]; break; case PT_PC: @@ -1038,27 +1090,42 @@ for (;;) /* These are specials for combination cases. */ case PT_ALNUM: - OK = _pcre_ucp_gentype[prop->chartype] == ucp_L || - _pcre_ucp_gentype[prop->chartype] == ucp_N; + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N; break; case PT_SPACE: /* Perl space */ - OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z || + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; break; case PT_PXSPACE: /* POSIX space */ - OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z || + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || c == CHAR_FF || c == CHAR_CR; break; case PT_WORD: - OK = _pcre_ucp_gentype[prop->chartype] == ucp_L || - _pcre_ucp_gentype[prop->chartype] == ucp_N || + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE; break; + case PT_CLIST: + cp = PRIV(ucd_caseless_sets) + code[2]; + for (;;) + { + if (c < *cp) { OK = FALSE; break; } + if (c == *cp++) { OK = TRUE; break; } + } + break; + + case PT_UCNC: + OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000; + break; + /* Should never occur, but keep compilers from grumbling. */ default: @@ -1086,7 +1153,15 @@ for (;;) if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } if (clen > 0) { - if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || + if (d == OP_ANY && ptr + 1 >= md->end_subject && + (md->moptions & (PCRE_PARTIAL_HARD)) != 0 && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + c == NLBLOCK->nl[0]) + { + could_continue = partial_newline = TRUE; + } + else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || (c < 256 && (d != OP_ANY || !IS_NEWLINE(ptr)) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) @@ -1109,7 +1184,15 @@ for (;;) ADD_ACTIVE(state_offset + 2, 0); if (clen > 0) { - if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || + if (d == OP_ANY && ptr + 1 >= md->end_subject && + (md->moptions & (PCRE_PARTIAL_HARD)) != 0 && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + c == NLBLOCK->nl[0]) + { + could_continue = partial_newline = TRUE; + } + else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || (c < 256 && (d != OP_ANY || !IS_NEWLINE(ptr)) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) @@ -1131,7 +1214,15 @@ for (;;) ADD_ACTIVE(state_offset + 2, 0); if (clen > 0) { - if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || + if (d == OP_ANY && ptr + 1 >= md->end_subject && + (md->moptions & (PCRE_PARTIAL_HARD)) != 0 && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + c == NLBLOCK->nl[0]) + { + could_continue = partial_newline = TRUE; + } + else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || (c < 256 && (d != OP_ANY || !IS_NEWLINE(ptr)) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) @@ -1151,13 +1242,21 @@ for (;;) count = current_state->count; /* Number already matched */ if (clen > 0) { - if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || + if (d == OP_ANY && ptr + 1 >= md->end_subject && + (md->moptions & (PCRE_PARTIAL_HARD)) != 0 && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + c == NLBLOCK->nl[0]) + { + could_continue = partial_newline = TRUE; + } + else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || (c < 256 && (d != OP_ANY || !IS_NEWLINE(ptr)) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) { - if (++count >= GET2(code, 1)) - { ADD_NEW(state_offset + 4, 0); } + if (++count >= (int)GET2(code, 1)) + { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); } else { ADD_NEW(state_offset, count); } } @@ -1168,11 +1267,19 @@ for (;;) case OP_TYPEUPTO: case OP_TYPEMINUPTO: case OP_TYPEPOSUPTO: - ADD_ACTIVE(state_offset + 4, 0); + ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); count = current_state->count; /* Number already matched */ if (clen > 0) { - if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || + if (d == OP_ANY && ptr + 1 >= md->end_subject && + (md->moptions & (PCRE_PARTIAL_HARD)) != 0 && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + c == NLBLOCK->nl[0]) + { + could_continue = partial_newline = TRUE; + } + else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || (c < 256 && (d != OP_ANY || !IS_NEWLINE(ptr)) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) @@ -1182,8 +1289,8 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - if (++count >= GET2(code, 1)) - { ADD_NEW(state_offset + 4, 0); } + if (++count >= (int)GET2(code, 1)) + { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); } else { ADD_NEW(state_offset, count); } } @@ -1205,6 +1312,7 @@ for (;;) if (clen > 0) { BOOL OK; + const pcre_uint32 *cp; const ucd_record * prop = GET_UCD(c); switch(code[2]) { @@ -1218,7 +1326,7 @@ for (;;) break; case PT_GC: - OK = _pcre_ucp_gentype[prop->chartype] == code[3]; + OK = PRIV(ucp_gentype)[prop->chartype] == code[3]; break; case PT_PC: @@ -1232,27 +1340,42 @@ for (;;) /* These are specials for combination cases. */ case PT_ALNUM: - OK = _pcre_ucp_gentype[prop->chartype] == ucp_L || - _pcre_ucp_gentype[prop->chartype] == ucp_N; + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N; break; case PT_SPACE: /* Perl space */ - OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z || + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; break; case PT_PXSPACE: /* POSIX space */ - OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z || + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || c == CHAR_FF || c == CHAR_CR; break; case PT_WORD: - OK = _pcre_ucp_gentype[prop->chartype] == ucp_L || - _pcre_ucp_gentype[prop->chartype] == ucp_N || + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE; break; + case PT_CLIST: + cp = PRIV(ucd_caseless_sets) + code[3]; + for (;;) + { + if (c < *cp) { OK = FALSE; break; } + if (c == *cp++) { OK = TRUE; break; } + } + break; + + case PT_UCNC: + OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000; + break; + /* Should never occur, but keep compilers from grumbling. */ default: @@ -1279,23 +1402,26 @@ for (;;) case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS: count = current_state->count; /* Already matched */ if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } - if (clen > 0 && UCD_CATEGORY(c) != ucp_M) + if (clen > 0) { - const uschar *nptr = ptr + clen; + int lgb, rgb; + const pcre_uchar *nptr = ptr + clen; int ncount = 0; if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS) { active_count--; /* Remove non-match possibility */ next_active_state--; } + lgb = UCD_GRAPHBREAK(c); while (nptr < end_subject) { - int nd; - int ndlen = 1; - GETCHARLEN(nd, nptr, ndlen); - if (UCD_CATEGORY(nd) != ucp_M) break; + dlen = 1; + if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } + rgb = UCD_GRAPHBREAK(d); + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; ncount++; - nptr += ndlen; + lgb = rgb; + nptr += dlen; } count++; ADD_NEW_DATA(-state_offset, count, ncount); @@ -1314,20 +1440,22 @@ for (;;) int ncount = 0; switch (c) { - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; goto ANYNL01; - case 0x000d: - if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; + case CHAR_CR: + if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1; /* Fall through */ ANYNL01: - case 0x000a: + case CHAR_LF: if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS) { active_count--; /* Remove non-match possibility */ @@ -1354,13 +1482,7 @@ for (;;) BOOL OK; switch (c) { - case 0x000a: - case 0x000b: - case 0x000c: - case 0x000d: - case 0x0085: - case 0x2028: - case 0x2029: + VSPACE_CASES: OK = TRUE; break; @@ -1393,25 +1515,7 @@ for (;;) BOOL OK; switch (c) { - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ + HSPACE_CASES: OK = TRUE; break; @@ -1452,6 +1556,7 @@ for (;;) if (clen > 0) { BOOL OK; + const pcre_uint32 *cp; const ucd_record * prop = GET_UCD(c); switch(code[2]) { @@ -1465,7 +1570,7 @@ for (;;) break; case PT_GC: - OK = _pcre_ucp_gentype[prop->chartype] == code[3]; + OK = PRIV(ucp_gentype)[prop->chartype] == code[3]; break; case PT_PC: @@ -1479,27 +1584,42 @@ for (;;) /* These are specials for combination cases. */ case PT_ALNUM: - OK = _pcre_ucp_gentype[prop->chartype] == ucp_L || - _pcre_ucp_gentype[prop->chartype] == ucp_N; + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N; break; case PT_SPACE: /* Perl space */ - OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z || + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; break; case PT_PXSPACE: /* POSIX space */ - OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z || + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || c == CHAR_FF || c == CHAR_CR; break; case PT_WORD: - OK = _pcre_ucp_gentype[prop->chartype] == ucp_L || - _pcre_ucp_gentype[prop->chartype] == ucp_N || + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE; break; + case PT_CLIST: + cp = PRIV(ucd_caseless_sets) + code[3]; + for (;;) + { + if (c < *cp) { OK = FALSE; break; } + if (c == *cp++) { OK = TRUE; break; } + } + break; + + case PT_UCNC: + OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000; + break; + /* Should never occur, but keep compilers from grumbling. */ default: @@ -1535,9 +1655,10 @@ for (;;) QS2: ADD_ACTIVE(state_offset + 2, 0); - if (clen > 0 && UCD_CATEGORY(c) != ucp_M) + if (clen > 0) { - const uschar *nptr = ptr + clen; + int lgb, rgb; + const pcre_uchar *nptr = ptr + clen; int ncount = 0; if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR || codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY) @@ -1545,14 +1666,16 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } + lgb = UCD_GRAPHBREAK(c); while (nptr < end_subject) { - int nd; - int ndlen = 1; - GETCHARLEN(nd, nptr, ndlen); - if (UCD_CATEGORY(nd) != ucp_M) break; + dlen = 1; + if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } + rgb = UCD_GRAPHBREAK(d); + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; ncount++; - nptr += ndlen; + lgb = rgb; + nptr += dlen; } ADD_NEW_DATA(-(state_offset + count), 0, ncount); } @@ -1578,27 +1701,29 @@ for (;;) int ncount = 0; switch (c) { - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; goto ANYNL02; - case 0x000d: - if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; + case CHAR_CR: + if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1; /* Fall through */ ANYNL02: - case 0x000a: + case CHAR_LF: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR || codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY) { active_count--; /* Remove non-match possibility */ next_active_state--; } - ADD_NEW_DATA(-(state_offset + count), 0, ncount); + ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount); break; default: @@ -1626,13 +1751,7 @@ for (;;) BOOL OK; switch (c) { - case 0x000a: - case 0x000b: - case 0x000c: - case 0x000d: - case 0x0085: - case 0x2028: - case 0x2029: + VSPACE_CASES: OK = TRUE; break; @@ -1648,7 +1767,7 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - ADD_NEW_DATA(-(state_offset + count), 0, 0); + ADD_NEW_DATA(-(state_offset + (int)count), 0, 0); } } break; @@ -1672,25 +1791,7 @@ for (;;) BOOL OK; switch (c) { - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ + HSPACE_CASES: OK = TRUE; break; @@ -1707,7 +1808,7 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - ADD_NEW_DATA(-(state_offset + count), 0, 0); + ADD_NEW_DATA(-(state_offset + (int)count), 0, 0); } } break; @@ -1719,13 +1820,14 @@ for (;;) case OP_PROP_EXTRA + OP_TYPEMINUPTO: case OP_PROP_EXTRA + OP_TYPEPOSUPTO: if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT) - { ADD_ACTIVE(state_offset + 6, 0); } + { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); } count = current_state->count; /* Number already matched */ if (clen > 0) { BOOL OK; + const pcre_uint32 *cp; const ucd_record * prop = GET_UCD(c); - switch(code[4]) + switch(code[1 + IMM2_SIZE + 1]) { case PT_ANY: OK = TRUE; @@ -1737,41 +1839,56 @@ for (;;) break; case PT_GC: - OK = _pcre_ucp_gentype[prop->chartype] == code[5]; + OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2]; break; case PT_PC: - OK = prop->chartype == code[5]; + OK = prop->chartype == code[1 + IMM2_SIZE + 2]; break; case PT_SC: - OK = prop->script == code[5]; + OK = prop->script == code[1 + IMM2_SIZE + 2]; break; /* These are specials for combination cases. */ case PT_ALNUM: - OK = _pcre_ucp_gentype[prop->chartype] == ucp_L || - _pcre_ucp_gentype[prop->chartype] == ucp_N; + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N; break; case PT_SPACE: /* Perl space */ - OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z || + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; break; case PT_PXSPACE: /* POSIX space */ - OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z || + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || c == CHAR_FF || c == CHAR_CR; break; case PT_WORD: - OK = _pcre_ucp_gentype[prop->chartype] == ucp_L || - _pcre_ucp_gentype[prop->chartype] == ucp_N || + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE; break; + case PT_CLIST: + cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2]; + for (;;) + { + if (c < *cp) { OK = FALSE; break; } + if (c == *cp++) { OK = TRUE; break; } + } + break; + + case PT_UCNC: + OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000; + break; + /* Should never occur, but keep compilers from grumbling. */ default: @@ -1786,8 +1903,8 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - if (++count >= GET2(code, 1)) - { ADD_NEW(state_offset + 6, 0); } + if (++count >= (int)GET2(code, 1)) + { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); } else { ADD_NEW(state_offset, count); } } @@ -1800,28 +1917,33 @@ for (;;) case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO: case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO: if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT) - { ADD_ACTIVE(state_offset + 4, 0); } + { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } count = current_state->count; /* Number already matched */ - if (clen > 0 && UCD_CATEGORY(c) != ucp_M) + if (clen > 0) { - const uschar *nptr = ptr + clen; + int lgb, rgb; + const pcre_uchar *nptr = ptr + clen; int ncount = 0; if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO) { active_count--; /* Remove non-match possibility */ next_active_state--; } + lgb = UCD_GRAPHBREAK(c); while (nptr < end_subject) { - int nd; - int ndlen = 1; - GETCHARLEN(nd, nptr, ndlen); - if (UCD_CATEGORY(nd) != ucp_M) break; + dlen = 1; + if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } + rgb = UCD_GRAPHBREAK(d); + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; ncount++; - nptr += ndlen; + lgb = rgb; + nptr += dlen; } - if (++count >= GET2(code, 1)) - { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); } + if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0) + reset_could_continue = TRUE; + if (++count >= (int)GET2(code, 1)) + { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); } else { ADD_NEW_DATA(-state_offset, count, ncount); } } @@ -1834,34 +1956,36 @@ for (;;) case OP_ANYNL_EXTRA + OP_TYPEMINUPTO: case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO: if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT) - { ADD_ACTIVE(state_offset + 4, 0); } + { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } count = current_state->count; /* Number already matched */ if (clen > 0) { int ncount = 0; switch (c) { - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; goto ANYNL03; - case 0x000d: - if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; + case CHAR_CR: + if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1; /* Fall through */ ANYNL03: - case 0x000a: + case CHAR_LF: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO) { active_count--; /* Remove non-match possibility */ next_active_state--; } - if (++count >= GET2(code, 1)) - { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); } + if (++count >= (int)GET2(code, 1)) + { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); } else { ADD_NEW_DATA(-state_offset, count, ncount); } break; @@ -1878,20 +2002,14 @@ for (;;) case OP_VSPACE_EXTRA + OP_TYPEMINUPTO: case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO: if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT) - { ADD_ACTIVE(state_offset + 4, 0); } + { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } count = current_state->count; /* Number already matched */ if (clen > 0) { BOOL OK; switch (c) { - case 0x000a: - case 0x000b: - case 0x000c: - case 0x000d: - case 0x0085: - case 0x2028: - case 0x2029: + VSPACE_CASES: OK = TRUE; break; @@ -1906,8 +2024,8 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - if (++count >= GET2(code, 1)) - { ADD_NEW_DATA(-(state_offset + 4), 0, 0); } + if (++count >= (int)GET2(code, 1)) + { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); } else { ADD_NEW_DATA(-state_offset, count, 0); } } @@ -1920,32 +2038,14 @@ for (;;) case OP_HSPACE_EXTRA + OP_TYPEMINUPTO: case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO: if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT) - { ADD_ACTIVE(state_offset + 4, 0); } + { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } count = current_state->count; /* Number already matched */ if (clen > 0) { BOOL OK; switch (c) { - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ + HSPACE_CASES: OK = TRUE; break; @@ -1961,8 +2061,8 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - if (++count >= GET2(code, 1)) - { ADD_NEW_DATA(-(state_offset + 4), 0, 0); } + if (++count >= (int)GET2(code, 1)) + { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); } else { ADD_NEW_DATA(-state_offset, count, 0); } } @@ -1984,32 +2084,32 @@ for (;;) case OP_CHARI: if (clen == 0) break; -#ifdef SUPPORT_UTF8 - if (utf8) +#ifdef SUPPORT_UTF + if (utf) { if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else { unsigned int othercase; - if (c < 128) othercase = fcc[c]; else - - /* If we have Unicode property support, we can use it to test the - other case of the character. */ - + if (c < 128) + othercase = fcc[c]; + else + /* If we have Unicode property support, we can use it to test the + other case of the character. */ #ifdef SUPPORT_UCP - othercase = UCD_OTHERCASE(c); + othercase = UCD_OTHERCASE(c); #else - othercase = NOTACHAR; + othercase = NOTACHAR; #endif if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); } } } else -#endif /* SUPPORT_UTF8 */ - - /* Non-UTF-8 mode */ +#endif /* SUPPORT_UTF */ + /* Not UTF mode */ { - if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); } + if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d)) + { ADD_NEW(state_offset + 2, 0); } } break; @@ -2021,18 +2121,24 @@ for (;;) to wait for them to pass before continuing. */ case OP_EXTUNI: - if (clen > 0 && UCD_CATEGORY(c) != ucp_M) + if (clen > 0) { - const uschar *nptr = ptr + clen; + int lgb, rgb; + const pcre_uchar *nptr = ptr + clen; int ncount = 0; + lgb = UCD_GRAPHBREAK(c); while (nptr < end_subject) { - int nclen = 1; - GETCHARLEN(c, nptr, nclen); - if (UCD_CATEGORY(c) != ucp_M) break; + dlen = 1; + if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } + rgb = UCD_GRAPHBREAK(d); + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; ncount++; - nptr += nclen; + lgb = rgb; + nptr += dlen; } + if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0) + reset_could_continue = TRUE; ADD_NEW_DATA(-(state_offset + 1), 0, ncount); } break; @@ -2046,20 +2152,28 @@ for (;;) case OP_ANYNL: if (clen > 0) switch(c) { - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; - case 0x000a: + case CHAR_LF: ADD_NEW(state_offset + 1, 0); break; - case 0x000d: - if (ptr + 1 < end_subject && ptr[1] == 0x0a) + case CHAR_CR: + if (ptr + 1 >= end_subject) { + ADD_NEW(state_offset + 1, 0); + if ((md->moptions & PCRE_PARTIAL_HARD) != 0) + reset_could_continue = TRUE; + } + else if (RAWUCHARTEST(ptr + 1) == CHAR_LF) + { ADD_NEW_DATA(-(state_offset + 1), 0, 1); } else @@ -2074,13 +2188,7 @@ for (;;) case OP_NOT_VSPACE: if (clen > 0) switch(c) { - case 0x000a: - case 0x000b: - case 0x000c: - case 0x000d: - case 0x0085: - case 0x2028: - case 0x2029: + VSPACE_CASES: break; default: @@ -2093,17 +2201,12 @@ for (;;) case OP_VSPACE: if (clen > 0) switch(c) { - case 0x000a: - case 0x000b: - case 0x000c: - case 0x000d: - case 0x0085: - case 0x2028: - case 0x2029: + VSPACE_CASES: ADD_NEW(state_offset + 1, 0); break; - default: break; + default: + break; } break; @@ -2111,25 +2214,7 @@ for (;;) case OP_NOT_HSPACE: if (clen > 0) switch(c) { - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ + HSPACE_CASES: break; default: @@ -2142,47 +2227,42 @@ for (;;) case OP_HSPACE: if (clen > 0) switch(c) { - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ + HSPACE_CASES: ADD_NEW(state_offset + 1, 0); break; + + default: + break; } break; /*-----------------------------------------------------------------*/ - /* Match a negated single character casefully. This is only used for - one-byte characters, that is, we know that d < 256. The character we are - checking (c) can be multibyte. */ + /* Match a negated single character casefully. */ case OP_NOT: if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); } break; /*-----------------------------------------------------------------*/ - /* Match a negated single character caselessly. This is only used for - one-byte characters, that is, we know that d < 256. The character we are - checking (c) can be multibyte. */ + /* Match a negated single character caselessly. */ case OP_NOTI: - if (clen > 0 && c != d && c != fcc[d]) - { ADD_NEW(state_offset + dlen + 1, 0); } + if (clen > 0) + { + unsigned int otherd; +#ifdef SUPPORT_UTF + if (utf && d >= 128) + { +#ifdef SUPPORT_UCP + otherd = UCD_OTHERCASE(d); +#endif /* SUPPORT_UCP */ + } + else +#endif /* SUPPORT_UTF */ + otherd = TABLE_GET(d, fcc, d); + if (c != d && c != otherd) + { ADD_NEW(state_offset + dlen + 1, 0); } + } break; /*-----------------------------------------------------------------*/ @@ -2206,19 +2286,19 @@ for (;;) if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); } if (clen > 0) { - unsigned int otherd = NOTACHAR; + pcre_uint32 otherd = NOTACHAR; if (caseless) { -#ifdef SUPPORT_UTF8 - if (utf8 && d >= 128) +#ifdef SUPPORT_UTF + if (utf && d >= 128) { #ifdef SUPPORT_UCP otherd = UCD_OTHERCASE(d); #endif /* SUPPORT_UCP */ } else -#endif /* SUPPORT_UTF8 */ - otherd = fcc[d]; +#endif /* SUPPORT_UTF */ + otherd = TABLE_GET(d, fcc, d); } if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) { @@ -2253,19 +2333,19 @@ for (;;) ADD_ACTIVE(state_offset + dlen + 1, 0); if (clen > 0) { - unsigned int otherd = NOTACHAR; + pcre_uint32 otherd = NOTACHAR; if (caseless) { -#ifdef SUPPORT_UTF8 - if (utf8 && d >= 128) +#ifdef SUPPORT_UTF + if (utf && d >= 128) { #ifdef SUPPORT_UCP otherd = UCD_OTHERCASE(d); #endif /* SUPPORT_UCP */ } else -#endif /* SUPPORT_UTF8 */ - otherd = fcc[d]; +#endif /* SUPPORT_UTF */ + otherd = TABLE_GET(d, fcc, d); } if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) { @@ -2298,19 +2378,19 @@ for (;;) ADD_ACTIVE(state_offset + dlen + 1, 0); if (clen > 0) { - unsigned int otherd = NOTACHAR; + pcre_uint32 otherd = NOTACHAR; if (caseless) { -#ifdef SUPPORT_UTF8 - if (utf8 && d >= 128) +#ifdef SUPPORT_UTF + if (utf && d >= 128) { #ifdef SUPPORT_UCP otherd = UCD_OTHERCASE(d); #endif /* SUPPORT_UCP */ } else -#endif /* SUPPORT_UTF8 */ - otherd = fcc[d]; +#endif /* SUPPORT_UTF */ + otherd = TABLE_GET(d, fcc, d); } if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) { @@ -2335,24 +2415,24 @@ for (;;) count = current_state->count; /* Number already matched */ if (clen > 0) { - unsigned int otherd = NOTACHAR; + pcre_uint32 otherd = NOTACHAR; if (caseless) { -#ifdef SUPPORT_UTF8 - if (utf8 && d >= 128) +#ifdef SUPPORT_UTF + if (utf && d >= 128) { #ifdef SUPPORT_UCP otherd = UCD_OTHERCASE(d); #endif /* SUPPORT_UCP */ } else -#endif /* SUPPORT_UTF8 */ - otherd = fcc[d]; +#endif /* SUPPORT_UTF */ + otherd = TABLE_GET(d, fcc, d); } if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) { - if (++count >= GET2(code, 1)) - { ADD_NEW(state_offset + dlen + 3, 0); } + if (++count >= (int)GET2(code, 1)) + { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); } else { ADD_NEW(state_offset, count); } } @@ -2375,23 +2455,23 @@ for (;;) case OP_NOTUPTO: case OP_NOTMINUPTO: case OP_NOTPOSUPTO: - ADD_ACTIVE(state_offset + dlen + 3, 0); + ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0); count = current_state->count; /* Number already matched */ if (clen > 0) { - unsigned int otherd = NOTACHAR; + pcre_uint32 otherd = NOTACHAR; if (caseless) { -#ifdef SUPPORT_UTF8 - if (utf8 && d >= 128) +#ifdef SUPPORT_UTF + if (utf && d >= 128) { #ifdef SUPPORT_UCP otherd = UCD_OTHERCASE(d); #endif /* SUPPORT_UCP */ } else -#endif /* SUPPORT_UTF8 */ - otherd = fcc[d]; +#endif /* SUPPORT_UTF */ + otherd = TABLE_GET(d, fcc, d); } if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) { @@ -2400,8 +2480,8 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - if (++count >= GET2(code, 1)) - { ADD_NEW(state_offset + dlen + 3, 0); } + if (++count >= (int)GET2(code, 1)) + { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); } else { ADD_NEW(state_offset, count); } } @@ -2418,18 +2498,18 @@ for (;;) { BOOL isinclass = FALSE; int next_state_offset; - const uschar *ecode; + const pcre_uchar *ecode; /* For a simple class, there is always just a 32-byte table, and we can set isinclass from it. */ if (codevalue != OP_XCLASS) { - ecode = code + 33; + ecode = code + 1 + (32 / sizeof(pcre_uchar)); if (clen > 0) { isinclass = (c > 255)? (codevalue == OP_NCLASS) : - ((code[1 + c/8] & (1 << (c&7))) != 0); + ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0); } } @@ -2440,7 +2520,7 @@ for (;;) else { ecode = code + GET(code, 1); - if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE); + if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf); } /* At this point, isinclass is set for all kinds of class, and ecode @@ -2473,13 +2553,13 @@ for (;;) case OP_CRRANGE: case OP_CRMINRANGE: count = current_state->count; /* Already matched */ - if (count >= GET2(ecode, 1)) - { ADD_ACTIVE(next_state_offset + 5, 0); } + if (count >= (int)GET2(ecode, 1)) + { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); } if (isinclass) { - int max = GET2(ecode, 3); + int max = (int)GET2(ecode, 1 + IMM2_SIZE); if (++count >= max && max != 0) /* Max 0 => no limit */ - { ADD_NEW(next_state_offset + 5, 0); } + { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); } else { ADD_NEW(state_offset, count); } } @@ -2510,7 +2590,7 @@ for (;;) int rc; int local_offsets[2]; int local_workspace[1000]; - const uschar *endasscode = code + GET(code, 1); + const pcre_uchar *endasscode = code + GET(code, 1); while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); @@ -2547,13 +2627,19 @@ for (;;) if (code[LINK_SIZE+1] == OP_CALLOUT) { rrc = 0; - if (pcre_callout != NULL) + if (PUBL(callout) != NULL) { - pcre_callout_block cb; + PUBL(callout_block) cb; cb.version = 1; /* Version 1 of the callout block */ cb.callout_number = code[LINK_SIZE+2]; cb.offset_vector = offsets; +#if defined COMPILE_PCRE8 cb.subject = (PCRE_SPTR)start_subject; +#elif defined COMPILE_PCRE16 + cb.subject = (PCRE_SPTR16)start_subject; +#elif defined COMPILE_PCRE32 + cb.subject = (PCRE_SPTR32)start_subject; +#endif cb.subject_length = (int)(end_subject - start_subject); cb.start_match = (int)(current_subject - start_subject); cb.current_position = (int)(ptr - start_subject); @@ -2563,10 +2649,10 @@ for (;;) cb.capture_last = -1; cb.callout_data = md->callout_data; cb.mark = NULL; /* No (*MARK) support */ - if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */ + if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */ } if (rrc > 0) break; /* Fail this thread */ - code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */ + code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */ } condcode = code[LINK_SIZE+1]; @@ -2587,10 +2673,10 @@ for (;;) else if (condcode == OP_RREF || condcode == OP_NRREF) { - int value = GET2(code, LINK_SIZE+2); + int value = GET2(code, LINK_SIZE + 2); if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND; if (md->recursive != NULL) - { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); } + { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); } else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } } @@ -2599,8 +2685,8 @@ for (;;) else { int rc; - const uschar *asscode = code + LINK_SIZE + 1; - const uschar *endasscode = asscode + GET(asscode, 1); + const pcre_uchar *asscode = code + LINK_SIZE + 1; + const pcre_uchar *endasscode = asscode + GET(asscode, 1); while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); @@ -2631,7 +2717,7 @@ for (;;) dfa_recursion_info *ri; int local_offsets[1000]; int local_workspace[1000]; - const uschar *callpat = start_code + GET(code, 1); + const pcre_uchar *callpat = start_code + GET(code, 1); int recno = (callpat == md->start_code)? 0 : GET2(callpat, 1 + LINK_SIZE); int rc; @@ -2682,10 +2768,15 @@ for (;;) { for (rc = rc*2 - 2; rc >= 0; rc -= 2) { - const uschar *p = start_subject + local_offsets[rc]; - const uschar *pp = start_subject + local_offsets[rc+1]; int charcount = local_offsets[rc+1] - local_offsets[rc]; - while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--; +#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 + if (utf) + { + const pcre_uchar *p = start_subject + local_offsets[rc]; + const pcre_uchar *pp = start_subject + local_offsets[rc+1]; + while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; + } +#endif if (charcount > 0) { ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1)); @@ -2708,7 +2799,7 @@ for (;;) case OP_BRAPOSZERO: { int charcount, matched_count; - const uschar *local_ptr = ptr; + const pcre_uchar *local_ptr = ptr; BOOL allow_zero; if (codevalue == OP_BRAPOSZERO) @@ -2758,7 +2849,7 @@ for (;;) if (matched_count > 0 || allow_zero) { - const uschar *end_subpattern = code; + const pcre_uchar *end_subpattern = code; int next_state_offset; do { end_subpattern += GET(end_subpattern, 1); } @@ -2779,10 +2870,12 @@ for (;;) } else { - const uschar *p = ptr; - const uschar *pp = local_ptr; + const pcre_uchar *p = ptr; + const pcre_uchar *pp = local_ptr; charcount = (int)(pp - p); - while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--; +#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 + if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; +#endif ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1)); } } @@ -2809,7 +2902,7 @@ for (;;) if (rc >= 0) { - const uschar *end_subpattern = code; + const pcre_uchar *end_subpattern = code; int charcount = local_offsets[1] - local_offsets[0]; int next_state_offset, repeat_state_offset; @@ -2862,9 +2955,14 @@ for (;;) } else { - const uschar *p = start_subject + local_offsets[0]; - const uschar *pp = start_subject + local_offsets[1]; - while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--; +#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 + if (utf) + { + const pcre_uchar *p = start_subject + local_offsets[0]; + const pcre_uchar *pp = start_subject + local_offsets[1]; + while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; + } +#endif ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1)); if (repeat_state_offset >= 0) { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); } @@ -2880,13 +2978,19 @@ for (;;) case OP_CALLOUT: rrc = 0; - if (pcre_callout != NULL) + if (PUBL(callout) != NULL) { - pcre_callout_block cb; + PUBL(callout_block) cb; cb.version = 1; /* Version 1 of the callout block */ cb.callout_number = code[1]; cb.offset_vector = offsets; +#if defined COMPILE_PCRE8 cb.subject = (PCRE_SPTR)start_subject; +#elif defined COMPILE_PCRE16 + cb.subject = (PCRE_SPTR16)start_subject; +#elif defined COMPILE_PCRE32 + cb.subject = (PCRE_SPTR32)start_subject; +#endif cb.subject_length = (int)(end_subject - start_subject); cb.start_match = (int)(current_subject - start_subject); cb.current_position = (int)(ptr - start_subject); @@ -2896,10 +3000,10 @@ for (;;) cb.capture_last = -1; cb.callout_data = md->callout_data; cb.mark = NULL; /* No (*MARK) support */ - if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */ + if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */ } if (rrc == 0) - { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); } + { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); } break; @@ -2928,7 +3032,7 @@ for (;;) if (new_count <= 0) { if (rlevel == 1 && /* Top level, and */ - could_continue && /* Some could go on */ + could_continue && /* Some could go on, and */ forced_fail != workspace[1] && /* Not all forced fail & */ ( /* either... */ (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */ @@ -2936,17 +3040,14 @@ for (;;) ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */ match_count < 0) /* no matches */ ) && /* And... */ - ptr >= end_subject && /* Reached end of subject */ - ptr > md->start_used_ptr) /* Inspected non-empty string */ - { - if (offsetcount >= 2) - { - offsets[0] = (int)(md->start_used_ptr - start_subject); - offsets[1] = (int)(end_subject - start_subject); - } + ( + partial_newline || /* Either partial NL */ + ( /* or ... */ + ptr >= end_subject && /* End of subject and */ + ptr > md->start_used_ptr) /* Inspected non-empty string */ + ) + ) match_count = PCRE_ERROR_PARTIAL; - } - DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count, rlevel*2-2, SP)); @@ -2996,28 +3097,38 @@ Returns: > 0 => number of match offset pairs < -1 => some kind of unexpected problem */ +#if defined COMPILE_PCRE8 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data, const char *subject, int length, int start_offset, int options, int *offsets, int offsetcount, int *workspace, int wscount) +#elif defined COMPILE_PCRE16 +PCRE_EXP_DEFN int PCRE_CALL_CONVENTION +pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data, + PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets, + int offsetcount, int *workspace, int wscount) +#elif defined COMPILE_PCRE32 +PCRE_EXP_DEFN int PCRE_CALL_CONVENTION +pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data, + PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets, + int offsetcount, int *workspace, int wscount) +#endif { -real_pcre *re = (real_pcre *)argument_re; +REAL_PCRE *re = (REAL_PCRE *)argument_re; dfa_match_data match_block; dfa_match_data *md = &match_block; -BOOL utf8, anchored, startline, firstline; -const uschar *current_subject, *end_subject, *lcc; - -pcre_study_data internal_study; +BOOL utf, anchored, startline, firstline; +const pcre_uchar *current_subject, *end_subject; const pcre_study_data *study = NULL; -real_pcre internal_re; -const uschar *req_byte_ptr; -const uschar *start_bits = NULL; -BOOL first_byte_caseless = FALSE; -BOOL req_byte_caseless = FALSE; -int first_byte = -1; -int req_byte = -1; -int req_byte2 = -1; +const pcre_uchar *req_char_ptr; +const pcre_uint8 *start_bits = NULL; +BOOL has_first_char = FALSE; +BOOL has_req_char = FALSE; +pcre_uchar first_char = 0; +pcre_uchar first_char2 = 0; +pcre_uchar req_char = 0; +pcre_uchar req_char2 = 0; int newline; /* Plausibility checks */ @@ -3027,13 +3138,31 @@ if (re == NULL || subject == NULL || workspace == NULL (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE; +if (length < 0) return PCRE_ERROR_BADLENGTH; if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET; -/* We need to find the pointer to any study data before we test for byte -flipping, so we scan the extra_data block first. This may set two fields in the -match block, so we must initialize them beforehand. However, the other fields -in the match block must not be set until after the byte flipping. */ +/* Check that the first field in the block is the magic number. If it is not, +return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to +REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which +means that the pattern is likely compiled with different endianness. */ +if (re->magic_number != MAGIC_NUMBER) + return re->magic_number == REVERSED_MAGIC_NUMBER? + PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC; +if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE; + +/* If restarting after a partial match, do some sanity checks on the contents +of the workspace. */ + +if ((options & PCRE_DFA_RESTART) != 0) + { + if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 || + workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK) + return PCRE_ERROR_DFA_BADRESTART; + } + +/* Set up study, callout, and table data */ + md->tables = re->tables; md->callout_data = NULL; @@ -3051,28 +3180,17 @@ if (extra_data != NULL) md->tables = extra_data->tables; } -/* Check that the first field in the block is the magic number. If it is not, -test for a regex that was compiled on a host of opposite endianness. If this is -the case, flipped values are put in internal_re and internal_study if there was -study data too. */ - -if (re->magic_number != MAGIC_NUMBER) - { - re = _pcre_try_flipped(re, &internal_re, study, &internal_study); - if (re == NULL) return PCRE_ERROR_BADMAGIC; - if (study != NULL) study = &internal_study; - } - /* Set some local values */ -current_subject = (const unsigned char *)subject + start_offset; -end_subject = (const unsigned char *)subject + length; -req_byte_ptr = current_subject - 1; +current_subject = (const pcre_uchar *)subject + start_offset; +end_subject = (const pcre_uchar *)subject + length; +req_char_ptr = current_subject - 1; -#ifdef SUPPORT_UTF8 -utf8 = (re->options & PCRE_UTF8) != 0; +#ifdef SUPPORT_UTF +/* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */ +utf = (re->options & PCRE_UTF8) != 0; #else -utf8 = FALSE; +utf = FALSE; #endif anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 || @@ -3080,9 +3198,9 @@ anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART) /* The remaining fixed data for passing around. */ -md->start_code = (const uschar *)argument_re + +md->start_code = (const pcre_uchar *)argument_re + re->name_table_offset + re->name_count * re->name_entry_size; -md->start_subject = (const unsigned char *)subject; +md->start_subject = (const pcre_uchar *)subject; md->end_subject = end_subject; md->start_offset = start_offset; md->moptions = options; @@ -3143,11 +3261,11 @@ else /* Check a UTF-8 string if required. Unfortunately there's no way of passing back the character offset. */ -#ifdef SUPPORT_UTF8 -if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) +#ifdef SUPPORT_UTF +if (utf && (options & PCRE_NO_UTF8_CHECK) == 0) { int erroroffset; - int errorcode = _pcre_valid_utf8((uschar *)subject, length, &erroroffset); + int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset); if (errorcode != 0) { if (offsetcount >= 2) @@ -3155,12 +3273,21 @@ if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) offsets[0] = erroroffset; offsets[1] = errorcode; } - return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)? +#if defined COMPILE_PCRE8 + return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ? PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8; +#elif defined COMPILE_PCRE16 + return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ? + PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16; +#elif defined COMPILE_PCRE32 + return PCRE_ERROR_BADUTF32; +#endif } +#if defined COMPILE_PCRE8 || defined COMPILE_PCRE16 if (start_offset > 0 && start_offset < length && - (((USPTR)subject)[start_offset] & 0xc0) == 0x80) + NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset])) return PCRE_ERROR_BADUTF8_OFFSET; +#endif } #endif @@ -3168,12 +3295,11 @@ if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) is a feature that makes it possible to save compiled regex and re-use them in other programs later. */ -if (md->tables == NULL) md->tables = _pcre_default_tables; +if (md->tables == NULL) md->tables = PRIV(default_tables); -/* The lower casing table and the "must be at the start of a line" flag are -used in a loop when finding where to start. */ +/* The "must be at the start of a line" flags are used in a loop when finding +where to start. */ -lcc = md->tables + lcc_offset; startline = (re->flags & PCRE_STARTLINE) != 0; firstline = (re->options & PCRE_FIRSTLINE) != 0; @@ -3187,9 +3313,16 @@ if (!anchored) { if ((re->flags & PCRE_FIRSTSET) != 0) { - first_byte = re->first_byte & 255; - if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE) - first_byte = lcc[first_byte]; + has_first_char = TRUE; + first_char = first_char2 = (pcre_uchar)(re->first_char); + if ((re->flags & PCRE_FCH_CASELESS) != 0) + { + first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char); +#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) + if (utf && first_char > 127) + first_char2 = UCD_OTHERCASE(first_char); +#endif + } } else { @@ -3204,9 +3337,16 @@ character" set. */ if ((re->flags & PCRE_REQCHSET) != 0) { - req_byte = re->req_byte & 255; - req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; - req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */ + has_req_char = TRUE; + req_char = req_char2 = (pcre_uchar)(re->req_char); + if ((re->flags & PCRE_RCH_CASELESS) != 0) + { + req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char); +#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) + if (utf && req_char > 127) + req_char2 = UCD_OTHERCASE(req_char); +#endif + } } /* Call the main matching function, looping for a non-anchored regex after a @@ -3219,7 +3359,7 @@ for (;;) if ((options & PCRE_DFA_RESTART) == 0) { - const uschar *save_end_subject = end_subject; + const pcre_uchar *save_end_subject = end_subject; /* If firstline is TRUE, the start of the match is constrained to the first line of a multiline string. Implement this by temporarily adjusting @@ -3228,14 +3368,14 @@ for (;;) if (firstline) { - USPTR t = current_subject; -#ifdef SUPPORT_UTF8 - if (utf8) + PCRE_PUCHAR t = current_subject; +#ifdef SUPPORT_UTF + if (utf) { while (t < md->end_subject && !IS_NEWLINE(t)) { t++; - while (t < end_subject && (*t & 0xc0) == 0x80) t++; + ACROSSCHAR(t < end_subject, *t, t++); } } else @@ -3252,17 +3392,20 @@ for (;;) if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0) { - /* Advance to a known first byte. */ + /* Advance to a known first char. */ - if (first_byte >= 0) + if (has_first_char) { - if (first_byte_caseless) + if (first_char != first_char2) + { + pcre_uchar csc; while (current_subject < end_subject && - lcc[*current_subject] != first_byte) + (csc = RAWUCHARTEST(current_subject)) != first_char && csc != first_char2) current_subject++; + } else while (current_subject < end_subject && - *current_subject != first_byte) + RAWUCHARTEST(current_subject) != first_char) current_subject++; } @@ -3272,16 +3415,15 @@ for (;;) { if (current_subject > md->start_subject + start_offset) { -#ifdef SUPPORT_UTF8 - if (utf8) +#ifdef SUPPORT_UTF + if (utf) { while (current_subject < end_subject && !WAS_NEWLINE(current_subject)) { current_subject++; - while(current_subject < end_subject && - (*current_subject & 0xc0) == 0x80) - current_subject++; + ACROSSCHAR(current_subject < end_subject, *current_subject, + current_subject++); } } else @@ -3293,10 +3435,10 @@ for (;;) ANYCRLF, and we are now at a LF, advance the match position by one more character. */ - if (current_subject[-1] == CHAR_CR && + if (RAWUCHARTEST(current_subject - 1) == CHAR_CR && (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && current_subject < end_subject && - *current_subject == CHAR_NL) + RAWUCHARTEST(current_subject) == CHAR_NL) current_subject++; } } @@ -3307,14 +3449,19 @@ for (;;) { while (current_subject < end_subject) { - register unsigned int c = *current_subject; + register pcre_uint32 c = RAWUCHARTEST(current_subject); +#ifndef COMPILE_PCRE8 + if (c > 255) c = 255; +#endif if ((start_bits[c/8] & (1 << (c&7))) == 0) { current_subject++; -#ifdef SUPPORT_UTF8 - if (utf8) - while(current_subject < end_subject && - (*current_subject & 0xc0) == 0x80) current_subject++; +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 + /* In non 8-bit mode, the iteration will stop for + characters > 255 at the beginning or not stop at all. */ + if (utf) + ACROSSCHAR(current_subject < end_subject, *current_subject, + current_subject++); #endif } else break; @@ -3342,8 +3489,8 @@ for (;;) (pcre_uint32)(end_subject - current_subject) < study->minlength) return PCRE_ERROR_NOMATCH; - /* If req_byte is set, we know that that character must appear in the - subject for the match to succeed. If the first character is set, req_byte + /* If req_char is set, we know that that character must appear in the + subject for the match to succeed. If the first character is set, req_char must be later in the subject; otherwise the test starts at the match point. This optimization can save a huge amount of work in patterns with nested unlimited repeats that aren't going to match. Writing separate @@ -3355,28 +3502,28 @@ for (;;) patterns. This showed up when somebody was matching /^C/ on a 32-megabyte string... so we don't do this when the string is sufficiently long. */ - if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX) + if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX) { - register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0); + register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0); /* We don't need to repeat the search if we haven't yet reached the place we found it at last time. */ - if (p > req_byte_ptr) + if (p > req_char_ptr) { - if (req_byte_caseless) + if (req_char != req_char2) { while (p < end_subject) { - register int pp = *p++; - if (pp == req_byte || pp == req_byte2) { p--; break; } + register pcre_uint32 pp = RAWUCHARINCTEST(p); + if (pp == req_char || pp == req_char2) { p--; break; } } } else { while (p < end_subject) { - if (*p++ == req_byte) { p--; break; } + if (RAWUCHARINCTEST(p) == req_char) { p--; break; } } } @@ -3389,7 +3536,7 @@ for (;;) found it, so that we don't search again next time round the loop if the start hasn't passed this character yet. */ - req_byte_ptr = p; + req_char_ptr = p; } } } @@ -3414,27 +3561,39 @@ for (;;) /* Anything other than "no match" means we are done, always; otherwise, carry on only if not anchored. */ - if (rc != PCRE_ERROR_NOMATCH || anchored) return rc; + if (rc != PCRE_ERROR_NOMATCH || anchored) + { + if (rc == PCRE_ERROR_PARTIAL && offsetcount >= 2) + { + offsets[0] = (int)(md->start_used_ptr - (PCRE_PUCHAR)subject); + offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject); + if (offsetcount > 2) + offsets[2] = (int)(current_subject - (PCRE_PUCHAR)subject); + } + return rc; + } /* Advance to the next subject character unless we are at the end of a line and firstline is set. */ if (firstline && IS_NEWLINE(current_subject)) break; current_subject++; - if (utf8) +#ifdef SUPPORT_UTF + if (utf) { - while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80) - current_subject++; + ACROSSCHAR(current_subject < end_subject, *current_subject, + current_subject++); } +#endif if (current_subject > end_subject) break; /* If we have just passed a CR and we are now at a LF, and the pattern does not contain any explicit matches for \r or \n, and the newline option is CRLF or ANY or ANYCRLF, advance the match position by one more character. */ - if (current_subject[-1] == CHAR_CR && + if (RAWUCHARTEST(current_subject - 1) == CHAR_CR && current_subject < end_subject && - *current_subject == CHAR_NL && + RAWUCHARTEST(current_subject) == CHAR_NL && (re->flags & PCRE_HASCRORLF) == 0 && (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF ||