--- embedaddon/pcre/pcre_exec.c 2012/10/09 09:19:17 1.1.1.3 +++ embedaddon/pcre/pcre_exec.c 2013/07/22 08:25:56 1.1.1.4 @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2012 University of Cambridge + Copyright (c) 1997-2013 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -56,6 +56,20 @@ possible. There are also some static supporting functi #undef min #undef max +/* The md->capture_last field uses the lower 16 bits for the last captured +substring (which can never be greater than 65535) and a bit in the top half +to mean "capture vector overflowed". This odd way of doing things was +implemented when it was realized that preserving and restoring the overflow bit +whenever the last capture number was saved/restored made for a neater +interface, and doing it this way saved on (a) another variable, which would +have increased the stack frame size (a big NO-NO in PCRE) and (b) another +separate set of save/restore instructions. The following defines are used in +implementing this. */ + +#define CAPLMASK 0x0000ffff /* The bits used for last_capture */ +#define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */ +#define OVFLBIT 0x00010000 /* The bit that is set for overflow */ + /* Values for setting in md->match_function_type to indicate two special types of call to match(). We do it this way to save on using another stack variable, as stack usage is to be discouraged. */ @@ -73,13 +87,17 @@ defined PCRE_ERROR_xxx codes, which are all negative. negative to avoid the external error codes. */ #define MATCH_ACCEPT (-999) -#define MATCH_COMMIT (-998) -#define MATCH_KETRPOS (-997) -#define MATCH_ONCE (-996) +#define MATCH_KETRPOS (-998) +#define MATCH_ONCE (-997) +/* The next 5 must be kept together and in sequence so that a test that checks +for any one of them can use a range. */ +#define MATCH_COMMIT (-996) #define MATCH_PRUNE (-995) #define MATCH_SKIP (-994) #define MATCH_SKIP_ARG (-993) #define MATCH_THEN (-992) +#define MATCH_BACKTRACK_MAX MATCH_THEN +#define MATCH_BACKTRACK_MIN MATCH_COMMIT /* Maximum number of ints of offset to save on the stack for recursive calls. If the offset vector is bigger, malloc is used. This should be a multiple of 3, @@ -92,8 +110,6 @@ because the offset vector is always a multiple of 3 lo static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; - - #ifdef PCRE_DEBUG /************************************************* * Debugging function to print chars * @@ -114,10 +130,11 @@ Returns: nothing static void pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md) { -unsigned int c; +pcre_uint32 c; +BOOL utf = md->utf; if (is_subject && length > md->end_subject - p) length = md->end_subject - p; while (length-- > 0) - if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c); + if (isprint(c = RAWUCHARINCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c); } #endif @@ -150,6 +167,9 @@ match_ref(int offset, register PCRE_PUCHAR eptr, int l { PCRE_PUCHAR eptr_start = eptr; register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset]; +#ifdef SUPPORT_UTF +BOOL utf = md->utf; +#endif #ifdef PCRE_DEBUG if (eptr >= md->end_subject) @@ -177,24 +197,35 @@ if (caseless) { #ifdef SUPPORT_UTF #ifdef SUPPORT_UCP - if (md->utf) + if (utf) { /* Match characters up to the end of the reference. NOTE: the number of - bytes matched may differ, because there are some characters whose upper and - lower case versions code as different numbers of bytes. For example, U+023A - (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8); - a sequence of 3 of the former uses 6 bytes, as does a sequence of two of - the latter. It is important, therefore, to check the length along the - reference, not along the subject (earlier code did this wrong). */ + data units matched may differ, because in UTF-8 there are some characters + whose upper and lower case versions code have different numbers of bytes. + For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 + (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a + sequence of two of the latter. It is important, therefore, to check the + length along the reference, not along the subject (earlier code did this + wrong). */ PCRE_PUCHAR endptr = p + length; while (p < endptr) { - int c, d; + pcre_uint32 c, d; + const ucd_record *ur; if (eptr >= md->end_subject) return -2; /* Partial match */ GETCHARINC(c, eptr); GETCHARINC(d, p); - if (c != d && c != UCD_OTHERCASE(d)) return -1; + ur = GET_UCD(d); + if (c != d && c != d + ur->other_case) + { + const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset; + for (;;) + { + if (c < *pp) return -1; + if (c == *pp++) break; + } + } } } else @@ -206,8 +237,11 @@ if (caseless) { while (length-- > 0) { + pcre_uint32 cc, cp; if (eptr >= md->end_subject) return -2; /* Partial match */ - if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1; + cc = RAWUCHARTEST(eptr); + cp = RAWUCHARTEST(p); + if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1; p++; eptr++; } @@ -222,7 +256,7 @@ else while (length-- > 0) { if (eptr >= md->end_subject) return -2; /* Partial match */ - if (*p++ != *eptr++) return -1; + if (RAWUCHARINCTEST(p) != RAWUCHARINCTEST(eptr)) return -1; } } @@ -278,7 +312,7 @@ enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40, RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50, RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60, - RM61, RM62, RM63, RM64, RM65, RM66 }; + RM61, RM62, RM63, RM64, RM65, RM66, RM67, RM68 }; /* These versions of the macros use the stack, as normal. There are debugging versions and production versions. Note that the "rw" argument of RMATCH isn't @@ -296,7 +330,7 @@ actually used in this definition. */ } #define RRETURN(ra) \ { \ - printf("match() returned %d from line %d ", ra, __LINE__); \ + printf("match() returned %d from line %d\n", ra, __LINE__); \ return ra; \ } #else @@ -387,7 +421,7 @@ typedef struct heapframe { #ifdef SUPPORT_UCP int Xprop_type; - int Xprop_value; + unsigned int Xprop_value; int Xprop_fail_result; int Xoclength; pcre_uchar Xocchars[6]; @@ -400,10 +434,10 @@ typedef struct heapframe { int Xlength; int Xmax; int Xmin; - int Xnumber; + unsigned int Xnumber; int Xoffset; - int Xop; - int Xsave_capture_last; + unsigned int Xop; + pcre_int32 Xsave_capture_last; int Xsave_offset1, Xsave_offset2, Xsave_offset3; int Xstacksave[REC_STACK_SAVE_MAX]; @@ -488,7 +522,7 @@ so they can be ordinary variables in all cases. Mark s register int rrc; /* Returns from recursive calls */ register int i; /* Used for loops not involving calls to RMATCH() */ -register unsigned int c; /* Character values not kept over RMATCH() calls */ +register pcre_uint32 c; /* Character values not kept over RMATCH() calls */ register BOOL utf; /* Local copy of UTF flag for speed */ BOOL minimize, possessive; /* Quantifier options */ @@ -605,7 +639,7 @@ BOOL prev_is_word; #ifdef SUPPORT_UCP int prop_type; -int prop_value; +unsigned int prop_value; int prop_fail_result; int oclength; pcre_uchar occhars[6]; @@ -616,10 +650,10 @@ int ctype; int length; int max; int min; -int number; +unsigned int number; int offset; -int op; -int save_capture_last; +unsigned int op; +pcre_int32 save_capture_last; int save_offset1, save_offset2, save_offset3; int stacksave[REC_STACK_SAVE_MAX]; @@ -737,7 +771,7 @@ for (;;) unaltered. */ else if (rrc == MATCH_SKIP_ARG && - STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0) + STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0) { md->start_match_ptr = eptr; RRETURN(MATCH_SKIP); @@ -747,23 +781,16 @@ for (;;) case OP_FAIL: RRETURN(MATCH_NOMATCH); - /* COMMIT overrides PRUNE, SKIP, and THEN */ - case OP_COMMIT: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, RM52); - if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && - rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG && - rrc != MATCH_THEN) - RRETURN(rrc); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); RRETURN(MATCH_COMMIT); - /* PRUNE overrides THEN */ - case OP_PRUNE: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, RM51); - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); RRETURN(MATCH_PRUNE); case OP_PRUNE_ARG: @@ -773,38 +800,39 @@ for (;;) eptrb, RM56); if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) && md->mark == NULL) md->mark = ecode + 2; - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); RRETURN(MATCH_PRUNE); - /* SKIP overrides PRUNE and THEN */ - case OP_SKIP: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, RM53); - if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN) - RRETURN(rrc); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); md->start_match_ptr = eptr; /* Pass back current position */ RRETURN(MATCH_SKIP); /* Note that, for Perl compatibility, SKIP with an argument does NOT set - nomatch_mark. There is a flag that disables this opcode when re-matching a - pattern that ended with a SKIP for which there was not a matching MARK. */ + nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was + not a matching mark, we have to re-run the match, ignoring the SKIP_ARG + that failed and any that precede it (either they also failed, or were not + triggered). To do this, we maintain a count of executed SKIP_ARGs. If a + SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg + set to the count of the one that failed. */ case OP_SKIP_ARG: - if (md->ignore_skip_arg) + md->skip_arg_count++; + if (md->skip_arg_count <= md->ignore_skip_arg) { ecode += PRIV(OP_lengths)[*ecode] + ecode[1]; break; } RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md, eptrb, RM57); - if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN) - RRETURN(rrc); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); /* Pass back the current skip name by overloading md->start_match_ptr and returning the special MATCH_SKIP_ARG return code. This will either be caught by a matching MARK, or get to the top, where it causes a rematch - with the md->ignore_skip_arg flag set. */ + with md->ignore_skip_arg set to the value of md->skip_arg_count. */ md->start_match_ptr = ecode + 2; RRETURN(MATCH_SKIP_ARG); @@ -1050,6 +1078,7 @@ for (;;) /* In all other cases, we have to make another call to match(). */ save_mark = md->mark; + save_capture_last = md->capture_last; RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, RM2); @@ -1081,6 +1110,7 @@ for (;;) ecode += GET(ecode, 1); md->mark = save_mark; if (*ecode != OP_ALT) break; + md->capture_last = save_capture_last; } RRETURN(MATCH_NOMATCH); @@ -1202,6 +1232,7 @@ for (;;) POSSESSIVE_NON_CAPTURE: matched_once = FALSE; code_offset = (int)(ecode - md->start_code); + save_capture_last = md->capture_last; for (;;) { @@ -1231,6 +1262,7 @@ for (;;) if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode += GET(ecode, 1); if (*ecode != OP_ALT) break; + md->capture_last = save_capture_last; } if (matched_once || allow_zero) @@ -1262,10 +1294,12 @@ for (;;) cb.version = 2; /* Version 1 of the callout block */ cb.callout_number = ecode[LINK_SIZE+2]; cb.offset_vector = md->offset_vector; -#ifdef COMPILE_PCRE8 +#if defined COMPILE_PCRE8 cb.subject = (PCRE_SPTR)md->start_subject; -#else +#elif defined COMPILE_PCRE16 cb.subject = (PCRE_SPTR16)md->start_subject; +#elif defined COMPILE_PCRE32 + cb.subject = (PCRE_SPTR32)md->start_subject; #endif cb.subject_length = (int)(md->end_subject - md->start_subject); cb.start_match = (int)(mstart - md->start_subject); @@ -1273,13 +1307,16 @@ for (;;) cb.pattern_position = GET(ecode, LINK_SIZE + 3); cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE); cb.capture_top = offset_top/2; - cb.capture_last = md->capture_last; + cb.capture_last = md->capture_last & CAPLMASK; + /* Internal change requires this for API compatibility. */ + if (cb.capture_last == 0) cb.capture_last = -1; cb.callout_data = md->callout_data; cb.mark = md->nomatch_mark; if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH); if (rrc < 0) RRETURN(rrc); } ecode += PRIV(OP_lengths)[OP_CALLOUT]; + codelink -= PRIV(OP_lengths)[OP_CALLOUT]; } condcode = ecode[LINK_SIZE+1]; @@ -1295,7 +1332,7 @@ for (;;) } else { - int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/ + unsigned int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/ condition = (recno == RREF_ANY || recno == md->recursive->group_num); /* If the test is for recursion into a specific subpattern, and it is @@ -1367,7 +1404,7 @@ for (;;) if (!condition && condcode == OP_NCREF) { - int refno = offset >> 1; + unsigned int refno = offset >> 1; pcre_uchar *slotA = md->name_table; for (i = 0; i < md->name_count; i++) @@ -1495,7 +1532,7 @@ for (;;) to close any currently open capturing brackets. */ case OP_CLOSE: - number = GET2(ecode, 1); + number = GET2(ecode, 1); /* Must be less than 65536 */ offset = number << 1; #ifdef PCRE_DEBUG @@ -1503,8 +1540,8 @@ for (;;) printf("\n"); #endif - md->capture_last = number; - if (offset >= md->offset_max) md->offset_overflow = TRUE; else + md->capture_last = (md->capture_last & OVFLMASK) | number; + if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else { md->offset_vector[offset] = md->offset_vector[md->offset_end - number]; @@ -1566,46 +1603,66 @@ for (;;) } else condassert = FALSE; + /* Loop for each branch */ + do { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4); + + /* A match means that the assertion is true; break out of the loop + that matches its alternatives. */ + if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) { mstart = md->start_match_ptr; /* In case \K reset it */ break; } + + /* If not matched, restore the previous mark setting. */ + md->mark = save_mark; - /* A COMMIT failure must fail the entire assertion, without trying any - subsequent branches. */ + /* See comment in the code for capturing groups above about handling + THEN. */ - if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH); + if (rrc == MATCH_THEN) + { + next = ecode + GET(ecode,1); + if (md->start_match_ptr < next && + (*ecode == OP_ALT || *next == OP_ALT)) + rrc = MATCH_NOMATCH; + } - /* PCRE does not allow THEN to escape beyond an assertion; it - is treated as NOMATCH. */ + /* Anything other than NOMATCH causes the entire assertion to fail, + passing back the return code. This includes COMMIT, SKIP, PRUNE and an + uncaptured THEN, which means they take their normal effect. This + consistent approach does not always have exactly the same effect as in + Perl. */ - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode += GET(ecode, 1); } - while (*ecode == OP_ALT); + while (*ecode == OP_ALT); /* Continue for next alternative */ + /* If we have tried all the alternative branches, the assertion has + failed. If not, we broke out after a match. */ + if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH); /* If checking an assertion for a condition, return MATCH_MATCH. */ if (condassert) RRETURN(MATCH_MATCH); - /* Continue from after the assertion, updating the offsets high water - mark, since extracts may have been taken during the assertion. */ + /* Continue from after a successful assertion, updating the offsets high + water mark, since extracts may have been taken during the assertion. */ do ecode += GET(ecode,1); while (*ecode == OP_ALT); ecode += 1 + LINK_SIZE; offset_top = md->end_offset_top; continue; - /* Negative assertion: all branches must fail to match. Encountering SKIP, - PRUNE, or COMMIT means we must assume failure without checking subsequent - branches. */ + /* Negative assertion: all branches must fail to match for the assertion to + succeed. */ case OP_ASSERT_NOT: case OP_ASSERTBACK_NOT: @@ -1617,28 +1674,64 @@ for (;;) } else condassert = FALSE; + /* Loop for each alternative branch. */ + do { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5); - md->mark = save_mark; - if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH); - if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT) + md->mark = save_mark; /* Always restore the mark setting */ + + switch(rrc) { - do ecode += GET(ecode,1); while (*ecode == OP_ALT); + case MATCH_MATCH: /* A successful match means */ + case MATCH_ACCEPT: /* the assertion has failed. */ + RRETURN(MATCH_NOMATCH); + + case MATCH_NOMATCH: /* Carry on with next branch */ break; + + /* See comment in the code for capturing groups above about handling + THEN. */ + + case MATCH_THEN: + next = ecode + GET(ecode,1); + if (md->start_match_ptr < next && + (*ecode == OP_ALT || *next == OP_ALT)) + { + rrc = MATCH_NOMATCH; + break; + } + /* Otherwise fall through. */ + + /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole + assertion to fail to match, without considering any more alternatives. + Failing to match means the assertion is true. This is a consistent + approach, but does not always have the same effect as in Perl. */ + + case MATCH_COMMIT: + case MATCH_SKIP: + case MATCH_SKIP_ARG: + case MATCH_PRUNE: + do ecode += GET(ecode,1); while (*ecode == OP_ALT); + goto NEG_ASSERT_TRUE; /* Break out of alternation loop */ + + /* Anything else is an error */ + + default: + RRETURN(rrc); } - /* PCRE does not allow THEN to escape beyond an assertion; it is treated - as NOMATCH. */ + /* Continue with next branch */ - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); ecode += GET(ecode,1); } while (*ecode == OP_ALT); - if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */ + /* All branches in the assertion failed to match. */ - ecode += 1 + LINK_SIZE; + NEG_ASSERT_TRUE: + if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */ + ecode += 1 + LINK_SIZE; /* Continue with current branch */ continue; /* Move the subject pointer back. This occurs only at the start of @@ -1685,10 +1778,12 @@ for (;;) cb.version = 2; /* Version 1 of the callout block */ cb.callout_number = ecode[1]; cb.offset_vector = md->offset_vector; -#ifdef COMPILE_PCRE8 +#if defined COMPILE_PCRE8 cb.subject = (PCRE_SPTR)md->start_subject; -#else +#elif defined COMPILE_PCRE16 cb.subject = (PCRE_SPTR16)md->start_subject; +#elif defined COMPILE_PCRE32 + cb.subject = (PCRE_SPTR32)md->start_subject; #endif cb.subject_length = (int)(md->end_subject - md->start_subject); cb.start_match = (int)(mstart - md->start_subject); @@ -1696,7 +1791,9 @@ for (;;) cb.pattern_position = GET(ecode, 2); cb.next_item_length = GET(ecode, 2 + LINK_SIZE); cb.capture_top = offset_top/2; - cb.capture_last = md->capture_last; + cb.capture_last = md->capture_last & CAPLMASK; + /* Internal change requires this for API compatibility. */ + if (cb.capture_last == 0) cb.capture_last = -1; cb.callout_data = md->callout_data; cb.mark = md->nomatch_mark; if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH); @@ -1725,7 +1822,7 @@ for (;;) case OP_RECURSE: { recursion_info *ri; - int recno; + unsigned int recno; callpat = md->start_code + GET(ecode, 1); recno = (callpat == md->start_code)? 0 : @@ -1742,6 +1839,7 @@ for (;;) /* Add to "recursing stack" */ new_recursive.group_num = recno; + new_recursive.saved_capture_last = md->capture_last; new_recursive.subject_position = eptr; new_recursive.prevrec = md->recursive; md->recursive = &new_recursive; @@ -1765,8 +1863,9 @@ for (;;) new_recursive.saved_max * sizeof(int)); /* OK, now we can do the recursion. After processing each alternative, - restore the offset data. If there were nested recursions, md->recursive - might be changed, so reset it before looping. */ + restore the offset data and the last captured value. If there were nested + recursions, md->recursive might be changed, so reset it before looping. + */ DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); cbegroup = (*callpat >= OP_SBRA); @@ -1777,6 +1876,7 @@ for (;;) md, eptrb, RM6); memcpy(md->offset_vector, new_recursive.offset_save, new_recursive.saved_max * sizeof(int)); + md->capture_last = new_recursive.saved_capture_last; md->recursive = new_recursive.prevrec; if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) { @@ -1793,11 +1893,16 @@ for (;;) goto RECURSION_MATCHED; /* Exit loop; end processing */ } - /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; it - is treated as NOMATCH. */ + /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a + recursion; they cause a NOMATCH for the entire recursion. These codes + are defined in a range that can be tested for. */ - else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN && - rrc != MATCH_COMMIT) + if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX) + RRETURN(MATCH_NOMATCH); + + /* Any return code other than NOMATCH is an error. */ + + if (rrc != MATCH_NOMATCH) { DPRINTF(("Recursion gave error %d\n", rrc)); if (new_recursive.offset_save != stacksave) @@ -1927,8 +2032,8 @@ for (;;) /* Deal with capturing */ - md->capture_last = number; - if (offset >= md->offset_max) md->offset_overflow = TRUE; else + md->capture_last = (md->capture_last & OVFLMASK) | number; + if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else { /* If offset is greater than offset_top, it means that we are "skipping" a capturing group, and that group's offsets must be marked @@ -2079,7 +2184,7 @@ for (;;) eptr + 1 >= md->end_subject && NLBLOCK->nltype == NLTYPE_FIXED && NLBLOCK->nllen == 2 && - *eptr == NLBLOCK->nl[0]) + RAWUCHARTEST(eptr) == NLBLOCK->nl[0]) { md->hitend = TRUE; if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); @@ -2123,7 +2228,7 @@ for (;;) eptr + 1 >= md->end_subject && NLBLOCK->nltype == NLTYPE_FIXED && NLBLOCK->nllen == 2 && - *eptr == NLBLOCK->nl[0]) + RAWUCHARTEST(eptr) == NLBLOCK->nl[0]) { md->hitend = TRUE; if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); @@ -2266,7 +2371,7 @@ for (;;) eptr + 1 >= md->end_subject && NLBLOCK->nltype == NLTYPE_FIXED && NLBLOCK->nllen == 2 && - *eptr == NLBLOCK->nl[0]) + RAWUCHARTEST(eptr) == NLBLOCK->nl[0]) { md->hitend = TRUE; if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); @@ -2415,22 +2520,24 @@ for (;;) { default: RRETURN(MATCH_NOMATCH); - case 0x000d: + case CHAR_CR: if (eptr >= md->end_subject) { SCHECK_PARTIAL(); } - else if (*eptr == 0x0a) eptr++; + else if (RAWUCHARTEST(eptr) == CHAR_LF) eptr++; break; - case 0x000a: + case CHAR_LF: break; - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } @@ -2446,27 +2553,8 @@ for (;;) GETCHARINCTEST(c, eptr); switch(c) { + HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */ default: break; - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ - RRETURN(MATCH_NOMATCH); } ecode++; break; @@ -2480,27 +2568,8 @@ for (;;) GETCHARINCTEST(c, eptr); switch(c) { + HSPACE_CASES: break; /* Byte and multibyte cases */ default: RRETURN(MATCH_NOMATCH); - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ - break; } ecode++; break; @@ -2514,15 +2583,8 @@ for (;;) GETCHARINCTEST(c, eptr); switch(c) { + VSPACE_CASES: RRETURN(MATCH_NOMATCH); default: break; - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ - RRETURN(MATCH_NOMATCH); } ecode++; break; @@ -2536,15 +2598,8 @@ for (;;) GETCHARINCTEST(c, eptr); switch(c) { + VSPACE_CASES: break; default: RRETURN(MATCH_NOMATCH); - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ - break; } ecode++; break; @@ -2562,6 +2617,7 @@ for (;;) } GETCHARINCTEST(c, eptr); { + const pcre_uint32 *cp; const ucd_record *prop = GET_UCD(c); switch(ecode[1]) @@ -2622,6 +2678,24 @@ for (;;) RRETURN(MATCH_NOMATCH); break; + case PT_CLIST: + cp = PRIV(ucd_caseless_sets) + ecode[2]; + for (;;) + { + if (c < *cp) + { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; } + if (c == *cp++) + { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } } + } + break; + + case PT_UCNC: + if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000) == (op == OP_NOTPROP)) + RRETURN(MATCH_NOMATCH); + break; + /* This should never occur */ default: @@ -2641,19 +2715,25 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - GETCHARINCTEST(c, eptr); - if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH); - while (eptr < md->end_subject) + else { - int len = 1; - if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } - if (UCD_CATEGORY(c) != ucp_M) break; - eptr += len; + int lgb, rgb; + GETCHARINCTEST(c, eptr); + lgb = UCD_GRAPHBREAK(c); + while (eptr < md->end_subject) + { + int len = 1; + if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } + rgb = UCD_GRAPHBREAK(c); + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; + lgb = rgb; + eptr += len; + } } CHECK_PARTIAL(); ecode++; break; -#endif +#endif /* SUPPORT_UCP */ /* Match a back reference, possibly repeatedly. Look past the end of the @@ -3162,7 +3242,7 @@ for (;;) CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ RRETURN(MATCH_NOMATCH); } - while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH); + while (length-- > 0) if (*ecode++ != RAWUCHARINC(eptr)) RRETURN(MATCH_NOMATCH); } else #endif @@ -3202,8 +3282,8 @@ for (;;) if (fc < 128) { - if (md->lcc[fc] - != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH); + pcre_uint32 cc = RAWUCHAR(eptr); + if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH); ecode++; eptr++; } @@ -3214,7 +3294,7 @@ for (;;) else { - unsigned int dc; + pcre_uint32 dc; GETCHARINC(dc, eptr); ecode += length; @@ -3307,8 +3387,23 @@ for (;;) max = rep_max[c]; /* zero for max => infinity */ if (max == 0) max = INT_MAX; - /* Common code for all repeated single-character matches. */ + /* Common code for all repeated single-character matches. We first check + for the minimum number of characters. If the minimum equals the maximum, we + are done. Otherwise, if minimizing, check the rest of the pattern for a + match; if there isn't one, advance up to the maximum, one character at a + time. + If maximizing, advance up to the maximum number of matching characters, + until eptr is past the end of the maximum run. If possessive, we are + then done (no backing up). Otherwise, match at this position; anything + other than no match is immediately returned. For nomatch, back up one + character, unless we are matching \R and the last thing matched was + \r\n, in which case, back up two bytes. When we reach the first optional + character position, we can save stack by doing a tail recurse. + + The various UTF/non-UTF and caseful/caseless cases are handled separately, + for speed. */ + REPEATCHAR: #ifdef SUPPORT_UTF if (utf) @@ -3324,7 +3419,7 @@ for (;;) if (length > 1) { #ifdef SUPPORT_UCP - unsigned int othercase; + pcre_uint32 othercase; if (op >= OP_STARI && /* Caseless */ (othercase = UCD_OTHERCASE(fc)) != fc) oclength = PRIV(ord2utf)(othercase, occhars); @@ -3391,13 +3486,12 @@ for (;;) } } - if (possessive) continue; - + if (possessive) continue; /* No backtracking */ for(;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM23); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr == pp) { RRETURN(MATCH_NOMATCH); } #ifdef SUPPORT_UCP eptr--; BACKCHAR(eptr); @@ -3451,12 +3545,14 @@ for (;;) for (i = 1; i <= min; i++) { + pcre_uint32 cc; /* Faster than pcre_uchar */ if (eptr >= md->end_subject) { SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH); + cc = RAWUCHARTEST(eptr); + if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH); eptr++; } if (min == max) continue; @@ -3464,6 +3560,7 @@ for (;;) { for (fi = min;; fi++) { + pcre_uint32 cc; /* Faster than pcre_uchar */ RMATCH(eptr, ecode, offset_top, md, eptrb, RM24); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max) RRETURN(MATCH_NOMATCH); @@ -3472,7 +3569,8 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH); + cc = RAWUCHARTEST(eptr); + if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH); eptr++; } /* Control never gets here */ @@ -3482,19 +3580,21 @@ for (;;) pp = eptr; for (i = min; i < max; i++) { + pcre_uint32 cc; /* Faster than pcre_uchar */ if (eptr >= md->end_subject) { SCHECK_PARTIAL(); break; } - if (fc != *eptr && foc != *eptr) break; + cc = RAWUCHARTEST(eptr); + if (fc != cc && foc != cc) break; eptr++; } - if (possessive) continue; - - while (eptr >= pp) + if (possessive) continue; /* No backtracking */ + for (;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM25); eptr--; if (rrc != MATCH_NOMATCH) RRETURN(rrc); @@ -3515,7 +3615,7 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (fc != *eptr++) RRETURN(MATCH_NOMATCH); + if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH); } if (min == max) continue; @@ -3532,7 +3632,7 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (fc != *eptr++) RRETURN(MATCH_NOMATCH); + if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH); } /* Control never gets here */ } @@ -3546,13 +3646,13 @@ for (;;) SCHECK_PARTIAL(); break; } - if (fc != *eptr) break; + if (fc != RAWUCHARTEST(eptr)) break; eptr++; } - if (possessive) continue; - - while (eptr >= pp) + if (possessive) continue; /* No backtracking */ + for (;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM27); eptr--; if (rrc != MATCH_NOMATCH) RRETURN(rrc); @@ -3575,7 +3675,7 @@ for (;;) #ifdef SUPPORT_UTF if (utf) { - register unsigned int ch, och; + register pcre_uint32 ch, och; ecode++; GETCHARINC(ch, ecode); @@ -3602,7 +3702,7 @@ for (;;) else #endif { - register unsigned int ch = ecode[1]; + register pcre_uint32 ch = ecode[1]; c = *eptr++; if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c)) RRETURN(MATCH_NOMATCH); @@ -3716,7 +3816,7 @@ for (;;) #ifdef SUPPORT_UTF if (utf) { - register unsigned int d; + register pcre_uint32 d; for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) @@ -3729,7 +3829,7 @@ for (;;) } } else -#endif +#endif /* SUPPORT_UTF */ /* Not UTF mode */ { for (i = 1; i <= min; i++) @@ -3751,7 +3851,7 @@ for (;;) #ifdef SUPPORT_UTF if (utf) { - register unsigned int d; + register pcre_uint32 d; for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, eptrb, RM28); @@ -3767,7 +3867,7 @@ for (;;) } } else -#endif +#endif /*SUPPORT_UTF */ /* Not UTF mode */ { for (fi = min;; fi++) @@ -3796,7 +3896,7 @@ for (;;) #ifdef SUPPORT_UTF if (utf) { - register unsigned int d; + register pcre_uint32 d; for (i = min; i < max; i++) { int len = 1; @@ -3809,17 +3909,18 @@ for (;;) if (fc == d || (unsigned int)foc == d) break; eptr += len; } - if (possessive) continue; + if (possessive) continue; /* No backtracking */ for(;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM30); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ + eptr--; BACKCHAR(eptr); } } else -#endif +#endif /* SUPPORT_UTF */ /* Not UTF mode */ { for (i = min; i < max; i++) @@ -3832,9 +3933,10 @@ for (;;) if (fc == *eptr || foc == *eptr) break; eptr++; } - if (possessive) continue; - while (eptr >= pp) + if (possessive) continue; /* No backtracking */ + for (;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM31); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr--; @@ -3853,7 +3955,7 @@ for (;;) #ifdef SUPPORT_UTF if (utf) { - register unsigned int d; + register pcre_uint32 d; for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) @@ -3887,7 +3989,7 @@ for (;;) #ifdef SUPPORT_UTF if (utf) { - register unsigned int d; + register pcre_uint32 d; for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, eptrb, RM32); @@ -3931,7 +4033,7 @@ for (;;) #ifdef SUPPORT_UTF if (utf) { - register unsigned int d; + register pcre_uint32 d; for (i = min; i < max; i++) { int len = 1; @@ -3944,12 +4046,13 @@ for (;;) if (fc == d) break; eptr += len; } - if (possessive) continue; + if (possessive) continue; /* No backtracking */ for(;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM34); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ + eptr--; BACKCHAR(eptr); } } @@ -3967,9 +4070,10 @@ for (;;) if (fc == *eptr) break; eptr++; } - if (possessive) continue; - while (eptr >= pp) + if (possessive) continue; /* No backtracking */ + for (;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM35); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr--; @@ -4207,6 +4311,43 @@ for (;;) } break; + case PT_CLIST: + for (i = 1; i <= min; i++) + { + const pcre_uint32 *cp; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(c, eptr); + cp = PRIV(ucd_caseless_sets) + prop_value; + for (;;) + { + if (c < *cp) + { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } } + if (c == *cp++) + { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; } + } + } + break; + + case PT_UCNC: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(c, eptr); + if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + break; + /* This should not occur */ default: @@ -4226,14 +4367,20 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - GETCHARINCTEST(c, eptr); - if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH); - while (eptr < md->end_subject) + else { - int len = 1; - if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } - if (UCD_CATEGORY(c) != ucp_M) break; - eptr += len; + int lgb, rgb; + GETCHARINCTEST(c, eptr); + lgb = UCD_GRAPHBREAK(c); + while (eptr < md->end_subject) + { + int len = 1; + if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } + rgb = UCD_GRAPHBREAK(c); + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; + lgb = rgb; + eptr += len; + } } CHECK_PARTIAL(); } @@ -4260,7 +4407,7 @@ for (;;) eptr + 1 >= md->end_subject && NLBLOCK->nltype == NLTYPE_FIXED && NLBLOCK->nllen == 2 && - *eptr == NLBLOCK->nl[0]) + RAWUCHAR(eptr) == NLBLOCK->nl[0]) { md->hitend = TRUE; if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); @@ -4301,18 +4448,20 @@ for (;;) { default: RRETURN(MATCH_NOMATCH); - case 0x000d: - if (eptr < md->end_subject && *eptr == 0x0a) eptr++; + case CHAR_CR: + if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++; break; - case 0x000a: + case CHAR_LF: break; - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } @@ -4330,27 +4479,8 @@ for (;;) GETCHARINC(c, eptr); switch(c) { + HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */ default: break; - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ - RRETURN(MATCH_NOMATCH); } } break; @@ -4366,27 +4496,8 @@ for (;;) GETCHARINC(c, eptr); switch(c) { + HSPACE_CASES: break; /* Byte and multibyte cases */ default: RRETURN(MATCH_NOMATCH); - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ - break; } } break; @@ -4402,15 +4513,8 @@ for (;;) GETCHARINC(c, eptr); switch(c) { + VSPACE_CASES: RRETURN(MATCH_NOMATCH); default: break; - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ - RRETURN(MATCH_NOMATCH); } } break; @@ -4426,15 +4530,8 @@ for (;;) GETCHARINC(c, eptr); switch(c) { + VSPACE_CASES: break; default: RRETURN(MATCH_NOMATCH); - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ - break; } } break; @@ -4456,12 +4553,14 @@ for (;;) case OP_DIGIT: for (i = 1; i <= min; i++) { + pcre_uint32 cc; if (eptr >= md->end_subject) { SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0) + cc = RAWUCHAR(eptr); + if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); eptr++; /* No need to skip more bytes - we know it's a 1-byte character */ @@ -4471,12 +4570,14 @@ for (;;) case OP_NOT_WHITESPACE: for (i = 1; i <= min; i++) { + pcre_uint32 cc; if (eptr >= md->end_subject) { SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0) + cc = RAWUCHAR(eptr); + if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); eptr++; ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); @@ -4486,12 +4587,14 @@ for (;;) case OP_WHITESPACE: for (i = 1; i <= min; i++) { + pcre_uint32 cc; if (eptr >= md->end_subject) { SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0) + cc = RAWUCHAR(eptr); + if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); eptr++; /* No need to skip more bytes - we know it's a 1-byte character */ @@ -4501,12 +4604,14 @@ for (;;) case OP_NOT_WORDCHAR: for (i = 1; i <= min; i++) { + pcre_uint32 cc; if (eptr >= md->end_subject) { SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0) + cc = RAWUCHAR(eptr); + if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); eptr++; ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); @@ -4516,12 +4621,14 @@ for (;;) case OP_WORDCHAR: for (i = 1; i <= min; i++) { + pcre_uint32 cc; if (eptr >= md->end_subject) { SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0) + cc = RAWUCHAR(eptr); + if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); eptr++; /* No need to skip more bytes - we know it's a 1-byte character */ @@ -4592,17 +4699,17 @@ for (;;) { default: RRETURN(MATCH_NOMATCH); - case 0x000d: - if (eptr < md->end_subject && *eptr == 0x0a) eptr++; + case CHAR_CR: + if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++; break; - case 0x000a: + case CHAR_LF: break; - case 0x000b: - case 0x000c: - case 0x0085: -#ifdef COMPILE_PCRE16 + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 case 0x2028: case 0x2029: #endif @@ -4623,26 +4730,9 @@ for (;;) switch(*eptr++) { default: break; - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ -#ifdef COMPILE_PCRE16 - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ + HSPACE_BYTE_CASES: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + HSPACE_MULTIBYTE_CASES: #endif RRETURN(MATCH_NOMATCH); } @@ -4660,26 +4750,9 @@ for (;;) switch(*eptr++) { default: RRETURN(MATCH_NOMATCH); - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ -#ifdef COMPILE_PCRE16 - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ + HSPACE_BYTE_CASES: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + HSPACE_MULTIBYTE_CASES: #endif break; } @@ -4696,17 +4769,12 @@ for (;;) } switch(*eptr++) { - default: break; - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ -#ifdef COMPILE_PCRE16 - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ + VSPACE_BYTE_CASES: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + VSPACE_MULTIBYTE_CASES: #endif RRETURN(MATCH_NOMATCH); + default: break; } } break; @@ -4722,14 +4790,9 @@ for (;;) switch(*eptr++) { default: RRETURN(MATCH_NOMATCH); - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ -#ifdef COMPILE_PCRE16 - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ + VSPACE_BYTE_CASES: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + VSPACE_MULTIBYTE_CASES: #endif break; } @@ -5007,8 +5070,50 @@ for (;;) } /* Control never gets here */ - /* This should never occur */ + case PT_CLIST: + for (fi = min;; fi++) + { + const pcre_uint32 *cp; + RMATCH(eptr, ecode, offset_top, md, eptrb, RM67); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(c, eptr); + cp = PRIV(ucd_caseless_sets) + prop_value; + for (;;) + { + if (c < *cp) + { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } } + if (c == *cp++) + { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; } + } + } + /* Control never gets here */ + case PT_UCNC: + for (fi = min;; fi++) + { + RMATCH(eptr, ecode, offset_top, md, eptrb, RM68); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(c, eptr); + if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + /* This should never occur */ default: RRETURN(PCRE_ERROR_INTERNAL); } @@ -5029,14 +5134,20 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - GETCHARINCTEST(c, eptr); - if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH); - while (eptr < md->end_subject) + else { - int len = 1; - if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } - if (UCD_CATEGORY(c) != ucp_M) break; - eptr += len; + int lgb, rgb; + GETCHARINCTEST(c, eptr); + lgb = UCD_GRAPHBREAK(c); + while (eptr < md->end_subject) + { + int len = 1; + if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } + rgb = UCD_GRAPHBREAK(c); + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; + lgb = rgb; + eptr += len; + } } CHECK_PARTIAL(); } @@ -5082,17 +5193,20 @@ for (;;) switch(c) { default: RRETURN(MATCH_NOMATCH); - case 0x000d: - if (eptr < md->end_subject && *eptr == 0x0a) eptr++; + case CHAR_CR: + if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++; break; - case 0x000a: + + case CHAR_LF: break; - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } @@ -5101,84 +5215,32 @@ for (;;) case OP_NOT_HSPACE: switch(c) { + HSPACE_CASES: RRETURN(MATCH_NOMATCH); default: break; - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ - RRETURN(MATCH_NOMATCH); } break; case OP_HSPACE: switch(c) { + HSPACE_CASES: break; default: RRETURN(MATCH_NOMATCH); - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ - break; } break; case OP_NOT_VSPACE: switch(c) { + VSPACE_CASES: RRETURN(MATCH_NOMATCH); default: break; - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ - RRETURN(MATCH_NOMATCH); } break; case OP_VSPACE: switch(c) { + VSPACE_CASES: break; default: RRETURN(MATCH_NOMATCH); - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ - break; } break; @@ -5256,17 +5318,17 @@ for (;;) switch(c) { default: RRETURN(MATCH_NOMATCH); - case 0x000d: - if (eptr < md->end_subject && *eptr == 0x0a) eptr++; + case CHAR_CR: + if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++; break; - case 0x000a: + case CHAR_LF: break; - case 0x000b: - case 0x000c: - case 0x0085: -#ifdef COMPILE_PCRE16 + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 case 0x2028: case 0x2029: #endif @@ -5279,26 +5341,9 @@ for (;;) switch(c) { default: break; - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ -#ifdef COMPILE_PCRE16 - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ + HSPACE_BYTE_CASES: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + HSPACE_MULTIBYTE_CASES: #endif RRETURN(MATCH_NOMATCH); } @@ -5308,26 +5353,9 @@ for (;;) switch(c) { default: RRETURN(MATCH_NOMATCH); - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ -#ifdef COMPILE_PCRE16 - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ + HSPACE_BYTE_CASES: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + HSPACE_MULTIBYTE_CASES: #endif break; } @@ -5337,14 +5365,9 @@ for (;;) switch(c) { default: break; - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ -#ifdef COMPILE_PCRE16 - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ + VSPACE_BYTE_CASES: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + VSPACE_MULTIBYTE_CASES: #endif RRETURN(MATCH_NOMATCH); } @@ -5354,14 +5377,9 @@ for (;;) switch(c) { default: RRETURN(MATCH_NOMATCH); - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ -#ifdef COMPILE_PCRE16 - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ + VSPACE_BYTE_CASES: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + VSPACE_MULTIBYTE_CASES: #endif break; } @@ -5565,18 +5583,61 @@ for (;;) } break; + case PT_CLIST: + for (i = min; i < max; i++) + { + const pcre_uint32 *cp; + int len = 1; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(c, eptr, len); + cp = PRIV(ucd_caseless_sets) + prop_value; + for (;;) + { + if (c < *cp) + { if (prop_fail_result) break; else goto GOT_MAX; } + if (c == *cp++) + { if (prop_fail_result) goto GOT_MAX; else break; } + } + eptr += len; + } + GOT_MAX: + break; + + case PT_UCNC: + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(c, eptr, len); + if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000) == prop_fail_result) + break; + eptr += len; + } + break; + default: RRETURN(PCRE_ERROR_INTERNAL); } /* eptr is now past the end of the maximum run */ - if (possessive) continue; + if (possessive) continue; /* No backtracking */ for(;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM44); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ + eptr--; if (utf) BACKCHAR(eptr); } } @@ -5588,34 +5649,38 @@ for (;;) { for (i = min; i < max; i++) { - int len = 1; if (eptr >= md->end_subject) { SCHECK_PARTIAL(); break; } - if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } - if (UCD_CATEGORY(c) == ucp_M) break; - eptr += len; - while (eptr < md->end_subject) + else { - len = 1; - if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } - if (UCD_CATEGORY(c) != ucp_M) break; - eptr += len; + int lgb, rgb; + GETCHARINCTEST(c, eptr); + lgb = UCD_GRAPHBREAK(c); + while (eptr < md->end_subject) + { + int len = 1; + if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } + rgb = UCD_GRAPHBREAK(c); + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; + lgb = rgb; + eptr += len; + } } CHECK_PARTIAL(); } /* eptr is now past the end of the maximum run */ - if (possessive) continue; - + if (possessive) continue; /* No backtracking */ for(;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM45); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ + eptr--; for (;;) /* Move back over one extended */ { if (!utf) c = *eptr; else @@ -5652,7 +5717,7 @@ for (;;) eptr + 1 >= md->end_subject && NLBLOCK->nltype == NLTYPE_FIXED && NLBLOCK->nllen == 2 && - *eptr == NLBLOCK->nl[0]) + RAWUCHAR(eptr) == NLBLOCK->nl[0]) { md->hitend = TRUE; if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); @@ -5678,7 +5743,7 @@ for (;;) eptr + 1 >= md->end_subject && NLBLOCK->nltype == NLTYPE_FIXED && NLBLOCK->nllen == 2 && - *eptr == NLBLOCK->nl[0]) + RAWUCHAR(eptr) == NLBLOCK->nl[0]) { md->hitend = TRUE; if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); @@ -5732,17 +5797,20 @@ for (;;) break; } GETCHARLEN(c, eptr, len); - if (c == 0x000d) + if (c == CHAR_CR) { if (++eptr >= md->end_subject) break; - if (*eptr == 0x000a) eptr++; + if (RAWUCHAR(eptr) == CHAR_LF) eptr++; } else { - if (c != 0x000a && + if (c != CHAR_LF && (md->bsr_anycrlf || - (c != 0x000b && c != 0x000c && - c != 0x0085 && c != 0x2028 && c != 0x2029))) + (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL +#ifndef EBCDIC + && c != 0x2028 && c != 0x2029 +#endif /* Not EBCDIC */ + ))) break; eptr += len; } @@ -5763,28 +5831,8 @@ for (;;) GETCHARLEN(c, eptr, len); switch(c) { + HSPACE_CASES: gotspace = TRUE; break; default: gotspace = FALSE; break; - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ - gotspace = TRUE; - break; } if (gotspace == (ctype == OP_NOT_HSPACE)) break; eptr += len; @@ -5805,16 +5853,8 @@ for (;;) GETCHARLEN(c, eptr, len); switch(c) { + VSPACE_CASES: gotspace = TRUE; break; default: gotspace = FALSE; break; - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ - gotspace = TRUE; - break; } if (gotspace == (ctype == OP_NOT_VSPACE)) break; eptr += len; @@ -5915,21 +5955,16 @@ for (;;) RRETURN(PCRE_ERROR_INTERNAL); } - /* eptr is now past the end of the maximum run. If possessive, we are - done (no backing up). Otherwise, match at this position; anything other - than no match is immediately returned. For nomatch, back up one - character, unless we are matching \R and the last thing matched was - \r\n, in which case, back up two bytes. */ - - if (possessive) continue; + if (possessive) continue; /* No backtracking */ for(;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM46); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ + eptr--; BACKCHAR(eptr); - if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' && - eptr[-1] == '\r') eptr--; + if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL && + RAWUCHAR(eptr - 1) == CHAR_CR) eptr--; } } else @@ -5980,19 +6015,19 @@ for (;;) break; } c = *eptr; - if (c == 0x000d) + if (c == CHAR_CR) { if (++eptr >= md->end_subject) break; - if (*eptr == 0x000a) eptr++; + if (*eptr == CHAR_LF) eptr++; } else { - if (c != 0x000a && (md->bsr_anycrlf || - (c != 0x000b && c != 0x000c && c != 0x0085 -#ifdef COMPILE_PCRE16 - && c != 0x2028 && c != 0x2029 + if (c != CHAR_LF && (md->bsr_anycrlf || + (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + && c != 0x2028 && c != 0x2029 #endif - ))) break; + ))) break; eptr++; } } @@ -6006,15 +6041,17 @@ for (;;) SCHECK_PARTIAL(); break; } - c = *eptr; - if (c == 0x09 || c == 0x20 || c == 0xa0 -#ifdef COMPILE_PCRE16 - || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A) - || c == 0x202f || c == 0x205f || c == 0x3000 + switch(*eptr) + { + default: eptr++; break; + HSPACE_BYTE_CASES: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + HSPACE_MULTIBYTE_CASES: #endif - ) break; - eptr++; + goto ENDLOOP00; + } } + ENDLOOP00: break; case OP_HSPACE: @@ -6025,15 +6062,17 @@ for (;;) SCHECK_PARTIAL(); break; } - c = *eptr; - if (c != 0x09 && c != 0x20 && c != 0xa0 -#ifdef COMPILE_PCRE16 - && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A) - && c != 0x202f && c != 0x205f && c != 0x3000 + switch(*eptr) + { + default: goto ENDLOOP01; + HSPACE_BYTE_CASES: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + HSPACE_MULTIBYTE_CASES: #endif - ) break; - eptr++; + eptr++; break; + } } + ENDLOOP01: break; case OP_NOT_VSPACE: @@ -6044,14 +6083,17 @@ for (;;) SCHECK_PARTIAL(); break; } - c = *eptr; - if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85 -#ifdef COMPILE_PCRE16 - || c == 0x2028 || c == 0x2029 + switch(*eptr) + { + default: eptr++; break; + VSPACE_BYTE_CASES: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + VSPACE_MULTIBYTE_CASES: #endif - ) break; - eptr++; + goto ENDLOOP02; + } } + ENDLOOP02: break; case OP_VSPACE: @@ -6062,14 +6104,17 @@ for (;;) SCHECK_PARTIAL(); break; } - c = *eptr; - if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85 -#ifdef COMPILE_PCRE16 - && c != 0x2028 && c != 0x2029 + switch(*eptr) + { + default: goto ENDLOOP03; + VSPACE_BYTE_CASES: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + VSPACE_MULTIBYTE_CASES: #endif - ) break; - eptr++; + eptr++; break; + } } + ENDLOOP03: break; case OP_NOT_DIGIT: @@ -6154,20 +6199,15 @@ for (;;) RRETURN(PCRE_ERROR_INTERNAL); } - /* eptr is now past the end of the maximum run. If possessive, we are - done (no backing up). Otherwise, match at this position; anything other - than no match is immediately returned. For nomatch, back up one - character (byte), unless we are matching \R and the last thing matched - was \r\n, in which case, back up two bytes. */ - - if (possessive) continue; - while (eptr >= pp) + if (possessive) continue; /* No backtracking */ + for (;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM47); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr--; - if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' && - eptr[-1] == '\r') eptr--; + if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF && + eptr[-1] == CHAR_CR) eptr--; } } @@ -6217,14 +6257,11 @@ switch (frame->Xwhere) LBL(32) LBL(34) LBL(42) LBL(46) #ifdef SUPPORT_UCP LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45) - LBL(59) LBL(60) LBL(61) LBL(62) + LBL(59) LBL(60) LBL(61) LBL(62) LBL(67) LBL(68) #endif /* SUPPORT_UCP */ #endif /* SUPPORT_UTF */ default: DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere)); - -printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere); - return PCRE_ERROR_INTERNAL; } #undef LBL @@ -6336,16 +6373,21 @@ Returns: > 0 => success; value is the number < -1 => some kind of unexpected problem */ -#ifdef COMPILE_PCRE8 +#if defined COMPILE_PCRE8 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_exec(const pcre *argument_re, const pcre_extra *extra_data, PCRE_SPTR subject, int length, int start_offset, int options, int *offsets, int offsetcount) -#else +#elif defined COMPILE_PCRE16 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data, PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets, int offsetcount) +#elif defined COMPILE_PCRE32 +PCRE_EXP_DEFN int PCRE_CALL_CONVENTION +pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data, + PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets, + int offsetcount) #endif { int rc, ocount, arg_offset_max; @@ -6368,6 +6410,7 @@ const pcre_uint8 *start_bits = NULL; PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset; PCRE_PUCHAR end_subject; PCRE_PUCHAR start_partial = NULL; +PCRE_PUCHAR match_partial; PCRE_PUCHAR req_char_ptr = start_match - 1; const pcre_study_data *study; @@ -6399,6 +6442,7 @@ if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; +if (length < 0) return PCRE_ERROR_BADLENGTH; if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET; /* Check that the first field in the block is the magic number. If it is not, @@ -6436,19 +6480,22 @@ if (utf && (options & PCRE_NO_UTF8_CHECK) == 0) offsets[0] = erroroffset; offsets[1] = errorcode; } -#ifdef COMPILE_PCRE16 - return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)? - PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16; -#else +#if defined COMPILE_PCRE8 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)? PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8; +#elif defined COMPILE_PCRE16 + return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)? + PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16; +#elif defined COMPILE_PCRE32 + return PCRE_ERROR_BADUTF32; #endif } - +#if defined COMPILE_PCRE8 || defined COMPILE_PCRE16 /* Check that a start_offset points to the start of a UTF character. */ if (start_offset > 0 && start_offset < length && NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset])) return PCRE_ERROR_BADUTF8_OFFSET; +#endif } #endif @@ -6462,17 +6509,15 @@ if (extra_data != NULL && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT | PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT && extra_data->executable_jit != NULL - && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL | - PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART | - PCRE_PARTIAL_SOFT | PCRE_PARTIAL_HARD)) == 0) + && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0) { - rc = PRIV(jit_exec)(re, extra_data, (const pcre_uchar *)subject, length, + rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length, start_offset, options, offsets, offsetcount); /* PCRE_ERROR_NULL means that the selected normal or partial matching mode is not compiled. In this case we simply fallback to interpreter. */ - if (rc != PCRE_ERROR_NULL) return rc; + if (rc != PCRE_ERROR_JIT_BADOPTION) return rc; } #endif @@ -6495,6 +6540,8 @@ md->callout_data = NULL; tables = re->tables; +/* The two limit values override the defaults, whatever their value. */ + if (extra_data != NULL) { register unsigned int flags = extra_data->flags; @@ -6509,6 +6556,15 @@ if (extra_data != NULL) if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables; } +/* Limits in the regex override only if they are smaller. */ + +if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit) + md->match_limit = re->limit_match; + +if ((re->flags & PCRE_RLSET) != 0 && + re->limit_recursion < md->match_limit_recursion) + md->match_limit_recursion = re->limit_recursion; + /* If the exec call supplied NULL for tables, use the inbuilt ones. This is a feature that makes it possible to save compiled regex and re-use them in other programs later. */ @@ -6534,7 +6590,7 @@ end_subject = md->end_subject; md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; md->use_ucp = (re->options & PCRE_UCP) != 0; md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0; -md->ignore_skip_arg = FALSE; +md->ignore_skip_arg = 0; /* Some options are unpacked into BOOL variables in the hope that testing them will be faster than individual option bits. */ @@ -6644,11 +6700,9 @@ if (re->top_backref > 0 && re->top_backref >= ocount/3 DPRINTF(("Got memory to hold back references\n")); } else md->offset_vector = offsets; - md->offset_end = ocount; md->offset_max = (2*ocount)/3; -md->offset_overflow = FALSE; -md->capture_last = -1; +md->capture_last = 0; /* Reset the working variable associated with each extraction. These should never be used unless previously set, but they get saved and restored, and so we @@ -6756,12 +6810,14 @@ for(;;) if (has_first_char) { + pcre_uchar smc; + if (first_char != first_char2) while (start_match < end_subject && - *start_match != first_char && *start_match != first_char2) + (smc = RAWUCHARTEST(start_match)) != first_char && smc != first_char2) start_match++; else - while (start_match < end_subject && *start_match != first_char) + while (start_match < end_subject && RAWUCHARTEST(start_match) != first_char) start_match++; } @@ -6793,7 +6849,7 @@ for(;;) if (start_match[-1] == CHAR_CR && (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && start_match < end_subject && - *start_match == CHAR_NL) + RAWUCHARTEST(start_match) == CHAR_NL) start_match++; } } @@ -6804,7 +6860,7 @@ for(;;) { while (start_match < end_subject) { - register unsigned int c = *start_match; + register pcre_uint32 c = RAWUCHARTEST(start_match); #ifndef COMPILE_PCRE8 if (c > 255) c = 255; #endif @@ -6872,7 +6928,7 @@ for(;;) { while (p < end_subject) { - register int pp = *p++; + register pcre_uint32 pp = RAWUCHARINCTEST(p); if (pp == req_char || pp == req_char2) { p--; break; } } } @@ -6880,7 +6936,7 @@ for(;;) { while (p < end_subject) { - if (*p++ == req_char) { p--; break; } + if (RAWUCHARINCTEST(p) == req_char) { p--; break; } } } @@ -6916,8 +6972,13 @@ for(;;) md->match_call_count = 0; md->match_function_type = 0; md->end_offset_top = 0; + md->skip_arg_count = 0; rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0); - if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr; + if (md->hitend && start_partial == NULL) + { + start_partial = md->start_used_ptr; + match_partial = start_match; + } switch(rc) { @@ -6930,14 +6991,14 @@ for(;;) case MATCH_SKIP_ARG: new_start_match = start_match; - md->ignore_skip_arg = TRUE; + md->ignore_skip_arg = md->skip_arg_count; break; - /* SKIP passes back the next starting point explicitly, but if it is the - same as the match we have just done, treat it as NOMATCH. */ + /* SKIP passes back the next starting point explicitly, but if it is no + greater than the match we have just done, treat it as NOMATCH. */ case MATCH_SKIP: - if (md->start_match_ptr != start_match) + if (md->start_match_ptr > start_match) { new_start_match = md->start_match_ptr; break; @@ -6945,12 +7006,12 @@ for(;;) /* Fall through */ /* NOMATCH and PRUNE advance by one character. THEN at this level acts - exactly like PRUNE. Unset the ignore SKIP-with-argument flag. */ + exactly like PRUNE. Unset ignore SKIP-with-argument. */ case MATCH_NOMATCH: case MATCH_PRUNE: case MATCH_THEN: - md->ignore_skip_arg = FALSE; + md->ignore_skip_arg = 0; new_start_match = start_match + 1; #ifdef SUPPORT_UTF if (utf) @@ -7043,7 +7104,7 @@ if (rc == MATCH_MATCH || rc == MATCH_ACCEPT) (arg_offset_max - 2) * sizeof(int)); DPRINTF(("Copied offsets from temporary memory\n")); } - if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE; + if (md->end_offset_top > arg_offset_max) md->capture_last |= OVFLBIT; DPRINTF(("Freeing temporary memory\n")); (PUBL(free))(md->offset_vector); } @@ -7051,7 +7112,8 @@ if (rc == MATCH_MATCH || rc == MATCH_ACCEPT) /* Set the return code to the number of captured strings, or 0 if there were too many to fit into the vector. */ - rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)? + rc = ((md->capture_last & OVFLBIT) != 0 && + md->end_offset_top >= arg_offset_max)? 0 : md->end_offset_top/2; /* If there is space in the offset vector, set any unused pairs at the end of @@ -7124,6 +7186,8 @@ if (start_partial != NULL) { offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject); offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject); + if (offsetcount > 2) + offsets[2] = (int)(match_partial - (PCRE_PUCHAR)subject); } rc = PCRE_ERROR_PARTIAL; }