--- embedaddon/pcre/pcre_exec.c 2012/02/21 23:50:25 1.1.1.2 +++ embedaddon/pcre/pcre_exec.c 2014/06/15 19:46:04 1.1.1.5 @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2012 University of Cambridge + Copyright (c) 1997-2013 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -37,7 +37,6 @@ POSSIBILITY OF SUCH DAMAGE. ----------------------------------------------------------------------------- */ - /* This module contains pcre_exec(), the externally visible function that does pattern matching using an NFA algorithm, trying to mimic Perl as closely as possible. There are also some static supporting functions. */ @@ -57,6 +56,20 @@ possible. There are also some static supporting functi #undef min #undef max +/* The md->capture_last field uses the lower 16 bits for the last captured +substring (which can never be greater than 65535) and a bit in the top half +to mean "capture vector overflowed". This odd way of doing things was +implemented when it was realized that preserving and restoring the overflow bit +whenever the last capture number was saved/restored made for a neater +interface, and doing it this way saved on (a) another variable, which would +have increased the stack frame size (a big NO-NO in PCRE) and (b) another +separate set of save/restore instructions. The following defines are used in +implementing this. */ + +#define CAPLMASK 0x0000ffff /* The bits used for last_capture */ +#define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */ +#define OVFLBIT 0x00010000 /* The bit that is set for overflow */ + /* Values for setting in md->match_function_type to indicate two special types of call to match(). We do it this way to save on using another stack variable, as stack usage is to be discouraged. */ @@ -74,13 +87,17 @@ defined PCRE_ERROR_xxx codes, which are all negative. negative to avoid the external error codes. */ #define MATCH_ACCEPT (-999) -#define MATCH_COMMIT (-998) -#define MATCH_KETRPOS (-997) -#define MATCH_ONCE (-996) +#define MATCH_KETRPOS (-998) +#define MATCH_ONCE (-997) +/* The next 5 must be kept together and in sequence so that a test that checks +for any one of them can use a range. */ +#define MATCH_COMMIT (-996) #define MATCH_PRUNE (-995) #define MATCH_SKIP (-994) #define MATCH_SKIP_ARG (-993) #define MATCH_THEN (-992) +#define MATCH_BACKTRACK_MAX MATCH_THEN +#define MATCH_BACKTRACK_MIN MATCH_COMMIT /* Maximum number of ints of offset to save on the stack for recursive calls. If the offset vector is bigger, malloc is used. This should be a multiple of 3, @@ -90,11 +107,9 @@ because the offset vector is always a multiple of 3 lo /* Min and max values for the common repeats; for the maxima, 0 => infinity */ -static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; -static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; +static const char rep_min[] = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, }; +static const char rep_max[] = { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, }; - - #ifdef PCRE_DEBUG /************************************************* * Debugging function to print chars * @@ -115,10 +130,11 @@ Returns: nothing static void pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md) { -unsigned int c; +pcre_uint32 c; +BOOL utf = md->utf; if (is_subject && length > md->end_subject - p) length = md->end_subject - p; while (length-- > 0) - if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c); + if (isprint(c = RAWUCHARINCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c); } #endif @@ -140,7 +156,9 @@ Arguments: md points to match data block caseless TRUE if caseless -Returns: < 0 if not matched, otherwise the number of subject bytes matched +Returns: >= 0 the number of subject bytes matched + -1 no match + -2 partial match; always given if at end subject */ static int @@ -149,6 +167,9 @@ match_ref(int offset, register PCRE_PUCHAR eptr, int l { PCRE_PUCHAR eptr_start = eptr; register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset]; +#if defined SUPPORT_UTF && defined SUPPORT_UCP +BOOL utf = md->utf; +#endif #ifdef PCRE_DEBUG if (eptr >= md->end_subject) @@ -163,7 +184,8 @@ pchars(p, length, FALSE, md); printf("\n"); #endif -/* Always fail if reference not set (and not JavaScript compatible). */ +/* Always fail if reference not set (and not JavaScript compatible - in that +case the length is passed as zero). */ if (length < 0) return -1; @@ -173,39 +195,51 @@ ASCII characters. */ if (caseless) { -#ifdef SUPPORT_UTF -#ifdef SUPPORT_UCP - if (md->utf) +#if defined SUPPORT_UTF && defined SUPPORT_UCP + if (utf) { /* Match characters up to the end of the reference. NOTE: the number of - bytes matched may differ, because there are some characters whose upper and - lower case versions code as different numbers of bytes. For example, U+023A - (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8); - a sequence of 3 of the former uses 6 bytes, as does a sequence of two of - the latter. It is important, therefore, to check the length along the - reference, not along the subject (earlier code did this wrong). */ + data units matched may differ, because in UTF-8 there are some characters + whose upper and lower case versions code have different numbers of bytes. + For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 + (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a + sequence of two of the latter. It is important, therefore, to check the + length along the reference, not along the subject (earlier code did this + wrong). */ PCRE_PUCHAR endptr = p + length; while (p < endptr) { - int c, d; - if (eptr >= md->end_subject) return -1; + pcre_uint32 c, d; + const ucd_record *ur; + if (eptr >= md->end_subject) return -2; /* Partial match */ GETCHARINC(c, eptr); GETCHARINC(d, p); - if (c != d && c != UCD_OTHERCASE(d)) return -1; + ur = GET_UCD(d); + if (c != d && c != d + ur->other_case) + { + const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset; + for (;;) + { + if (c < *pp) return -1; + if (c == *pp++) break; + } + } } } else #endif -#endif /* The same code works when not in UTF-8 mode and in UTF-8 mode when there is no UCP support. */ { - if (eptr + length > md->end_subject) return -1; while (length-- > 0) { - if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1; + pcre_uint32 cc, cp; + if (eptr >= md->end_subject) return -2; /* Partial match */ + cc = RAWUCHARTEST(eptr); + cp = RAWUCHARTEST(p); + if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1; p++; eptr++; } @@ -217,8 +251,11 @@ are in UTF-8 mode. */ else { - if (eptr + length > md->end_subject) return -1; - while (length-- > 0) if (*p++ != *eptr++) return -1; + while (length-- > 0) + { + if (eptr >= md->end_subject) return -2; /* Partial match */ + if (RAWUCHARINCTEST(p) != RAWUCHARINCTEST(eptr)) return -1; + } } return (int)(eptr - eptr_start); @@ -273,7 +310,7 @@ enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40, RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50, RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60, - RM61, RM62, RM63, RM64, RM65, RM66 }; + RM61, RM62, RM63, RM64, RM65, RM66, RM67 }; /* These versions of the macros use the stack, as normal. There are debugging versions and production versions. Note that the "rw" argument of RMATCH isn't @@ -291,7 +328,7 @@ actually used in this definition. */ } #define RRETURN(ra) \ { \ - printf("match() returned %d from line %d ", ra, __LINE__); \ + printf("match() returned %d from line %d\n", ra, __LINE__); \ return ra; \ } #else @@ -311,9 +348,15 @@ argument of match(), which never changes. */ #define RMATCH(ra,rb,rc,rd,re,rw)\ {\ - heapframe *newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\ - if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\ - frame->Xwhere = rw; \ + heapframe *newframe = frame->Xnextframe;\ + if (newframe == NULL)\ + {\ + newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\ + if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\ + newframe->Xnextframe = NULL;\ + frame->Xnextframe = newframe;\ + }\ + frame->Xwhere = rw;\ newframe->Xeptr = ra;\ newframe->Xecode = rb;\ newframe->Xmstart = mstart;\ @@ -332,7 +375,6 @@ argument of match(), which never changes. */ {\ heapframe *oldframe = frame;\ frame = oldframe->Xprevframe;\ - if (oldframe != &frame_zero) (PUBL(stack_free))(oldframe);\ if (frame != NULL)\ {\ rrc = ra;\ @@ -346,6 +388,7 @@ argument of match(), which never changes. */ typedef struct heapframe { struct heapframe *Xprevframe; + struct heapframe *Xnextframe; /* Function arguments that may change */ @@ -376,7 +419,7 @@ typedef struct heapframe { #ifdef SUPPORT_UCP int Xprop_type; - int Xprop_value; + unsigned int Xprop_value; int Xprop_fail_result; int Xoclength; pcre_uchar Xocchars[6]; @@ -389,10 +432,10 @@ typedef struct heapframe { int Xlength; int Xmax; int Xmin; - int Xnumber; + unsigned int Xnumber; int Xoffset; - int Xop; - int Xsave_capture_last; + unsigned int Xop; + pcre_int32 Xsave_capture_last; int Xsave_offset1, Xsave_offset2, Xsave_offset3; int Xstacksave[REC_STACK_SAVE_MAX]; @@ -477,7 +520,7 @@ so they can be ordinary variables in all cases. Mark s register int rrc; /* Returns from recursive calls */ register int i; /* Used for loops not involving calls to RMATCH() */ -register unsigned int c; /* Character values not kept over RMATCH() calls */ +register pcre_uint32 c; /* Character values not kept over RMATCH() calls */ register BOOL utf; /* Local copy of UTF flag for speed */ BOOL minimize, possessive; /* Quantifier options */ @@ -492,9 +535,7 @@ the top-level on the stack rather than malloc-ing them boost in many cases where there is not much "recursion". */ #ifdef NO_RECURSE -heapframe frame_zero; -heapframe *frame = &frame_zero; -frame->Xprevframe = NULL; /* Marks the top level */ +heapframe *frame = (heapframe *)md->match_frames_base; /* Copy in the original argument variables */ @@ -596,7 +637,7 @@ BOOL prev_is_word; #ifdef SUPPORT_UCP int prop_type; -int prop_value; +unsigned int prop_value; int prop_fail_result; int oclength; pcre_uchar occhars[6]; @@ -607,10 +648,10 @@ int ctype; int length; int max; int min; -int number; +unsigned int number; int offset; -int op; -int save_capture_last; +unsigned int op; +pcre_int32 save_capture_last; int save_offset1, save_offset2, save_offset3; int stacksave[REC_STACK_SAVE_MAX]; @@ -728,7 +769,7 @@ for (;;) unaltered. */ else if (rrc == MATCH_SKIP_ARG && - STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0) + STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0) { md->start_match_ptr = eptr; RRETURN(MATCH_SKIP); @@ -738,23 +779,16 @@ for (;;) case OP_FAIL: RRETURN(MATCH_NOMATCH); - /* COMMIT overrides PRUNE, SKIP, and THEN */ - case OP_COMMIT: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, RM52); - if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && - rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG && - rrc != MATCH_THEN) - RRETURN(rrc); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); RRETURN(MATCH_COMMIT); - /* PRUNE overrides THEN */ - case OP_PRUNE: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, RM51); - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); RRETURN(MATCH_PRUNE); case OP_PRUNE_ARG: @@ -764,38 +798,39 @@ for (;;) eptrb, RM56); if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) && md->mark == NULL) md->mark = ecode + 2; - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); RRETURN(MATCH_PRUNE); - /* SKIP overrides PRUNE and THEN */ - case OP_SKIP: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, RM53); - if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN) - RRETURN(rrc); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); md->start_match_ptr = eptr; /* Pass back current position */ RRETURN(MATCH_SKIP); /* Note that, for Perl compatibility, SKIP with an argument does NOT set - nomatch_mark. There is a flag that disables this opcode when re-matching a - pattern that ended with a SKIP for which there was not a matching MARK. */ + nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was + not a matching mark, we have to re-run the match, ignoring the SKIP_ARG + that failed and any that precede it (either they also failed, or were not + triggered). To do this, we maintain a count of executed SKIP_ARGs. If a + SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg + set to the count of the one that failed. */ case OP_SKIP_ARG: - if (md->ignore_skip_arg) + md->skip_arg_count++; + if (md->skip_arg_count <= md->ignore_skip_arg) { ecode += PRIV(OP_lengths)[*ecode] + ecode[1]; break; } RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md, eptrb, RM57); - if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN) - RRETURN(rrc); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); /* Pass back the current skip name by overloading md->start_match_ptr and returning the special MATCH_SKIP_ARG return code. This will either be caught by a matching MARK, or get to the top, where it causes a rematch - with the md->ignore_skip_arg flag set. */ + with md->ignore_skip_arg set to the value of md->skip_arg_count. */ md->start_match_ptr = ecode + 2; RRETURN(MATCH_SKIP_ARG); @@ -897,7 +932,6 @@ for (;;) } else /* OP_KETRMAX */ { - md->match_function_type = MATCH_CBEGROUP; RMATCH(eptr, prev, offset_top, md, eptrb, RM66); if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode += 1 + LINK_SIZE; @@ -1026,7 +1060,8 @@ for (;;) for (;;) { - if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP; + if (op >= OP_SBRA || op == OP_ONCE) + md->match_function_type = MATCH_CBEGROUP; /* If this is not a possibly empty group, and there are no (*THEN)s in the pattern, and this is the final alternative, optimize as described @@ -1041,6 +1076,7 @@ for (;;) /* In all other cases, we have to make another call to match(). */ save_mark = md->mark; + save_capture_last = md->capture_last; RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, RM2); @@ -1072,6 +1108,7 @@ for (;;) ecode += GET(ecode, 1); md->mark = save_mark; if (*ecode != OP_ALT) break; + md->capture_last = save_capture_last; } RRETURN(MATCH_NOMATCH); @@ -1134,6 +1171,7 @@ for (;;) ecode = md->start_code + code_offset; save_capture_last = md->capture_last; matched_once = TRUE; + mstart = md->start_match_ptr; /* In case \K changed it */ continue; } @@ -1193,6 +1231,7 @@ for (;;) POSSESSIVE_NON_CAPTURE: matched_once = FALSE; code_offset = (int)(ecode - md->start_code); + save_capture_last = md->capture_last; for (;;) { @@ -1205,6 +1244,7 @@ for (;;) eptr = md->end_match_ptr; ecode = md->start_code + code_offset; matched_once = TRUE; + mstart = md->start_match_ptr; /* In case \K reset it */ continue; } @@ -1222,6 +1262,7 @@ for (;;) if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode += GET(ecode, 1); if (*ecode != OP_ALT) break; + md->capture_last = save_capture_last; } if (matched_once || allow_zero) @@ -1233,243 +1274,167 @@ for (;;) /* Control never reaches here. */ - /* Conditional group: compilation checked that there are no more than - two branches. If the condition is false, skipping the first branch takes us - past the end if there is only one branch, but that's OK because that is - exactly what going to the ket would do. */ + /* Conditional group: compilation checked that there are no more than two + branches. If the condition is false, skipping the first branch takes us + past the end of the item if there is only one branch, but that's exactly + what we want. */ case OP_COND: case OP_SCOND: - codelink = GET(ecode, 1); + /* The variable codelink will be added to ecode when the condition is + false, to get to the second branch. Setting it to the offset to the ALT + or KET, then incrementing ecode achieves this effect. We now have ecode + pointing to the condition or callout. */ + + codelink = GET(ecode, 1); /* Offset to the second branch */ + ecode += 1 + LINK_SIZE; /* From this opcode */ + /* Because of the way auto-callout works during compile, a callout item is inserted between OP_COND and an assertion condition. */ - if (ecode[LINK_SIZE+1] == OP_CALLOUT) + if (*ecode == OP_CALLOUT) { if (PUBL(callout) != NULL) { PUBL(callout_block) cb; cb.version = 2; /* Version 1 of the callout block */ - cb.callout_number = ecode[LINK_SIZE+2]; + cb.callout_number = ecode[1]; cb.offset_vector = md->offset_vector; -#ifdef COMPILE_PCRE8 +#if defined COMPILE_PCRE8 cb.subject = (PCRE_SPTR)md->start_subject; -#else +#elif defined COMPILE_PCRE16 cb.subject = (PCRE_SPTR16)md->start_subject; +#elif defined COMPILE_PCRE32 + cb.subject = (PCRE_SPTR32)md->start_subject; #endif cb.subject_length = (int)(md->end_subject - md->start_subject); cb.start_match = (int)(mstart - md->start_subject); cb.current_position = (int)(eptr - md->start_subject); - cb.pattern_position = GET(ecode, LINK_SIZE + 3); - cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE); + cb.pattern_position = GET(ecode, 2); + cb.next_item_length = GET(ecode, 2 + LINK_SIZE); cb.capture_top = offset_top/2; - cb.capture_last = md->capture_last; + cb.capture_last = md->capture_last & CAPLMASK; + /* Internal change requires this for API compatibility. */ + if (cb.capture_last == 0) cb.capture_last = -1; cb.callout_data = md->callout_data; cb.mark = md->nomatch_mark; if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH); if (rrc < 0) RRETURN(rrc); } + + /* Advance ecode past the callout, so it now points to the condition. We + must adjust codelink so that the value of ecode+codelink is unchanged. */ + ecode += PRIV(OP_lengths)[OP_CALLOUT]; + codelink -= PRIV(OP_lengths)[OP_CALLOUT]; } - condcode = ecode[LINK_SIZE+1]; + /* Test the various possible conditions */ - /* Now see what the actual condition is */ - - if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */ + condition = FALSE; + switch(condcode = *ecode) { - if (md->recursive == NULL) /* Not recursing => FALSE */ + case OP_RREF: /* Numbered group recursion test */ + if (md->recursive != NULL) /* Not recursing => FALSE */ { - condition = FALSE; - ecode += GET(ecode, 1); - } - else - { - int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/ + unsigned int recno = GET2(ecode, 1); /* Recursion group number*/ condition = (recno == RREF_ANY || recno == md->recursive->group_num); + } + break; - /* If the test is for recursion into a specific subpattern, and it is - false, but the test was set up by name, scan the table to see if the - name refers to any other numbers, and test them. The condition is true - if any one is set. */ - - if (!condition && condcode == OP_NRREF) + case OP_DNRREF: /* Duplicate named group recursion test */ + if (md->recursive != NULL) + { + int count = GET2(ecode, 1 + IMM2_SIZE); + pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size; + while (count-- > 0) { - pcre_uchar *slotA = md->name_table; - for (i = 0; i < md->name_count; i++) - { - if (GET2(slotA, 0) == recno) break; - slotA += md->name_entry_size; - } - - /* Found a name for the number - there can be only one; duplicate - names for different numbers are allowed, but not vice versa. First - scan down for duplicates. */ - - if (i < md->name_count) - { - pcre_uchar *slotB = slotA; - while (slotB > md->name_table) - { - slotB -= md->name_entry_size; - if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0) - { - condition = GET2(slotB, 0) == md->recursive->group_num; - if (condition) break; - } - else break; - } - - /* Scan up for duplicates */ - - if (!condition) - { - slotB = slotA; - for (i++; i < md->name_count; i++) - { - slotB += md->name_entry_size; - if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0) - { - condition = GET2(slotB, 0) == md->recursive->group_num; - if (condition) break; - } - else break; - } - } - } + unsigned int recno = GET2(slot, 0); + condition = recno == md->recursive->group_num; + if (condition) break; + slot += md->name_entry_size; } - - /* Chose branch according to the condition */ - - ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1); } - } + break; - else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */ - { - offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */ + case OP_CREF: /* Numbered group used test */ + offset = GET2(ecode, 1) << 1; /* Doubled ref number */ condition = offset < offset_top && md->offset_vector[offset] >= 0; + break; - /* If the numbered capture is unset, but the reference was by name, - scan the table to see if the name refers to any other numbers, and test - them. The condition is true if any one is set. This is tediously similar - to the code above, but not close enough to try to amalgamate. */ - - if (!condition && condcode == OP_NCREF) + case OP_DNCREF: /* Duplicate named group used test */ { - int refno = offset >> 1; - pcre_uchar *slotA = md->name_table; - - for (i = 0; i < md->name_count; i++) + int count = GET2(ecode, 1 + IMM2_SIZE); + pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size; + while (count-- > 0) { - if (GET2(slotA, 0) == refno) break; - slotA += md->name_entry_size; + offset = GET2(slot, 0) << 1; + condition = offset < offset_top && md->offset_vector[offset] >= 0; + if (condition) break; + slot += md->name_entry_size; } - - /* Found a name for the number - there can be only one; duplicate names - for different numbers are allowed, but not vice versa. First scan down - for duplicates. */ - - if (i < md->name_count) - { - pcre_uchar *slotB = slotA; - while (slotB > md->name_table) - { - slotB -= md->name_entry_size; - if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0) - { - offset = GET2(slotB, 0) << 1; - condition = offset < offset_top && - md->offset_vector[offset] >= 0; - if (condition) break; - } - else break; - } - - /* Scan up for duplicates */ - - if (!condition) - { - slotB = slotA; - for (i++; i < md->name_count; i++) - { - slotB += md->name_entry_size; - if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0) - { - offset = GET2(slotB, 0) << 1; - condition = offset < offset_top && - md->offset_vector[offset] >= 0; - if (condition) break; - } - else break; - } - } - } } + break; - /* Chose branch according to the condition */ + case OP_DEF: /* DEFINE - always false */ + break; - ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1); - } + /* The condition is an assertion. Call match() to evaluate it - setting + md->match_function_type to MATCH_CONDASSERT causes it to stop at the end + of an assertion. */ - else if (condcode == OP_DEF) /* DEFINE - always false */ - { - condition = FALSE; - ecode += GET(ecode, 1); - } - - /* The condition is an assertion. Call match() to evaluate it - setting - md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of - an assertion. */ - - else - { + default: md->match_function_type = MATCH_CONDASSERT; - RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3); + RMATCH(eptr, ecode, offset_top, md, NULL, RM3); if (rrc == MATCH_MATCH) { if (md->end_offset_top > offset_top) offset_top = md->end_offset_top; /* Captures may have happened */ condition = TRUE; - ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2); + + /* Advance ecode past the assertion to the start of the first branch, + but adjust it so that the general choosing code below works. */ + + ecode += GET(ecode, 1); while (*ecode == OP_ALT) ecode += GET(ecode, 1); + ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode]; } /* PCRE doesn't allow the effect of (*THEN) to escape beyond an - assertion; it is therefore treated as NOMATCH. */ + assertion; it is therefore treated as NOMATCH. Any other return is an + error. */ else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) { RRETURN(rrc); /* Need braces because of following else */ } - else - { - condition = FALSE; - ecode += codelink; - } + break; } - /* We are now at the branch that is to be obeyed. As there is only one, can - use tail recursion to avoid using another stack frame, except when there is - unlimited repeat of a possibly empty group. In the latter case, a recursive - call to match() is always required, unless the second alternative doesn't - exist, in which case we can just plough on. Note that, for compatibility - with Perl, the | in a conditional group is NOT treated as creating two - alternatives. If a THEN is encountered in the branch, it propagates out to - the enclosing alternative (unless nested in a deeper set of alternatives, - of course). */ + /* Choose branch according to the condition */ - if (condition || *ecode == OP_ALT) + ecode += condition? PRIV(OP_lengths)[condcode] : codelink; + + /* We are now at the branch that is to be obeyed. As there is only one, we + can use tail recursion to avoid using another stack frame, except when + there is unlimited repeat of a possibly empty group. In the latter case, a + recursive call to match() is always required, unless the second alternative + doesn't exist, in which case we can just plough on. Note that, for + compatibility with Perl, the | in a conditional group is NOT treated as + creating two alternatives. If a THEN is encountered in the branch, it + propagates out to the enclosing alternative (unless nested in a deeper set + of alternatives, of course). */ + + if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT) { if (op != OP_SCOND) { - ecode += 1 + LINK_SIZE; goto TAIL_RECURSE; } md->match_function_type = MATCH_CBEGROUP; - RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49); + RMATCH(eptr, ecode, offset_top, md, eptrb, RM49); RRETURN(rrc); } @@ -1477,7 +1442,6 @@ for (;;) else { - ecode += 1 + LINK_SIZE; } break; @@ -1486,7 +1450,7 @@ for (;;) to close any currently open capturing brackets. */ case OP_CLOSE: - number = GET2(ecode, 1); + number = GET2(ecode, 1); /* Must be less than 65536 */ offset = number << 1; #ifdef PCRE_DEBUG @@ -1494,8 +1458,8 @@ for (;;) printf("\n"); #endif - md->capture_last = number; - if (offset >= md->offset_max) md->offset_overflow = TRUE; else + md->capture_last = (md->capture_last & OVFLMASK) | number; + if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else { md->offset_vector[offset] = md->offset_vector[md->offset_end - number]; @@ -1557,41 +1521,66 @@ for (;;) } else condassert = FALSE; + /* Loop for each branch */ + do { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4); + + /* A match means that the assertion is true; break out of the loop + that matches its alternatives. */ + if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) { mstart = md->start_match_ptr; /* In case \K reset it */ break; } - /* PCRE does not allow THEN to escape beyond an assertion; it is treated - as NOMATCH. */ + /* If not matched, restore the previous mark setting. */ - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); - ecode += GET(ecode, 1); md->mark = save_mark; + + /* See comment in the code for capturing groups above about handling + THEN. */ + + if (rrc == MATCH_THEN) + { + next = ecode + GET(ecode,1); + if (md->start_match_ptr < next && + (*ecode == OP_ALT || *next == OP_ALT)) + rrc = MATCH_NOMATCH; + } + + /* Anything other than NOMATCH causes the entire assertion to fail, + passing back the return code. This includes COMMIT, SKIP, PRUNE and an + uncaptured THEN, which means they take their normal effect. This + consistent approach does not always have exactly the same effect as in + Perl. */ + + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + ecode += GET(ecode, 1); } - while (*ecode == OP_ALT); + while (*ecode == OP_ALT); /* Continue for next alternative */ + /* If we have tried all the alternative branches, the assertion has + failed. If not, we broke out after a match. */ + if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH); /* If checking an assertion for a condition, return MATCH_MATCH. */ if (condassert) RRETURN(MATCH_MATCH); - /* Continue from after the assertion, updating the offsets high water - mark, since extracts may have been taken during the assertion. */ + /* Continue from after a successful assertion, updating the offsets high + water mark, since extracts may have been taken during the assertion. */ do ecode += GET(ecode,1); while (*ecode == OP_ALT); ecode += 1 + LINK_SIZE; offset_top = md->end_offset_top; continue; - /* Negative assertion: all branches must fail to match. Encountering SKIP, - PRUNE, or COMMIT means we must assume failure without checking subsequent - branches. */ + /* Negative assertion: all branches must fail to match for the assertion to + succeed. */ case OP_ASSERT_NOT: case OP_ASSERTBACK_NOT: @@ -1603,28 +1592,64 @@ for (;;) } else condassert = FALSE; + /* Loop for each alternative branch. */ + do { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5); - md->mark = save_mark; - if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH); - if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT) + md->mark = save_mark; /* Always restore the mark setting */ + + switch(rrc) { - do ecode += GET(ecode,1); while (*ecode == OP_ALT); + case MATCH_MATCH: /* A successful match means */ + case MATCH_ACCEPT: /* the assertion has failed. */ + RRETURN(MATCH_NOMATCH); + + case MATCH_NOMATCH: /* Carry on with next branch */ break; + + /* See comment in the code for capturing groups above about handling + THEN. */ + + case MATCH_THEN: + next = ecode + GET(ecode,1); + if (md->start_match_ptr < next && + (*ecode == OP_ALT || *next == OP_ALT)) + { + rrc = MATCH_NOMATCH; + break; + } + /* Otherwise fall through. */ + + /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole + assertion to fail to match, without considering any more alternatives. + Failing to match means the assertion is true. This is a consistent + approach, but does not always have the same effect as in Perl. */ + + case MATCH_COMMIT: + case MATCH_SKIP: + case MATCH_SKIP_ARG: + case MATCH_PRUNE: + do ecode += GET(ecode,1); while (*ecode == OP_ALT); + goto NEG_ASSERT_TRUE; /* Break out of alternation loop */ + + /* Anything else is an error */ + + default: + RRETURN(rrc); } - /* PCRE does not allow THEN to escape beyond an assertion; it is treated - as NOMATCH. */ + /* Continue with next branch */ - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); ecode += GET(ecode,1); } while (*ecode == OP_ALT); - if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */ + /* All branches in the assertion failed to match. */ - ecode += 1 + LINK_SIZE; + NEG_ASSERT_TRUE: + if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */ + ecode += 1 + LINK_SIZE; /* Continue with current branch */ continue; /* Move the subject pointer back. This occurs only at the start of @@ -1671,10 +1696,12 @@ for (;;) cb.version = 2; /* Version 1 of the callout block */ cb.callout_number = ecode[1]; cb.offset_vector = md->offset_vector; -#ifdef COMPILE_PCRE8 +#if defined COMPILE_PCRE8 cb.subject = (PCRE_SPTR)md->start_subject; -#else +#elif defined COMPILE_PCRE16 cb.subject = (PCRE_SPTR16)md->start_subject; +#elif defined COMPILE_PCRE32 + cb.subject = (PCRE_SPTR32)md->start_subject; #endif cb.subject_length = (int)(md->end_subject - md->start_subject); cb.start_match = (int)(mstart - md->start_subject); @@ -1682,7 +1709,9 @@ for (;;) cb.pattern_position = GET(ecode, 2); cb.next_item_length = GET(ecode, 2 + LINK_SIZE); cb.capture_top = offset_top/2; - cb.capture_last = md->capture_last; + cb.capture_last = md->capture_last & CAPLMASK; + /* Internal change requires this for API compatibility. */ + if (cb.capture_last == 0) cb.capture_last = -1; cb.callout_data = md->callout_data; cb.mark = md->nomatch_mark; if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH); @@ -1711,7 +1740,7 @@ for (;;) case OP_RECURSE: { recursion_info *ri; - int recno; + unsigned int recno; callpat = md->start_code + GET(ecode, 1); recno = (callpat == md->start_code)? 0 : @@ -1728,6 +1757,7 @@ for (;;) /* Add to "recursing stack" */ new_recursive.group_num = recno; + new_recursive.saved_capture_last = md->capture_last; new_recursive.subject_position = eptr; new_recursive.prevrec = md->recursive; md->recursive = &new_recursive; @@ -1751,8 +1781,9 @@ for (;;) new_recursive.saved_max * sizeof(int)); /* OK, now we can do the recursion. After processing each alternative, - restore the offset data. If there were nested recursions, md->recursive - might be changed, so reset it before looping. */ + restore the offset data and the last captured value. If there were nested + recursions, md->recursive might be changed, so reset it before looping. + */ DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); cbegroup = (*callpat >= OP_SBRA); @@ -1763,6 +1794,7 @@ for (;;) md, eptrb, RM6); memcpy(md->offset_vector, new_recursive.offset_save, new_recursive.saved_max * sizeof(int)); + md->capture_last = new_recursive.saved_capture_last; md->recursive = new_recursive.prevrec; if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) { @@ -1779,10 +1811,16 @@ for (;;) goto RECURSION_MATCHED; /* Exit loop; end processing */ } - /* PCRE does not allow THEN to escape beyond a recursion; it is treated - as NOMATCH. */ + /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a + recursion; they cause a NOMATCH for the entire recursion. These codes + are defined in a range that can be tested for. */ - else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) + if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX) + RRETURN(MATCH_NOMATCH); + + /* Any return code other than NOMATCH is an error. */ + + if (rrc != MATCH_NOMATCH) { DPRINTF(("Recursion gave error %d\n", rrc)); if (new_recursive.offset_save != stacksave) @@ -1912,8 +1950,8 @@ for (;;) /* Deal with capturing */ - md->capture_last = number; - if (offset >= md->offset_max) md->offset_overflow = TRUE; else + md->capture_last = (md->capture_last & OVFLMASK) | number; + if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else { /* If offset is greater than offset_top, it means that we are "skipping" a capturing group, and that group's offsets must be marked @@ -1969,6 +2007,7 @@ for (;;) if (*ecode == OP_KETRPOS) { + md->start_match_ptr = mstart; /* In case \K reset it */ md->end_match_ptr = eptr; md->end_offset_top = offset_top; RRETURN(MATCH_KETRPOS); @@ -1993,7 +2032,6 @@ for (;;) } if (*prev >= OP_SBRA) /* Could match an empty string */ { - md->match_function_type = MATCH_CBEGROUP; RMATCH(eptr, prev, offset_top, md, eptrb, RM50); RRETURN(rrc); } @@ -2002,7 +2040,6 @@ for (;;) } else /* OP_KETRMAX */ { - if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP; RMATCH(eptr, prev, offset_top, md, eptrb, RM13); if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH; if (rrc != MATCH_NOMATCH) RRETURN(rrc); @@ -2059,7 +2096,21 @@ for (;;) case OP_DOLLM: if (eptr < md->end_subject) - { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); } + { + if (!IS_NEWLINE(eptr)) + { + if (md->partial != 0 && + eptr + 1 >= md->end_subject && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + RAWUCHARTEST(eptr) == NLBLOCK->nl[0]) + { + md->hitend = TRUE; + if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); + } + RRETURN(MATCH_NOMATCH); + } + } else { if (md->noteol) RRETURN(MATCH_NOMATCH); @@ -2091,7 +2142,18 @@ for (;;) ASSERT_NL_OR_EOS: if (eptr < md->end_subject && (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) + { + if (md->partial != 0 && + eptr + 1 >= md->end_subject && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + RAWUCHARTEST(eptr) == NLBLOCK->nl[0]) + { + md->hitend = TRUE; + if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); + } RRETURN(MATCH_NOMATCH); + } /* Either at end of string or \n before end. */ @@ -2219,12 +2281,25 @@ for (;;) } break; - /* Match a single character type; inline for speed */ + /* Match any single character type except newline; have to take care with + CRLF newlines and partial matching. */ case OP_ANY: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); + if (md->partial != 0 && + eptr + 1 >= md->end_subject && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + RAWUCHARTEST(eptr) == NLBLOCK->nl[0]) + { + md->hitend = TRUE; + if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); + } + /* Fall through */ + /* Match any single character whatsoever. */ + case OP_ALLANY: if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */ { /* not be updated before SCHECK_PARTIAL. */ @@ -2364,18 +2439,24 @@ for (;;) { default: RRETURN(MATCH_NOMATCH); - case 0x000d: - if (eptr < md->end_subject && *eptr == 0x0a) eptr++; + case CHAR_CR: + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + } + else if (RAWUCHARTEST(eptr) == CHAR_LF) eptr++; break; - case 0x000a: + case CHAR_LF: break; - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } @@ -2391,27 +2472,8 @@ for (;;) GETCHARINCTEST(c, eptr); switch(c) { + HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */ default: break; - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ - RRETURN(MATCH_NOMATCH); } ecode++; break; @@ -2425,27 +2487,8 @@ for (;;) GETCHARINCTEST(c, eptr); switch(c) { + HSPACE_CASES: break; /* Byte and multibyte cases */ default: RRETURN(MATCH_NOMATCH); - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ - break; } ecode++; break; @@ -2459,15 +2502,8 @@ for (;;) GETCHARINCTEST(c, eptr); switch(c) { + VSPACE_CASES: RRETURN(MATCH_NOMATCH); default: break; - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ - RRETURN(MATCH_NOMATCH); } ecode++; break; @@ -2481,15 +2517,8 @@ for (;;) GETCHARINCTEST(c, eptr); switch(c) { + VSPACE_CASES: break; default: RRETURN(MATCH_NOMATCH); - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ - break; } ecode++; break; @@ -2507,6 +2536,7 @@ for (;;) } GETCHARINCTEST(c, eptr); { + const pcre_uint32 *cp; const ucd_record *prop = GET_UCD(c); switch(ecode[1]) @@ -2545,19 +2575,24 @@ for (;;) RRETURN(MATCH_NOMATCH); break; - case PT_SPACE: /* Perl space */ - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z || - c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR) - == (op == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - break; + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. */ + case PT_SPACE: /* Perl space */ case PT_PXSPACE: /* POSIX space */ - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z || - c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || - c == CHAR_FF || c == CHAR_CR) - == (op == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); + switch(c) + { + HSPACE_CASES: + VSPACE_CASES: + if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH); + break; + + default: + if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == + (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH); + break; + } break; case PT_WORD: @@ -2567,6 +2602,24 @@ for (;;) RRETURN(MATCH_NOMATCH); break; + case PT_CLIST: + cp = PRIV(ucd_caseless_sets) + ecode[2]; + for (;;) + { + if (c < *cp) + { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; } + if (c == *cp++) + { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } } + } + break; + + case PT_UCNC: + if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000) == (op == OP_NOTPROP)) + RRETURN(MATCH_NOMATCH); + break; + /* This should never occur */ default: @@ -2586,18 +2639,25 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - GETCHARINCTEST(c, eptr); - if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH); - while (eptr < md->end_subject) + else { - int len = 1; - if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } - if (UCD_CATEGORY(c) != ucp_M) break; - eptr += len; + int lgb, rgb; + GETCHARINCTEST(c, eptr); + lgb = UCD_GRAPHBREAK(c); + while (eptr < md->end_subject) + { + int len = 1; + if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } + rgb = UCD_GRAPHBREAK(c); + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; + lgb = rgb; + eptr += len; + } } + CHECK_PARTIAL(); ecode++; break; -#endif +#endif /* SUPPORT_UCP */ /* Match a back reference, possibly repeatedly. Look past the end of the @@ -2606,16 +2666,8 @@ for (;;) similar code to character type repeats - written out again for speed. However, if the referenced string is the empty string, always treat it as matched, any number of times (otherwise there could be infinite - loops). */ + loops). If the reference is unset, there are two possibilities: - case OP_REF: - case OP_REFI: - caseless = op == OP_REFI; - offset = GET2(ecode, 1) << 1; /* Doubled ref number */ - ecode += 1 + IMM2_SIZE; - - /* If the reference is unset, there are two possibilities: - (a) In the default, Perl-compatible state, set the length negative; this ensures that every attempt at a match fails. We can't just fail here, because of the possibility of quantifiers with zero minima. @@ -2624,8 +2676,39 @@ for (;;) so that the back reference matches an empty string. Otherwise, set the length to the length of what was matched by the - referenced subpattern. */ + referenced subpattern. + The OP_REF and OP_REFI opcodes are used for a reference to a numbered group + or to a non-duplicated named group. For a duplicated named group, OP_DNREF + and OP_DNREFI are used. In this case we must scan the list of groups to + which the name refers, and use the first one that is set. */ + + case OP_DNREF: + case OP_DNREFI: + caseless = op == OP_DNREFI; + { + int count = GET2(ecode, 1+IMM2_SIZE); + pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size; + ecode += 1 + 2*IMM2_SIZE; + + while (count-- > 0) + { + offset = GET2(slot, 0) << 1; + if (offset < offset_top && md->offset_vector[offset] >= 0) break; + slot += md->name_entry_size; + } + if (count < 0) + length = (md->jscript_compat)? 0 : -1; + else + length = md->offset_vector[offset+1] - md->offset_vector[offset]; + } + goto REF_REPEAT; + + case OP_REF: + case OP_REFI: + caseless = op == OP_REFI; + offset = GET2(ecode, 1) << 1; /* Doubled ref number */ + ecode += 1 + IMM2_SIZE; if (offset >= offset_top || md->offset_vector[offset] < 0) length = (md->jscript_compat)? 0 : -1; else @@ -2633,6 +2716,7 @@ for (;;) /* Set up for repetition, or handle the non-repeated case */ + REF_REPEAT: switch (*ecode) { case OP_CRSTAR: @@ -2660,6 +2744,7 @@ for (;;) default: /* No repeat follows */ if ((length = match_ref(offset, eptr, length, md, caseless)) < 0) { + if (length == -2) eptr = md->end_subject; /* Partial match */ CHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } @@ -2685,6 +2770,7 @@ for (;;) int slength; if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0) { + if (slength == -2) eptr = md->end_subject; /* Partial match */ CHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } @@ -2708,6 +2794,7 @@ for (;;) if (fi >= max) RRETURN(MATCH_NOMATCH); if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0) { + if (slength == -2) eptr = md->end_subject; /* Partial match */ CHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } @@ -2726,11 +2813,20 @@ for (;;) int slength; if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0) { - CHECK_PARTIAL(); + /* Can't use CHECK_PARTIAL because we don't want to update eptr in + the soft partial matching case. */ + + if (slength == -2 && md->partial != 0 && + md->end_subject > md->start_used_ptr) + { + md->hitend = TRUE; + if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); + } break; } eptr += slength; } + while (eptr >= pp) { RMATCH(eptr, ecode, offset_top, md, eptrb, RM15); @@ -2769,8 +2865,12 @@ for (;;) case OP_CRMINPLUS: case OP_CRQUERY: case OP_CRMINQUERY: + case OP_CRPOSSTAR: + case OP_CRPOSPLUS: + case OP_CRPOSQUERY: c = *ecode++ - OP_CRSTAR; - minimize = (c & 1) != 0; + if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0; + else possessive = TRUE; min = rep_min[c]; /* Pick up values from tables; */ max = rep_max[c]; /* zero for max => infinity */ if (max == 0) max = INT_MAX; @@ -2778,7 +2878,9 @@ for (;;) case OP_CRRANGE: case OP_CRMINRANGE: + case OP_CRPOSRANGE: minimize = (*ecode == OP_CRMINRANGE); + possessive = (*ecode == OP_CRPOSRANGE); min = GET2(ecode, 1); max = GET2(ecode, 1 + IMM2_SIZE); if (max == 0) max = INT_MAX; @@ -2920,6 +3022,9 @@ for (;;) if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break; eptr += len; } + + if (possessive) continue; /* No backtracking */ + for (;;) { RMATCH(eptr, ecode, offset_top, md, eptrb, RM18); @@ -2950,6 +3055,9 @@ for (;;) if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break; eptr++; } + + if (possessive) continue; /* No backtracking */ + while (eptr >= pp) { RMATCH(eptr, ecode, offset_top, md, eptrb, RM19); @@ -2965,9 +3073,10 @@ for (;;) /* Control never gets here */ - /* Match an extended character class. This opcode is encountered only - when UTF-8 mode mode is supported. Nevertheless, we may not be in UTF-8 - mode, because Unicode properties are supported in non-UTF-8 mode. */ + /* Match an extended character class. In the 8-bit library, this opcode is + encountered only when UTF-8 mode mode is supported. In the 16-bit and + 32-bit libraries, codepoints greater than 255 may be encountered even when + UTF is not supported. */ #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 case OP_XCLASS: @@ -2983,8 +3092,12 @@ for (;;) case OP_CRMINPLUS: case OP_CRQUERY: case OP_CRMINQUERY: + case OP_CRPOSSTAR: + case OP_CRPOSPLUS: + case OP_CRPOSQUERY: c = *ecode++ - OP_CRSTAR; - minimize = (c & 1) != 0; + if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0; + else possessive = TRUE; min = rep_min[c]; /* Pick up values from tables; */ max = rep_max[c]; /* zero for max => infinity */ if (max == 0) max = INT_MAX; @@ -2992,7 +3105,9 @@ for (;;) case OP_CRRANGE: case OP_CRMINRANGE: + case OP_CRPOSRANGE: minimize = (*ecode == OP_CRMINRANGE); + possessive = (*ecode == OP_CRPOSRANGE); min = GET2(ecode, 1); max = GET2(ecode, 1 + IMM2_SIZE); if (max == 0) max = INT_MAX; @@ -3064,6 +3179,9 @@ for (;;) if (!PRIV(xclass)(c, data, utf)) break; eptr += len; } + + if (possessive) continue; /* No backtracking */ + for(;;) { RMATCH(eptr, ecode, offset_top, md, eptrb, RM21); @@ -3094,7 +3212,7 @@ for (;;) CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ RRETURN(MATCH_NOMATCH); } - while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH); + while (length-- > 0) if (*ecode++ != RAWUCHARINC(eptr)) RRETURN(MATCH_NOMATCH); } else #endif @@ -3134,8 +3252,8 @@ for (;;) if (fc < 128) { - if (md->lcc[fc] - != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH); + pcre_uint32 cc = RAWUCHAR(eptr); + if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH); ecode++; eptr++; } @@ -3146,7 +3264,7 @@ for (;;) else { - unsigned int dc; + pcre_uint32 dc; GETCHARINC(dc, eptr); ecode += length; @@ -3239,8 +3357,23 @@ for (;;) max = rep_max[c]; /* zero for max => infinity */ if (max == 0) max = INT_MAX; - /* Common code for all repeated single-character matches. */ + /* Common code for all repeated single-character matches. We first check + for the minimum number of characters. If the minimum equals the maximum, we + are done. Otherwise, if minimizing, check the rest of the pattern for a + match; if there isn't one, advance up to the maximum, one character at a + time. + If maximizing, advance up to the maximum number of matching characters, + until eptr is past the end of the maximum run. If possessive, we are + then done (no backing up). Otherwise, match at this position; anything + other than no match is immediately returned. For nomatch, back up one + character, unless we are matching \R and the last thing matched was + \r\n, in which case, back up two bytes. When we reach the first optional + character position, we can save stack by doing a tail recurse. + + The various UTF/non-UTF and caseful/caseless cases are handled separately, + for speed. */ + REPEATCHAR: #ifdef SUPPORT_UTF if (utf) @@ -3256,7 +3389,7 @@ for (;;) if (length > 1) { #ifdef SUPPORT_UCP - unsigned int othercase; + pcre_uint32 othercase; if (op >= OP_STARI && /* Caseless */ (othercase = UCD_OTHERCASE(fc)) != fc) oclength = PRIV(ord2utf)(othercase, occhars); @@ -3323,13 +3456,12 @@ for (;;) } } - if (possessive) continue; - + if (possessive) continue; /* No backtracking */ for(;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM23); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr == pp) { RRETURN(MATCH_NOMATCH); } #ifdef SUPPORT_UCP eptr--; BACKCHAR(eptr); @@ -3360,7 +3492,7 @@ for (;;) maximizing, find the maximum number of characters and work backwards. */ DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max, - max, eptr)); + max, (char *)eptr)); if (op >= OP_STARI) /* Caseless */ { @@ -3383,12 +3515,14 @@ for (;;) for (i = 1; i <= min; i++) { + pcre_uint32 cc; /* Faster than pcre_uchar */ if (eptr >= md->end_subject) { SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH); + cc = RAWUCHARTEST(eptr); + if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH); eptr++; } if (min == max) continue; @@ -3396,6 +3530,7 @@ for (;;) { for (fi = min;; fi++) { + pcre_uint32 cc; /* Faster than pcre_uchar */ RMATCH(eptr, ecode, offset_top, md, eptrb, RM24); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max) RRETURN(MATCH_NOMATCH); @@ -3404,7 +3539,8 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH); + cc = RAWUCHARTEST(eptr); + if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH); eptr++; } /* Control never gets here */ @@ -3414,26 +3550,26 @@ for (;;) pp = eptr; for (i = min; i < max; i++) { + pcre_uint32 cc; /* Faster than pcre_uchar */ if (eptr >= md->end_subject) { SCHECK_PARTIAL(); break; } - if (fc != *eptr && foc != *eptr) break; + cc = RAWUCHARTEST(eptr); + if (fc != cc && foc != cc) break; eptr++; } - - if (possessive) continue; - - while (eptr >= pp) + if (possessive) continue; /* No backtracking */ + for (;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM25); eptr--; if (rrc != MATCH_NOMATCH) RRETURN(rrc); } - RRETURN(MATCH_NOMATCH); + /* Control never gets here */ } - /* Control never gets here */ } /* Caseful comparisons (includes all multi-byte characters) */ @@ -3447,7 +3583,7 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (fc != *eptr++) RRETURN(MATCH_NOMATCH); + if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH); } if (min == max) continue; @@ -3464,7 +3600,7 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (fc != *eptr++) RRETURN(MATCH_NOMATCH); + if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH); } /* Control never gets here */ } @@ -3478,18 +3614,18 @@ for (;;) SCHECK_PARTIAL(); break; } - if (fc != *eptr) break; + if (fc != RAWUCHARTEST(eptr)) break; eptr++; } - if (possessive) continue; - - while (eptr >= pp) + if (possessive) continue; /* No backtracking */ + for (;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM27); eptr--; if (rrc != MATCH_NOMATCH) RRETURN(rrc); } - RRETURN(MATCH_NOMATCH); + /* Control never gets here */ } } /* Control never gets here */ @@ -3504,33 +3640,41 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - ecode++; - GETCHARINCTEST(c, eptr); - if (op == OP_NOTI) /* The caseless case */ - { - register unsigned int ch, och; - ch = *ecode++; -#ifdef COMPILE_PCRE8 - /* ch must be < 128 if UTF is enabled. */ - och = md->fcc[ch]; -#else #ifdef SUPPORT_UTF + if (utf) + { + register pcre_uint32 ch, och; + + ecode++; + GETCHARINC(ch, ecode); + GETCHARINC(c, eptr); + + if (op == OP_NOT) + { + if (ch == c) RRETURN(MATCH_NOMATCH); + } + else + { #ifdef SUPPORT_UCP - if (utf && ch > 127) - och = UCD_OTHERCASE(ch); + if (ch > 127) + och = UCD_OTHERCASE(ch); #else - if (utf && ch > 127) - och = ch; + if (ch > 127) + och = ch; #endif /* SUPPORT_UCP */ - else -#endif /* SUPPORT_UTF */ - och = TABLE_GET(ch, md->fcc, ch); -#endif /* COMPILE_PCRE8 */ - if (ch == c || och == c) RRETURN(MATCH_NOMATCH); + else + och = TABLE_GET(ch, md->fcc, ch); + if (ch == c || och == c) RRETURN(MATCH_NOMATCH); + } } - else /* Caseful */ + else +#endif { - if (*ecode++ == c) RRETURN(MATCH_NOMATCH); + register pcre_uint32 ch = ecode[1]; + c = *eptr++; + if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c)) + RRETURN(MATCH_NOMATCH); + ecode += 2; } break; @@ -3610,7 +3754,7 @@ for (;;) /* Common code for all repeated single-byte matches. */ REPEATNOTCHAR: - fc = *ecode++; + GETCHARINCTEST(fc, ecode); /* The code is duplicated for the caseless and caseful cases, for speed, since matching characters is likely to be quite common. First, ensure the @@ -3621,14 +3765,10 @@ for (;;) characters and work backwards. */ DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max, - max, eptr)); + max, (char *)eptr)); if (op >= OP_NOTSTARI) /* Caseless */ { -#ifdef COMPILE_PCRE8 - /* fc must be < 128 if UTF is enabled. */ - foc = md->fcc[fc]; -#else #ifdef SUPPORT_UTF #ifdef SUPPORT_UCP if (utf && fc > 127) @@ -3640,12 +3780,11 @@ for (;;) else #endif /* SUPPORT_UTF */ foc = TABLE_GET(fc, md->fcc, fc); -#endif /* COMPILE_PCRE8 */ #ifdef SUPPORT_UTF if (utf) { - register unsigned int d; + register pcre_uint32 d; for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) @@ -3654,11 +3793,11 @@ for (;;) RRETURN(MATCH_NOMATCH); } GETCHARINC(d, eptr); - if (fc == d || (unsigned int) foc == d) RRETURN(MATCH_NOMATCH); + if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH); } } else -#endif +#endif /* SUPPORT_UTF */ /* Not UTF mode */ { for (i = 1; i <= min; i++) @@ -3680,7 +3819,7 @@ for (;;) #ifdef SUPPORT_UTF if (utf) { - register unsigned int d; + register pcre_uint32 d; for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, eptrb, RM28); @@ -3696,7 +3835,7 @@ for (;;) } } else -#endif +#endif /*SUPPORT_UTF */ /* Not UTF mode */ { for (fi = min;; fi++) @@ -3725,7 +3864,7 @@ for (;;) #ifdef SUPPORT_UTF if (utf) { - register unsigned int d; + register pcre_uint32 d; for (i = min; i < max; i++) { int len = 1; @@ -3738,17 +3877,18 @@ for (;;) if (fc == d || (unsigned int)foc == d) break; eptr += len; } - if (possessive) continue; + if (possessive) continue; /* No backtracking */ for(;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM30); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ + eptr--; BACKCHAR(eptr); } } else -#endif +#endif /* SUPPORT_UTF */ /* Not UTF mode */ { for (i = min; i < max; i++) @@ -3761,18 +3901,17 @@ for (;;) if (fc == *eptr || foc == *eptr) break; eptr++; } - if (possessive) continue; - while (eptr >= pp) + if (possessive) continue; /* No backtracking */ + for (;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM31); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr--; } } - - RRETURN(MATCH_NOMATCH); + /* Control never gets here */ } - /* Control never gets here */ } /* Caseful comparisons */ @@ -3782,7 +3921,7 @@ for (;;) #ifdef SUPPORT_UTF if (utf) { - register unsigned int d; + register pcre_uint32 d; for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) @@ -3816,7 +3955,7 @@ for (;;) #ifdef SUPPORT_UTF if (utf) { - register unsigned int d; + register pcre_uint32 d; for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, eptrb, RM32); @@ -3860,7 +3999,7 @@ for (;;) #ifdef SUPPORT_UTF if (utf) { - register unsigned int d; + register pcre_uint32 d; for (i = min; i < max; i++) { int len = 1; @@ -3873,12 +4012,13 @@ for (;;) if (fc == d) break; eptr += len; } - if (possessive) continue; + if (possessive) continue; /* No backtracking */ for(;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM34); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ + eptr--; BACKCHAR(eptr); } } @@ -3896,16 +4036,16 @@ for (;;) if (fc == *eptr) break; eptr++; } - if (possessive) continue; - while (eptr >= pp) + if (possessive) continue; /* No backtracking */ + for (;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM35); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr--; } } - - RRETURN(MATCH_NOMATCH); + /* Control never gets here */ } } /* Control never gets here */ @@ -4087,7 +4227,12 @@ for (;;) } break; + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. */ + case PT_SPACE: /* Perl space */ + case PT_PXSPACE: /* POSIX space */ for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) @@ -4096,43 +4241,72 @@ for (;;) RRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); - if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL || - c == CHAR_FF || c == CHAR_CR) - == prop_fail_result) - RRETURN(MATCH_NOMATCH); + switch(c) + { + HSPACE_CASES: + VSPACE_CASES: + if (prop_fail_result) RRETURN(MATCH_NOMATCH); + break; + + default: + if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + break; + } } break; - case PT_PXSPACE: /* POSIX space */ + case PT_WORD: for (i = 1; i <= min; i++) { + int category; if (eptr >= md->end_subject) { SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); - if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL || - c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) + category = UCD_CATEGORY(c); + if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE) == prop_fail_result) RRETURN(MATCH_NOMATCH); } break; - case PT_WORD: + case PT_CLIST: for (i = 1; i <= min; i++) { - int category; + const pcre_uint32 *cp; if (eptr >= md->end_subject) { SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); - category = UCD_CATEGORY(c); - if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE) - == prop_fail_result) + cp = PRIV(ucd_caseless_sets) + prop_value; + for (;;) + { + if (c < *cp) + { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } } + if (c == *cp++) + { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; } + } + } + break; + + case PT_UCNC: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(c, eptr); + if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000) == prop_fail_result) + RRETURN(MATCH_NOMATCH); } break; @@ -4155,15 +4329,22 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - GETCHARINCTEST(c, eptr); - if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH); - while (eptr < md->end_subject) + else { - int len = 1; - if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } - if (UCD_CATEGORY(c) != ucp_M) break; - eptr += len; + int lgb, rgb; + GETCHARINCTEST(c, eptr); + lgb = UCD_GRAPHBREAK(c); + while (eptr < md->end_subject) + { + int len = 1; + if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } + rgb = UCD_GRAPHBREAK(c); + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; + lgb = rgb; + eptr += len; + } } + CHECK_PARTIAL(); } } @@ -4184,6 +4365,15 @@ for (;;) RRETURN(MATCH_NOMATCH); } if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); + if (md->partial != 0 && + eptr + 1 >= md->end_subject && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + RAWUCHAR(eptr) == NLBLOCK->nl[0]) + { + md->hitend = TRUE; + if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); + } eptr++; ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); } @@ -4220,18 +4410,20 @@ for (;;) { default: RRETURN(MATCH_NOMATCH); - case 0x000d: - if (eptr < md->end_subject && *eptr == 0x0a) eptr++; + case CHAR_CR: + if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++; break; - case 0x000a: + case CHAR_LF: break; - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } @@ -4249,27 +4441,8 @@ for (;;) GETCHARINC(c, eptr); switch(c) { + HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */ default: break; - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ - RRETURN(MATCH_NOMATCH); } } break; @@ -4285,27 +4458,8 @@ for (;;) GETCHARINC(c, eptr); switch(c) { + HSPACE_CASES: break; /* Byte and multibyte cases */ default: RRETURN(MATCH_NOMATCH); - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ - break; } } break; @@ -4321,15 +4475,8 @@ for (;;) GETCHARINC(c, eptr); switch(c) { + VSPACE_CASES: RRETURN(MATCH_NOMATCH); default: break; - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ - RRETURN(MATCH_NOMATCH); } } break; @@ -4345,15 +4492,8 @@ for (;;) GETCHARINC(c, eptr); switch(c) { + VSPACE_CASES: break; default: RRETURN(MATCH_NOMATCH); - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ - break; } } break; @@ -4375,12 +4515,14 @@ for (;;) case OP_DIGIT: for (i = 1; i <= min; i++) { + pcre_uint32 cc; if (eptr >= md->end_subject) { SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0) + cc = RAWUCHAR(eptr); + if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); eptr++; /* No need to skip more bytes - we know it's a 1-byte character */ @@ -4390,12 +4532,14 @@ for (;;) case OP_NOT_WHITESPACE: for (i = 1; i <= min; i++) { + pcre_uint32 cc; if (eptr >= md->end_subject) { SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0) + cc = RAWUCHAR(eptr); + if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); eptr++; ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); @@ -4405,12 +4549,14 @@ for (;;) case OP_WHITESPACE: for (i = 1; i <= min; i++) { + pcre_uint32 cc; if (eptr >= md->end_subject) { SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0) + cc = RAWUCHAR(eptr); + if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); eptr++; /* No need to skip more bytes - we know it's a 1-byte character */ @@ -4420,12 +4566,14 @@ for (;;) case OP_NOT_WORDCHAR: for (i = 1; i <= min; i++) { + pcre_uint32 cc; if (eptr >= md->end_subject) { SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0) + cc = RAWUCHAR(eptr); + if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); eptr++; ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); @@ -4435,12 +4583,14 @@ for (;;) case OP_WORDCHAR: for (i = 1; i <= min; i++) { + pcre_uint32 cc; if (eptr >= md->end_subject) { SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0) + cc = RAWUCHAR(eptr); + if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); eptr++; /* No need to skip more bytes - we know it's a 1-byte character */ @@ -4468,6 +4618,15 @@ for (;;) RRETURN(MATCH_NOMATCH); } if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); + if (md->partial != 0 && + eptr + 1 >= md->end_subject && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + *eptr == NLBLOCK->nl[0]) + { + md->hitend = TRUE; + if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); + } eptr++; } break; @@ -4502,17 +4661,17 @@ for (;;) { default: RRETURN(MATCH_NOMATCH); - case 0x000d: - if (eptr < md->end_subject && *eptr == 0x0a) eptr++; + case CHAR_CR: + if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++; break; - case 0x000a: + case CHAR_LF: break; - case 0x000b: - case 0x000c: - case 0x0085: -#ifdef COMPILE_PCRE16 + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 case 0x2028: case 0x2029: #endif @@ -4533,26 +4692,9 @@ for (;;) switch(*eptr++) { default: break; - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ -#ifdef COMPILE_PCRE16 - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ + HSPACE_BYTE_CASES: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + HSPACE_MULTIBYTE_CASES: #endif RRETURN(MATCH_NOMATCH); } @@ -4570,26 +4712,9 @@ for (;;) switch(*eptr++) { default: RRETURN(MATCH_NOMATCH); - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ -#ifdef COMPILE_PCRE16 - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ + HSPACE_BYTE_CASES: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + HSPACE_MULTIBYTE_CASES: #endif break; } @@ -4606,17 +4731,12 @@ for (;;) } switch(*eptr++) { - default: break; - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ -#ifdef COMPILE_PCRE16 - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ + VSPACE_BYTE_CASES: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + VSPACE_MULTIBYTE_CASES: #endif RRETURN(MATCH_NOMATCH); + default: break; } } break; @@ -4632,14 +4752,9 @@ for (;;) switch(*eptr++) { default: RRETURN(MATCH_NOMATCH); - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ -#ifdef COMPILE_PCRE16 - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ + VSPACE_BYTE_CASES: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + VSPACE_MULTIBYTE_CASES: #endif break; } @@ -4857,10 +4972,15 @@ for (;;) } /* Control never gets here */ + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. */ + case PT_SPACE: /* Perl space */ + case PT_PXSPACE: /* POSIX space */ for (fi = min;; fi++) { - RMATCH(eptr, ecode, offset_top, md, eptrb, RM60); + RMATCH(eptr, ecode, offset_top, md, eptrb, RM61); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max) RRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) @@ -4869,17 +4989,26 @@ for (;;) RRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); - if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL || - c == CHAR_FF || c == CHAR_CR) - == prop_fail_result) - RRETURN(MATCH_NOMATCH); + switch(c) + { + HSPACE_CASES: + VSPACE_CASES: + if (prop_fail_result) RRETURN(MATCH_NOMATCH); + break; + + default: + if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + break; + } } /* Control never gets here */ - case PT_PXSPACE: /* POSIX space */ + case PT_WORD: for (fi = min;; fi++) { - RMATCH(eptr, ecode, offset_top, md, eptrb, RM61); + int category; + RMATCH(eptr, ecode, offset_top, md, eptrb, RM62); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max) RRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) @@ -4888,18 +5017,20 @@ for (;;) RRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); - if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL || - c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) + category = UCD_CATEGORY(c); + if ((category == ucp_L || + category == ucp_N || + c == CHAR_UNDERSCORE) == prop_fail_result) RRETURN(MATCH_NOMATCH); } /* Control never gets here */ - case PT_WORD: + case PT_CLIST: for (fi = min;; fi++) { - int category; - RMATCH(eptr, ecode, offset_top, md, eptrb, RM62); + const pcre_uint32 *cp; + RMATCH(eptr, ecode, offset_top, md, eptrb, RM67); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max) RRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) @@ -4908,17 +5039,37 @@ for (;;) RRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); - category = UCD_CATEGORY(c); - if ((category == ucp_L || - category == ucp_N || - c == CHAR_UNDERSCORE) - == prop_fail_result) + cp = PRIV(ucd_caseless_sets) + prop_value; + for (;;) + { + if (c < *cp) + { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } } + if (c == *cp++) + { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; } + } + } + /* Control never gets here */ + + case PT_UCNC: + for (fi = min;; fi++) + { + RMATCH(eptr, ecode, offset_top, md, eptrb, RM60); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(c, eptr); + if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000) == prop_fail_result) + RRETURN(MATCH_NOMATCH); } /* Control never gets here */ /* This should never occur */ - default: RRETURN(PCRE_ERROR_INTERNAL); } @@ -4939,15 +5090,22 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - GETCHARINCTEST(c, eptr); - if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH); - while (eptr < md->end_subject) + else { - int len = 1; - if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } - if (UCD_CATEGORY(c) != ucp_M) break; - eptr += len; + int lgb, rgb; + GETCHARINCTEST(c, eptr); + lgb = UCD_GRAPHBREAK(c); + while (eptr < md->end_subject) + { + int len = 1; + if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } + rgb = UCD_GRAPHBREAK(c); + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; + lgb = rgb; + eptr += len; + } } + CHECK_PARTIAL(); } } else @@ -4971,7 +5129,18 @@ for (;;) GETCHARINC(c, eptr); switch(ctype) { - case OP_ANY: /* This is the non-NL case */ + case OP_ANY: /* This is the non-NL case */ + if (md->partial != 0 && /* Take care with CRLF partial */ + eptr >= md->end_subject && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + c == NLBLOCK->nl[0]) + { + md->hitend = TRUE; + if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); + } + break; + case OP_ALLANY: case OP_ANYBYTE: break; @@ -4980,17 +5149,20 @@ for (;;) switch(c) { default: RRETURN(MATCH_NOMATCH); - case 0x000d: - if (eptr < md->end_subject && *eptr == 0x0a) eptr++; + case CHAR_CR: + if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++; break; - case 0x000a: + + case CHAR_LF: break; - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } @@ -4999,84 +5171,32 @@ for (;;) case OP_NOT_HSPACE: switch(c) { + HSPACE_CASES: RRETURN(MATCH_NOMATCH); default: break; - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ - RRETURN(MATCH_NOMATCH); } break; case OP_HSPACE: switch(c) { + HSPACE_CASES: break; default: RRETURN(MATCH_NOMATCH); - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ - break; } break; case OP_NOT_VSPACE: switch(c) { + VSPACE_CASES: RRETURN(MATCH_NOMATCH); default: break; - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ - RRETURN(MATCH_NOMATCH); } break; case OP_VSPACE: switch(c) { + VSPACE_CASES: break; default: RRETURN(MATCH_NOMATCH); - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ - break; } break; @@ -5134,7 +5254,18 @@ for (;;) c = *eptr++; switch(ctype) { - case OP_ANY: /* This is the non-NL case */ + case OP_ANY: /* This is the non-NL case */ + if (md->partial != 0 && /* Take care with CRLF partial */ + eptr >= md->end_subject && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + c == NLBLOCK->nl[0]) + { + md->hitend = TRUE; + if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); + } + break; + case OP_ALLANY: case OP_ANYBYTE: break; @@ -5143,17 +5274,17 @@ for (;;) switch(c) { default: RRETURN(MATCH_NOMATCH); - case 0x000d: - if (eptr < md->end_subject && *eptr == 0x0a) eptr++; + case CHAR_CR: + if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++; break; - case 0x000a: + case CHAR_LF: break; - case 0x000b: - case 0x000c: - case 0x0085: -#ifdef COMPILE_PCRE16 + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 case 0x2028: case 0x2029: #endif @@ -5166,26 +5297,9 @@ for (;;) switch(c) { default: break; - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ -#ifdef COMPILE_PCRE16 - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ + HSPACE_BYTE_CASES: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + HSPACE_MULTIBYTE_CASES: #endif RRETURN(MATCH_NOMATCH); } @@ -5195,26 +5309,9 @@ for (;;) switch(c) { default: RRETURN(MATCH_NOMATCH); - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ -#ifdef COMPILE_PCRE16 - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ + HSPACE_BYTE_CASES: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + HSPACE_MULTIBYTE_CASES: #endif break; } @@ -5224,14 +5321,9 @@ for (;;) switch(c) { default: break; - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ -#ifdef COMPILE_PCRE16 - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ + VSPACE_BYTE_CASES: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + VSPACE_MULTIBYTE_CASES: #endif RRETURN(MATCH_NOMATCH); } @@ -5241,14 +5333,9 @@ for (;;) switch(c) { default: RRETURN(MATCH_NOMATCH); - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ -#ifdef COMPILE_PCRE16 - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ + VSPACE_BYTE_CASES: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + VSPACE_MULTIBYTE_CASES: #endif break; } @@ -5397,7 +5484,12 @@ for (;;) } break; + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. */ + case PT_SPACE: /* Perl space */ + case PT_PXSPACE: /* POSIX space */ for (i = min; i < max; i++) { int len = 1; @@ -5407,17 +5499,27 @@ for (;;) break; } GETCHARLENTEST(c, eptr, len); - if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL || - c == CHAR_FF || c == CHAR_CR) - == prop_fail_result) + switch(c) + { + HSPACE_CASES: + VSPACE_CASES: + if (prop_fail_result) goto ENDLOOP99; /* Break the loop */ break; + + default: + if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result) + goto ENDLOOP99; /* Break the loop */ + break; + } eptr+= len; } + ENDLOOP99: break; - case PT_PXSPACE: /* POSIX space */ + case PT_WORD: for (i = min; i < max; i++) { + int category; int len = 1; if (eptr >= md->end_subject) { @@ -5425,18 +5527,18 @@ for (;;) break; } GETCHARLENTEST(c, eptr, len); - if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL || - c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) - == prop_fail_result) + category = UCD_CATEGORY(c); + if ((category == ucp_L || category == ucp_N || + c == CHAR_UNDERSCORE) == prop_fail_result) break; eptr+= len; } break; - case PT_WORD: + case PT_CLIST: for (i = min; i < max; i++) { - int category; + const pcre_uint32 *cp; int len = 1; if (eptr >= md->end_subject) { @@ -5444,11 +5546,34 @@ for (;;) break; } GETCHARLENTEST(c, eptr, len); - category = UCD_CATEGORY(c); - if ((category == ucp_L || category == ucp_N || - c == CHAR_UNDERSCORE) == prop_fail_result) + cp = PRIV(ucd_caseless_sets) + prop_value; + for (;;) + { + if (c < *cp) + { if (prop_fail_result) break; else goto GOT_MAX; } + if (c == *cp++) + { if (prop_fail_result) goto GOT_MAX; else break; } + } + eptr += len; + } + GOT_MAX: + break; + + case PT_UCNC: + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); break; - eptr+= len; + } + GETCHARLENTEST(c, eptr, len); + if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000) == prop_fail_result) + break; + eptr += len; } break; @@ -5458,59 +5583,85 @@ for (;;) /* eptr is now past the end of the maximum run */ - if (possessive) continue; + if (possessive) continue; /* No backtracking */ for(;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM44); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ + eptr--; if (utf) BACKCHAR(eptr); } } - /* Match extended Unicode sequences. We will get here only if the + /* Match extended Unicode grapheme clusters. We will get here only if the support is in the binary; otherwise a compile-time error occurs. */ else if (ctype == OP_EXTUNI) { for (i = min; i < max; i++) { - int len = 1; if (eptr >= md->end_subject) { SCHECK_PARTIAL(); break; } - if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } - if (UCD_CATEGORY(c) == ucp_M) break; - eptr += len; - while (eptr < md->end_subject) + else { - len = 1; - if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } - if (UCD_CATEGORY(c) != ucp_M) break; - eptr += len; + int lgb, rgb; + GETCHARINCTEST(c, eptr); + lgb = UCD_GRAPHBREAK(c); + while (eptr < md->end_subject) + { + int len = 1; + if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } + rgb = UCD_GRAPHBREAK(c); + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; + lgb = rgb; + eptr += len; + } } + CHECK_PARTIAL(); } /* eptr is now past the end of the maximum run */ - if (possessive) continue; + if (possessive) continue; /* No backtracking */ for(;;) { + int lgb, rgb; + PCRE_PUCHAR fptr; + + if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */ RMATCH(eptr, ecode, offset_top, md, eptrb, RM45); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ - for (;;) /* Move back over one extended */ + + /* Backtracking over an extended grapheme cluster involves inspecting + the previous two characters (if present) to see if a break is + permitted between them. */ + + eptr--; + if (!utf) c = *eptr; else { - if (!utf) c = *eptr; else + BACKCHAR(eptr); + GETCHAR(c, eptr); + } + rgb = UCD_GRAPHBREAK(c); + + for (;;) + { + if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */ + fptr = eptr - 1; + if (!utf) c = *fptr; else { - BACKCHAR(eptr); - GETCHAR(c, eptr); + BACKCHAR(fptr); + GETCHAR(c, fptr); } - if (UCD_CATEGORY(c) != ucp_M) break; - eptr--; + lgb = UCD_GRAPHBREAK(c); + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; + eptr = fptr; + rgb = lgb; } } } @@ -5534,6 +5685,15 @@ for (;;) break; } if (IS_NEWLINE(eptr)) break; + if (md->partial != 0 && /* Take care with CRLF partial */ + eptr + 1 >= md->end_subject && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + RAWUCHAR(eptr) == NLBLOCK->nl[0]) + { + md->hitend = TRUE; + if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); + } eptr++; ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); } @@ -5551,6 +5711,15 @@ for (;;) break; } if (IS_NEWLINE(eptr)) break; + if (md->partial != 0 && /* Take care with CRLF partial */ + eptr + 1 >= md->end_subject && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + RAWUCHAR(eptr) == NLBLOCK->nl[0]) + { + md->hitend = TRUE; + if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); + } eptr++; ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); } @@ -5600,17 +5769,20 @@ for (;;) break; } GETCHARLEN(c, eptr, len); - if (c == 0x000d) + if (c == CHAR_CR) { if (++eptr >= md->end_subject) break; - if (*eptr == 0x000a) eptr++; + if (RAWUCHAR(eptr) == CHAR_LF) eptr++; } else { - if (c != 0x000a && + if (c != CHAR_LF && (md->bsr_anycrlf || - (c != 0x000b && c != 0x000c && - c != 0x0085 && c != 0x2028 && c != 0x2029))) + (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL +#ifndef EBCDIC + && c != 0x2028 && c != 0x2029 +#endif /* Not EBCDIC */ + ))) break; eptr += len; } @@ -5631,28 +5803,8 @@ for (;;) GETCHARLEN(c, eptr, len); switch(c) { + HSPACE_CASES: gotspace = TRUE; break; default: gotspace = FALSE; break; - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ - gotspace = TRUE; - break; } if (gotspace == (ctype == OP_NOT_HSPACE)) break; eptr += len; @@ -5673,16 +5825,8 @@ for (;;) GETCHARLEN(c, eptr, len); switch(c) { + VSPACE_CASES: gotspace = TRUE; break; default: gotspace = FALSE; break; - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ - gotspace = TRUE; - break; } if (gotspace == (ctype == OP_NOT_VSPACE)) break; eptr += len; @@ -5783,21 +5927,16 @@ for (;;) RRETURN(PCRE_ERROR_INTERNAL); } - /* eptr is now past the end of the maximum run. If possessive, we are - done (no backing up). Otherwise, match at this position; anything other - than no match is immediately returned. For nomatch, back up one - character, unless we are matching \R and the last thing matched was - \r\n, in which case, back up two bytes. */ - - if (possessive) continue; + if (possessive) continue; /* No backtracking */ for(;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM46); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ + eptr--; BACKCHAR(eptr); - if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' && - eptr[-1] == '\r') eptr--; + if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL && + RAWUCHAR(eptr - 1) == CHAR_CR) eptr--; } } else @@ -5815,6 +5954,15 @@ for (;;) break; } if (IS_NEWLINE(eptr)) break; + if (md->partial != 0 && /* Take care with CRLF partial */ + eptr + 1 >= md->end_subject && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + *eptr == NLBLOCK->nl[0]) + { + md->hitend = TRUE; + if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); + } eptr++; } break; @@ -5839,19 +5987,19 @@ for (;;) break; } c = *eptr; - if (c == 0x000d) + if (c == CHAR_CR) { if (++eptr >= md->end_subject) break; - if (*eptr == 0x000a) eptr++; + if (*eptr == CHAR_LF) eptr++; } else { - if (c != 0x000a && (md->bsr_anycrlf || - (c != 0x000b && c != 0x000c && c != 0x0085 -#ifdef COMPILE_PCRE16 - && c != 0x2028 && c != 0x2029 + if (c != CHAR_LF && (md->bsr_anycrlf || + (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + && c != 0x2028 && c != 0x2029 #endif - ))) break; + ))) break; eptr++; } } @@ -5865,15 +6013,17 @@ for (;;) SCHECK_PARTIAL(); break; } - c = *eptr; - if (c == 0x09 || c == 0x20 || c == 0xa0 -#ifdef COMPILE_PCRE16 - || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A) - || c == 0x202f || c == 0x205f || c == 0x3000 + switch(*eptr) + { + default: eptr++; break; + HSPACE_BYTE_CASES: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + HSPACE_MULTIBYTE_CASES: #endif - ) break; - eptr++; + goto ENDLOOP00; + } } + ENDLOOP00: break; case OP_HSPACE: @@ -5884,15 +6034,17 @@ for (;;) SCHECK_PARTIAL(); break; } - c = *eptr; - if (c != 0x09 && c != 0x20 && c != 0xa0 -#ifdef COMPILE_PCRE16 - && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A) - && c != 0x202f && c != 0x205f && c != 0x3000 + switch(*eptr) + { + default: goto ENDLOOP01; + HSPACE_BYTE_CASES: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + HSPACE_MULTIBYTE_CASES: #endif - ) break; - eptr++; + eptr++; break; + } } + ENDLOOP01: break; case OP_NOT_VSPACE: @@ -5903,14 +6055,17 @@ for (;;) SCHECK_PARTIAL(); break; } - c = *eptr; - if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85 -#ifdef COMPILE_PCRE16 - || c == 0x2028 || c == 0x2029 + switch(*eptr) + { + default: eptr++; break; + VSPACE_BYTE_CASES: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + VSPACE_MULTIBYTE_CASES: #endif - ) break; - eptr++; + goto ENDLOOP02; + } } + ENDLOOP02: break; case OP_VSPACE: @@ -5921,14 +6076,17 @@ for (;;) SCHECK_PARTIAL(); break; } - c = *eptr; - if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85 -#ifdef COMPILE_PCRE16 - && c != 0x2028 && c != 0x2029 + switch(*eptr) + { + default: goto ENDLOOP03; + VSPACE_BYTE_CASES: +#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 + VSPACE_MULTIBYTE_CASES: #endif - ) break; - eptr++; + eptr++; break; + } } + ENDLOOP03: break; case OP_NOT_DIGIT: @@ -6013,28 +6171,20 @@ for (;;) RRETURN(PCRE_ERROR_INTERNAL); } - /* eptr is now past the end of the maximum run. If possessive, we are - done (no backing up). Otherwise, match at this position; anything other - than no match is immediately returned. For nomatch, back up one - character (byte), unless we are matching \R and the last thing matched - was \r\n, in which case, back up two bytes. */ - - if (possessive) continue; - while (eptr >= pp) + if (possessive) continue; /* No backtracking */ + for (;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM47); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr--; - if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' && - eptr[-1] == '\r') eptr--; + if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF && + eptr[-1] == CHAR_CR) eptr--; } } - /* Get here if we can't make it match with any permitted repetitions */ - - RRETURN(MATCH_NOMATCH); + /* Control never gets here */ } - /* Control never gets here */ /* There's been some horrible disaster. Arrival here can only mean there is something seriously wrong in the code above or the OP_xxx definitions. */ @@ -6068,22 +6218,19 @@ switch (frame->Xwhere) LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64) LBL(65) LBL(66) #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 - LBL(21) + LBL(20) LBL(21) #endif #ifdef SUPPORT_UTF - LBL(16) LBL(18) LBL(20) + LBL(16) LBL(18) LBL(22) LBL(23) LBL(28) LBL(30) LBL(32) LBL(34) LBL(42) LBL(46) #ifdef SUPPORT_UCP LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45) - LBL(59) LBL(60) LBL(61) LBL(62) + LBL(59) LBL(60) LBL(61) LBL(62) LBL(67) #endif /* SUPPORT_UCP */ #endif /* SUPPORT_UTF */ default: DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere)); - -printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere); - return PCRE_ERROR_INTERNAL; } #undef LBL @@ -6145,7 +6292,32 @@ Undefine all the macros that were defined above to han ***************************************************************************/ +#ifdef NO_RECURSE +/************************************************* +* Release allocated heap frames * +*************************************************/ +/* This function releases all the allocated frames. The base frame is on the +machine stack, and so must not be freed. + +Argument: the address of the base frame +Returns: nothing +*/ + +static void +release_match_heapframes (heapframe *frame_base) +{ +heapframe *nextframe = frame_base->Xnextframe; +while (nextframe != NULL) + { + heapframe *oldframe = nextframe; + nextframe = nextframe->Xnextframe; + (PUBL(stack_free))(oldframe); + } +} +#endif + + /************************************************* * Execute a Regular Expression * *************************************************/ @@ -6170,16 +6342,21 @@ Returns: > 0 => success; value is the number < -1 => some kind of unexpected problem */ -#ifdef COMPILE_PCRE8 +#if defined COMPILE_PCRE8 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_exec(const pcre *argument_re, const pcre_extra *extra_data, PCRE_SPTR subject, int length, int start_offset, int options, int *offsets, int offsetcount) -#else +#elif defined COMPILE_PCRE16 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data, PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets, int offsetcount) +#elif defined COMPILE_PCRE32 +PCRE_EXP_DEFN int PCRE_CALL_CONVENTION +pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data, + PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets, + int offsetcount) #endif { int rc, ocount, arg_offset_max; @@ -6202,18 +6379,28 @@ const pcre_uint8 *start_bits = NULL; PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset; PCRE_PUCHAR end_subject; PCRE_PUCHAR start_partial = NULL; +PCRE_PUCHAR match_partial = NULL; PCRE_PUCHAR req_char_ptr = start_match - 1; const pcre_study_data *study; const REAL_PCRE *re = (const REAL_PCRE *)argument_re; +#ifdef NO_RECURSE +heapframe frame_zero; +frame_zero.Xprevframe = NULL; /* Marks the top level */ +frame_zero.Xnextframe = NULL; /* None are allocated yet */ +md->match_frames_base = &frame_zero; +#endif + /* Check for the special magic call that measures the size of the stack used -per recursive call of match(). */ +per recursive call of match(). Without the funny casting for sizeof, a Windows +compiler gave this error: "unary minus operator applied to unsigned type, +result still unsigned". Hopefully the cast fixes that. */ if (re == NULL && extra_data == NULL && subject == NULL && length == -999 && start_offset == -999) #ifdef NO_RECURSE - return -sizeof(heapframe); + return -((int)sizeof(heapframe)); #else return match(NULL, NULL, NULL, 0, NULL, NULL, 0); #endif @@ -6224,6 +6411,7 @@ if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; +if (length < 0) return PCRE_ERROR_BADLENGTH; if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET; /* Check that the first field in the block is the magic number. If it is not, @@ -6261,39 +6449,45 @@ if (utf && (options & PCRE_NO_UTF8_CHECK) == 0) offsets[0] = erroroffset; offsets[1] = errorcode; } -#ifdef COMPILE_PCRE16 - return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)? - PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16; -#else +#if defined COMPILE_PCRE8 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)? PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8; +#elif defined COMPILE_PCRE16 + return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)? + PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16; +#elif defined COMPILE_PCRE32 + return PCRE_ERROR_BADUTF32; #endif } - +#if defined COMPILE_PCRE8 || defined COMPILE_PCRE16 /* Check that a start_offset points to the start of a UTF character. */ if (start_offset > 0 && start_offset < length && NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset])) return PCRE_ERROR_BADUTF8_OFFSET; +#endif } #endif /* If the pattern was successfully studied with JIT support, run the JIT executable instead of the rest of this function. Most options must be set at compile time for the JIT code to be usable. Fallback to the normal code path if -an unsupported flag is set. In particular, JIT does not support partial -matching. */ +an unsupported flag is set. */ #ifdef SUPPORT_JIT if (extra_data != NULL - && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 + && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT | + PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT && extra_data->executable_jit != NULL - && (extra_data->flags & PCRE_EXTRA_TABLES) == 0 - && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL | - PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0) - return PRIV(jit_exec)(re, extra_data->executable_jit, - (const pcre_uchar *)subject, length, start_offset, options, - ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0) - ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount); + && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0) + { + rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length, + start_offset, options, offsets, offsetcount); + + /* PCRE_ERROR_NULL means that the selected normal or partial matching + mode is not compiled. In this case we simply fallback to interpreter. */ + + if (rc != PCRE_ERROR_JIT_BADOPTION) return rc; + } #endif /* Carry on with non-JIT matching. This information is for finding all the @@ -6315,6 +6509,8 @@ md->callout_data = NULL; tables = re->tables; +/* The two limit values override the defaults, whatever their value. */ + if (extra_data != NULL) { register unsigned int flags = extra_data->flags; @@ -6329,6 +6525,15 @@ if (extra_data != NULL) if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables; } +/* Limits in the regex override only if they are smaller. */ + +if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit) + md->match_limit = re->limit_match; + +if ((re->flags & PCRE_RLSET) != 0 && + re->limit_recursion < md->match_limit_recursion) + md->match_limit_recursion = re->limit_recursion; + /* If the exec call supplied NULL for tables, use the inbuilt ones. This is a feature that makes it possible to save compiled regex and re-use them in other programs later. */ @@ -6354,7 +6559,7 @@ end_subject = md->end_subject; md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; md->use_ucp = (re->options & PCRE_UCP) != 0; md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0; -md->ignore_skip_arg = FALSE; +md->ignore_skip_arg = 0; /* Some options are unpacked into BOOL variables in the hope that testing them will be faster than individual option bits. */ @@ -6464,11 +6669,9 @@ if (re->top_backref > 0 && re->top_backref >= ocount/3 DPRINTF(("Got memory to hold back references\n")); } else md->offset_vector = offsets; - md->offset_end = ocount; md->offset_max = (2*ocount)/3; -md->offset_overflow = FALSE; -md->capture_last = -1; +md->capture_last = 0; /* Reset the working variable associated with each extraction. These should never be used unless previously set, but they get saved and restored, and so we @@ -6576,12 +6779,14 @@ for(;;) if (has_first_char) { + pcre_uchar smc; + if (first_char != first_char2) while (start_match < end_subject && - *start_match != first_char && *start_match != first_char2) + (smc = RAWUCHARTEST(start_match)) != first_char && smc != first_char2) start_match++; else - while (start_match < end_subject && *start_match != first_char) + while (start_match < end_subject && RAWUCHARTEST(start_match) != first_char) start_match++; } @@ -6613,7 +6818,7 @@ for(;;) if (start_match[-1] == CHAR_CR && (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && start_match < end_subject && - *start_match == CHAR_NL) + RAWUCHARTEST(start_match) == CHAR_NL) start_match++; } } @@ -6624,7 +6829,7 @@ for(;;) { while (start_match < end_subject) { - register unsigned int c = *start_match; + register pcre_uint32 c = RAWUCHARTEST(start_match); #ifndef COMPILE_PCRE8 if (c > 255) c = 255; #endif @@ -6692,7 +6897,7 @@ for(;;) { while (p < end_subject) { - register int pp = *p++; + register pcre_uint32 pp = RAWUCHARINCTEST(p); if (pp == req_char || pp == req_char2) { p--; break; } } } @@ -6700,7 +6905,7 @@ for(;;) { while (p < end_subject) { - if (*p++ == req_char) { p--; break; } + if (RAWUCHARINCTEST(p) == req_char) { p--; break; } } } @@ -6736,8 +6941,13 @@ for(;;) md->match_call_count = 0; md->match_function_type = 0; md->end_offset_top = 0; + md->skip_arg_count = 0; rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0); - if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr; + if (md->hitend && start_partial == NULL) + { + start_partial = md->start_used_ptr; + match_partial = start_match; + } switch(rc) { @@ -6750,14 +6960,14 @@ for(;;) case MATCH_SKIP_ARG: new_start_match = start_match; - md->ignore_skip_arg = TRUE; + md->ignore_skip_arg = md->skip_arg_count; break; - /* SKIP passes back the next starting point explicitly, but if it is the - same as the match we have just done, treat it as NOMATCH. */ + /* SKIP passes back the next starting point explicitly, but if it is no + greater than the match we have just done, treat it as NOMATCH. */ case MATCH_SKIP: - if (md->start_match_ptr != start_match) + if (md->start_match_ptr > start_match) { new_start_match = md->start_match_ptr; break; @@ -6765,12 +6975,12 @@ for(;;) /* Fall through */ /* NOMATCH and PRUNE advance by one character. THEN at this level acts - exactly like PRUNE. Unset the ignore SKIP-with-argument flag. */ + exactly like PRUNE. Unset ignore SKIP-with-argument. */ case MATCH_NOMATCH: case MATCH_PRUNE: case MATCH_THEN: - md->ignore_skip_arg = FALSE; + md->ignore_skip_arg = 0; new_start_match = start_match + 1; #ifdef SUPPORT_UTF if (utf) @@ -6863,7 +7073,7 @@ if (rc == MATCH_MATCH || rc == MATCH_ACCEPT) (arg_offset_max - 2) * sizeof(int)); DPRINTF(("Copied offsets from temporary memory\n")); } - if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE; + if (md->end_offset_top > arg_offset_max) md->capture_last |= OVFLBIT; DPRINTF(("Freeing temporary memory\n")); (PUBL(free))(md->offset_vector); } @@ -6871,7 +7081,8 @@ if (rc == MATCH_MATCH || rc == MATCH_ACCEPT) /* Set the return code to the number of captured strings, or 0 if there were too many to fit into the vector. */ - rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)? + rc = ((md->capture_last & OVFLBIT) != 0 && + md->end_offset_top >= arg_offset_max)? 0 : md->end_offset_top/2; /* If there is space in the offset vector, set any unused pairs at the end of @@ -6887,7 +7098,7 @@ if (rc == MATCH_MATCH || rc == MATCH_ACCEPT) { register int *iptr, *iend; int resetcount = 2 + re->top_bracket * 2; - if (resetcount > offsetcount) resetcount = ocount; + if (resetcount > offsetcount) resetcount = offsetcount; iptr = offsets + md->end_offset_top; iend = offsets + resetcount; while (iptr < iend) *iptr++ = -1; @@ -6908,6 +7119,9 @@ if (rc == MATCH_MATCH || rc == MATCH_ACCEPT) if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0) *(extra_data->mark) = (pcre_uchar *)md->mark; DPRINTF((">>>> returning %d\n", rc)); +#ifdef NO_RECURSE + release_match_heapframes(&frame_zero); +#endif return rc; } @@ -6925,12 +7139,15 @@ if (using_temporary_offsets) if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL) { DPRINTF((">>>> error: returning %d\n", rc)); +#ifdef NO_RECURSE + release_match_heapframes(&frame_zero); +#endif return rc; } /* Handle partial matches - disable any mark data */ -if (start_partial != NULL) +if (match_partial != NULL) { DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); md->mark = NULL; @@ -6938,6 +7155,8 @@ if (start_partial != NULL) { offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject); offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject); + if (offsetcount > 2) + offsets[2] = (int)(match_partial - (PCRE_PUCHAR)subject); } rc = PCRE_ERROR_PARTIAL; } @@ -6954,6 +7173,9 @@ else if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0) *(extra_data->mark) = (pcre_uchar *)md->nomatch_mark; +#ifdef NO_RECURSE + release_match_heapframes(&frame_zero); +#endif return rc; }