--- embedaddon/pcre/pcre_dfa_exec.c 2012/10/09 09:19:17 1.1.1.3 +++ embedaddon/pcre/pcre_dfa_exec.c 2014/06/15 19:46:04 1.1.1.5 @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the below for why this module is different). Written by Philip Hazel - Copyright (c) 1997-2012 University of Cambridge + Copyright (c) 1997-2013 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -120,7 +120,7 @@ static const pcre_uint8 coptable[] = { 0, 0, /* \P, \p */ 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */ 0, /* \X */ - 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */ + 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */ 1, /* Char */ 1, /* Chari */ 1, /* not */ @@ -151,11 +151,14 @@ static const pcre_uint8 coptable[] = { /* Character class & ref repeats */ 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */ 0, 0, /* CRRANGE, CRMINRANGE */ + 0, 0, 0, 0, /* Possessive *+, ++, ?+, CRPOSRANGE */ 0, /* CLASS */ 0, /* NCLASS */ 0, /* XCLASS - variable length */ 0, /* REF */ 0, /* REFI */ + 0, /* DNREF */ + 0, /* DNREFI */ 0, /* RECURSE */ 0, /* CALLOUT */ 0, /* Alt */ @@ -171,8 +174,8 @@ static const pcre_uint8 coptable[] = { 0, 0, /* ONCE, ONCE_NC */ 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */ 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */ - 0, 0, /* CREF, NCREF */ - 0, 0, /* RREF, NRREF */ + 0, 0, /* CREF, DNCREF */ + 0, 0, /* RREF, DNRREF */ 0, /* DEF */ 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */ @@ -194,7 +197,7 @@ static const pcre_uint8 poptable[] = { 1, 1, /* \P, \p */ 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ 1, /* \X */ - 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */ + 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */ 1, /* Char */ 1, /* Chari */ 1, /* not */ @@ -220,11 +223,14 @@ static const pcre_uint8 poptable[] = { /* Character class & ref repeats */ 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ 1, 1, /* CRRANGE, CRMINRANGE */ + 1, 1, 1, 1, /* Possessive *+, ++, ?+, CRPOSRANGE */ 1, /* CLASS */ 1, /* NCLASS */ 1, /* XCLASS - variable length */ 0, /* REF */ 0, /* REFI */ + 0, /* DNREF */ + 0, /* DNREFI */ 0, /* RECURSE */ 0, /* CALLOUT */ 0, /* Alt */ @@ -240,8 +246,8 @@ static const pcre_uint8 poptable[] = { 0, 0, /* ONCE, ONCE_NC */ 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */ 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */ - 0, 0, /* CREF, NCREF */ - 0, 0, /* RREF, NRREF */ + 0, 0, /* CREF, DNCREF */ + 0, 0, /* RREF, DNRREF */ 0, /* DEF */ 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */ @@ -302,13 +308,13 @@ Returns: nothing static void pchars(const pcre_uchar *p, int length, FILE *f) { -int c; +pcre_uint32 c; while (length-- > 0) { if (isprint(c = *(p++))) fprintf(f, "%c", c); else - fprintf(f, "\\x%02x", c); + fprintf(f, "\\x{%02x}", c); } } #endif @@ -571,7 +577,7 @@ for (;;) { int i, j; int clen, dlen; - unsigned int c, d; + pcre_uint32 c, d; int forced_fail = 0; BOOL partial_newline = FALSE; BOOL could_continue = reset_could_continue; @@ -613,9 +619,10 @@ for (;;) { clen = 1; /* Number of data items in the character */ #ifdef SUPPORT_UTF - if (utf) { GETCHARLEN(c, ptr, clen); } else -#endif /* SUPPORT_UTF */ + GETCHARLENTEST(c, ptr, clen); +#else c = *ptr; +#endif /* SUPPORT_UTF */ } else { @@ -634,7 +641,8 @@ for (;;) BOOL caseless = FALSE; const pcre_uchar *code; int state_offset = current_state->offset; - int count, codevalue, rrc; + int codevalue, rrc; + int count; #ifdef PCRE_DEBUG printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset); @@ -1007,7 +1015,7 @@ for (;;) { const pcre_uchar *temp = ptr - 1; if (temp < md->start_used_ptr) md->start_used_ptr = temp; -#ifdef SUPPORT_UTF +#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 if (utf) { BACKCHAR(temp); } #endif GETCHARTEST(d, temp); @@ -1060,6 +1068,7 @@ for (;;) if (clen > 0) { BOOL OK; + const pcre_uint32 *cp; const ucd_record * prop = GET_UCD(c); switch(code[1]) { @@ -1091,15 +1100,23 @@ for (;;) PRIV(ucp_gentype)[prop->chartype] == ucp_N; break; - case PT_SPACE: /* Perl space */ - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || - c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; - break; + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. */ + case PT_SPACE: /* Perl space */ case PT_PXSPACE: /* POSIX space */ - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || - c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || - c == CHAR_FF || c == CHAR_CR; + switch(c) + { + HSPACE_CASES: + VSPACE_CASES: + OK = TRUE; + break; + + default: + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; + break; + } break; case PT_WORD: @@ -1108,6 +1125,21 @@ for (;;) c == CHAR_UNDERSCORE; break; + case PT_CLIST: + cp = PRIV(ucd_caseless_sets) + code[2]; + for (;;) + { + if (c < *cp) { OK = FALSE; break; } + if (c == *cp++) { OK = TRUE; break; } + } + break; + + case PT_UCNC: + OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000; + break; + /* Should never occur, but keep compilers from grumbling. */ default: @@ -1237,7 +1269,7 @@ for (;;) (d != OP_ANY || !IS_NEWLINE(ptr)) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) { - if (++count >= GET2(code, 1)) + if (++count >= (int)GET2(code, 1)) { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); } else { ADD_NEW(state_offset, count); } @@ -1271,7 +1303,7 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - if (++count >= GET2(code, 1)) + if (++count >= (int)GET2(code, 1)) { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); } else { ADD_NEW(state_offset, count); } @@ -1294,6 +1326,7 @@ for (;;) if (clen > 0) { BOOL OK; + const pcre_uint32 *cp; const ucd_record * prop = GET_UCD(c); switch(code[2]) { @@ -1325,15 +1358,23 @@ for (;;) PRIV(ucp_gentype)[prop->chartype] == ucp_N; break; - case PT_SPACE: /* Perl space */ - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || - c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; - break; + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. */ + case PT_SPACE: /* Perl space */ case PT_PXSPACE: /* POSIX space */ - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || - c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || - c == CHAR_FF || c == CHAR_CR; + switch(c) + { + HSPACE_CASES: + VSPACE_CASES: + OK = TRUE; + break; + + default: + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; + break; + } break; case PT_WORD: @@ -1342,6 +1383,21 @@ for (;;) c == CHAR_UNDERSCORE; break; + case PT_CLIST: + cp = PRIV(ucd_caseless_sets) + code[3]; + for (;;) + { + if (c < *cp) { OK = FALSE; break; } + if (c == *cp++) { OK = TRUE; break; } + } + break; + + case PT_UCNC: + OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000; + break; + /* Should never occur, but keep compilers from grumbling. */ default: @@ -1368,8 +1424,9 @@ for (;;) case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS: count = current_state->count; /* Already matched */ if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } - if (clen > 0 && UCD_CATEGORY(c) != ucp_M) + if (clen > 0) { + int lgb, rgb; const pcre_uchar *nptr = ptr + clen; int ncount = 0; if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS) @@ -1377,14 +1434,16 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } + lgb = UCD_GRAPHBREAK(c); while (nptr < end_subject) { - int nd; - int ndlen = 1; - GETCHARLEN(nd, nptr, ndlen); - if (UCD_CATEGORY(nd) != ucp_M) break; + dlen = 1; + if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } + rgb = UCD_GRAPHBREAK(d); + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; ncount++; - nptr += ndlen; + lgb = rgb; + nptr += dlen; } count++; ADD_NEW_DATA(-state_offset, count, ncount); @@ -1403,20 +1462,22 @@ for (;;) int ncount = 0; switch (c) { - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; goto ANYNL01; - case 0x000d: - if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; + case CHAR_CR: + if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1; /* Fall through */ ANYNL01: - case 0x000a: + case CHAR_LF: if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS) { active_count--; /* Remove non-match possibility */ @@ -1443,13 +1504,7 @@ for (;;) BOOL OK; switch (c) { - case 0x000a: - case 0x000b: - case 0x000c: - case 0x000d: - case 0x0085: - case 0x2028: - case 0x2029: + VSPACE_CASES: OK = TRUE; break; @@ -1482,25 +1537,7 @@ for (;;) BOOL OK; switch (c) { - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ + HSPACE_CASES: OK = TRUE; break; @@ -1541,6 +1578,7 @@ for (;;) if (clen > 0) { BOOL OK; + const pcre_uint32 *cp; const ucd_record * prop = GET_UCD(c); switch(code[2]) { @@ -1572,15 +1610,23 @@ for (;;) PRIV(ucp_gentype)[prop->chartype] == ucp_N; break; - case PT_SPACE: /* Perl space */ - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || - c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; - break; + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. */ + case PT_SPACE: /* Perl space */ case PT_PXSPACE: /* POSIX space */ - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || - c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || - c == CHAR_FF || c == CHAR_CR; + switch(c) + { + HSPACE_CASES: + VSPACE_CASES: + OK = TRUE; + break; + + default: + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; + break; + } break; case PT_WORD: @@ -1589,6 +1635,21 @@ for (;;) c == CHAR_UNDERSCORE; break; + case PT_CLIST: + cp = PRIV(ucd_caseless_sets) + code[3]; + for (;;) + { + if (c < *cp) { OK = FALSE; break; } + if (c == *cp++) { OK = TRUE; break; } + } + break; + + case PT_UCNC: + OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000; + break; + /* Should never occur, but keep compilers from grumbling. */ default: @@ -1624,8 +1685,9 @@ for (;;) QS2: ADD_ACTIVE(state_offset + 2, 0); - if (clen > 0 && UCD_CATEGORY(c) != ucp_M) + if (clen > 0) { + int lgb, rgb; const pcre_uchar *nptr = ptr + clen; int ncount = 0; if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR || @@ -1634,14 +1696,16 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } + lgb = UCD_GRAPHBREAK(c); while (nptr < end_subject) { - int nd; - int ndlen = 1; - GETCHARLEN(nd, nptr, ndlen); - if (UCD_CATEGORY(nd) != ucp_M) break; + dlen = 1; + if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } + rgb = UCD_GRAPHBREAK(d); + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; ncount++; - nptr += ndlen; + lgb = rgb; + nptr += dlen; } ADD_NEW_DATA(-(state_offset + count), 0, ncount); } @@ -1667,27 +1731,29 @@ for (;;) int ncount = 0; switch (c) { - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; goto ANYNL02; - case 0x000d: - if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; + case CHAR_CR: + if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1; /* Fall through */ ANYNL02: - case 0x000a: + case CHAR_LF: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR || codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY) { active_count--; /* Remove non-match possibility */ next_active_state--; } - ADD_NEW_DATA(-(state_offset + count), 0, ncount); + ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount); break; default: @@ -1715,13 +1781,7 @@ for (;;) BOOL OK; switch (c) { - case 0x000a: - case 0x000b: - case 0x000c: - case 0x000d: - case 0x0085: - case 0x2028: - case 0x2029: + VSPACE_CASES: OK = TRUE; break; @@ -1737,7 +1797,7 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - ADD_NEW_DATA(-(state_offset + count), 0, 0); + ADD_NEW_DATA(-(state_offset + (int)count), 0, 0); } } break; @@ -1761,25 +1821,7 @@ for (;;) BOOL OK; switch (c) { - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ + HSPACE_CASES: OK = TRUE; break; @@ -1796,7 +1838,7 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - ADD_NEW_DATA(-(state_offset + count), 0, 0); + ADD_NEW_DATA(-(state_offset + (int)count), 0, 0); } } break; @@ -1813,6 +1855,7 @@ for (;;) if (clen > 0) { BOOL OK; + const pcre_uint32 *cp; const ucd_record * prop = GET_UCD(c); switch(code[1 + IMM2_SIZE + 1]) { @@ -1844,15 +1887,23 @@ for (;;) PRIV(ucp_gentype)[prop->chartype] == ucp_N; break; - case PT_SPACE: /* Perl space */ - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || - c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; - break; + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. */ + case PT_SPACE: /* Perl space */ case PT_PXSPACE: /* POSIX space */ - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || - c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || - c == CHAR_FF || c == CHAR_CR; + switch(c) + { + HSPACE_CASES: + VSPACE_CASES: + OK = TRUE; + break; + + default: + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; + break; + } break; case PT_WORD: @@ -1861,6 +1912,21 @@ for (;;) c == CHAR_UNDERSCORE; break; + case PT_CLIST: + cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2]; + for (;;) + { + if (c < *cp) { OK = FALSE; break; } + if (c == *cp++) { OK = TRUE; break; } + } + break; + + case PT_UCNC: + OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000; + break; + /* Should never occur, but keep compilers from grumbling. */ default: @@ -1875,7 +1941,7 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - if (++count >= GET2(code, 1)) + if (++count >= (int)GET2(code, 1)) { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); } else { ADD_NEW(state_offset, count); } @@ -1891,8 +1957,9 @@ for (;;) if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT) { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } count = current_state->count; /* Number already matched */ - if (clen > 0 && UCD_CATEGORY(c) != ucp_M) + if (clen > 0) { + int lgb, rgb; const pcre_uchar *nptr = ptr + clen; int ncount = 0; if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO) @@ -1900,18 +1967,20 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } + lgb = UCD_GRAPHBREAK(c); while (nptr < end_subject) { - int nd; - int ndlen = 1; - GETCHARLEN(nd, nptr, ndlen); - if (UCD_CATEGORY(nd) != ucp_M) break; + dlen = 1; + if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } + rgb = UCD_GRAPHBREAK(d); + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; ncount++; - nptr += ndlen; + lgb = rgb; + nptr += dlen; } if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0) reset_could_continue = TRUE; - if (++count >= GET2(code, 1)) + if (++count >= (int)GET2(code, 1)) { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); } else { ADD_NEW_DATA(-state_offset, count, ncount); } @@ -1932,26 +2001,28 @@ for (;;) int ncount = 0; switch (c) { - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; goto ANYNL03; - case 0x000d: - if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; + case CHAR_CR: + if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1; /* Fall through */ ANYNL03: - case 0x000a: + case CHAR_LF: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO) { active_count--; /* Remove non-match possibility */ next_active_state--; } - if (++count >= GET2(code, 1)) + if (++count >= (int)GET2(code, 1)) { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); } else { ADD_NEW_DATA(-state_offset, count, ncount); } @@ -1976,13 +2047,7 @@ for (;;) BOOL OK; switch (c) { - case 0x000a: - case 0x000b: - case 0x000c: - case 0x000d: - case 0x0085: - case 0x2028: - case 0x2029: + VSPACE_CASES: OK = TRUE; break; @@ -1997,7 +2062,7 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - if (++count >= GET2(code, 1)) + if (++count >= (int)GET2(code, 1)) { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); } else { ADD_NEW_DATA(-state_offset, count, 0); } @@ -2018,25 +2083,7 @@ for (;;) BOOL OK; switch (c) { - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ + HSPACE_CASES: OK = TRUE; break; @@ -2052,7 +2099,7 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - if (++count >= GET2(code, 1)) + if (++count >= (int)GET2(code, 1)) { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); } else { ADD_NEW_DATA(-state_offset, count, 0); } @@ -2112,17 +2159,21 @@ for (;;) to wait for them to pass before continuing. */ case OP_EXTUNI: - if (clen > 0 && UCD_CATEGORY(c) != ucp_M) + if (clen > 0) { + int lgb, rgb; const pcre_uchar *nptr = ptr + clen; int ncount = 0; + lgb = UCD_GRAPHBREAK(c); while (nptr < end_subject) { - int nclen = 1; - GETCHARLEN(c, nptr, nclen); - if (UCD_CATEGORY(c) != ucp_M) break; + dlen = 1; + if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } + rgb = UCD_GRAPHBREAK(d); + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; ncount++; - nptr += nclen; + lgb = rgb; + nptr += dlen; } if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0) reset_could_continue = TRUE; @@ -2139,25 +2190,27 @@ for (;;) case OP_ANYNL: if (clen > 0) switch(c) { - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; - case 0x000a: + case CHAR_LF: ADD_NEW(state_offset + 1, 0); break; - case 0x000d: + case CHAR_CR: if (ptr + 1 >= end_subject) { ADD_NEW(state_offset + 1, 0); if ((md->moptions & PCRE_PARTIAL_HARD) != 0) reset_could_continue = TRUE; } - else if (ptr[1] == 0x0a) + else if (RAWUCHARTEST(ptr + 1) == CHAR_LF) { ADD_NEW_DATA(-(state_offset + 1), 0, 1); } @@ -2173,13 +2226,7 @@ for (;;) case OP_NOT_VSPACE: if (clen > 0) switch(c) { - case 0x000a: - case 0x000b: - case 0x000c: - case 0x000d: - case 0x0085: - case 0x2028: - case 0x2029: + VSPACE_CASES: break; default: @@ -2192,17 +2239,12 @@ for (;;) case OP_VSPACE: if (clen > 0) switch(c) { - case 0x000a: - case 0x000b: - case 0x000c: - case 0x000d: - case 0x0085: - case 0x2028: - case 0x2029: + VSPACE_CASES: ADD_NEW(state_offset + 1, 0); break; - default: break; + default: + break; } break; @@ -2210,25 +2252,7 @@ for (;;) case OP_NOT_HSPACE: if (clen > 0) switch(c) { - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ + HSPACE_CASES: break; default: @@ -2241,27 +2265,12 @@ for (;;) case OP_HSPACE: if (clen > 0) switch(c) { - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ + HSPACE_CASES: ADD_NEW(state_offset + 1, 0); break; + + default: + break; } break; @@ -2315,7 +2324,7 @@ for (;;) if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); } if (clen > 0) { - unsigned int otherd = NOTACHAR; + pcre_uint32 otherd = NOTACHAR; if (caseless) { #ifdef SUPPORT_UTF @@ -2362,7 +2371,7 @@ for (;;) ADD_ACTIVE(state_offset + dlen + 1, 0); if (clen > 0) { - unsigned int otherd = NOTACHAR; + pcre_uint32 otherd = NOTACHAR; if (caseless) { #ifdef SUPPORT_UTF @@ -2407,7 +2416,7 @@ for (;;) ADD_ACTIVE(state_offset + dlen + 1, 0); if (clen > 0) { - unsigned int otherd = NOTACHAR; + pcre_uint32 otherd = NOTACHAR; if (caseless) { #ifdef SUPPORT_UTF @@ -2444,7 +2453,7 @@ for (;;) count = current_state->count; /* Number already matched */ if (clen > 0) { - unsigned int otherd = NOTACHAR; + pcre_uint32 otherd = NOTACHAR; if (caseless) { #ifdef SUPPORT_UTF @@ -2460,7 +2469,7 @@ for (;;) } if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) { - if (++count >= GET2(code, 1)) + if (++count >= (int)GET2(code, 1)) { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); } else { ADD_NEW(state_offset, count); } @@ -2488,7 +2497,7 @@ for (;;) count = current_state->count; /* Number already matched */ if (clen > 0) { - unsigned int otherd = NOTACHAR; + pcre_uint32 otherd = NOTACHAR; if (caseless) { #ifdef SUPPORT_UTF @@ -2509,7 +2518,7 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - if (++count >= GET2(code, 1)) + if (++count >= (int)GET2(code, 1)) { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); } else { ADD_NEW(state_offset, count); } @@ -2562,31 +2571,65 @@ for (;;) { case OP_CRSTAR: case OP_CRMINSTAR: + case OP_CRPOSSTAR: ADD_ACTIVE(next_state_offset + 1, 0); - if (isinclass) { ADD_NEW(state_offset, 0); } + if (isinclass) + { + if (*ecode == OP_CRPOSSTAR) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + ADD_NEW(state_offset, 0); + } break; case OP_CRPLUS: case OP_CRMINPLUS: + case OP_CRPOSPLUS: count = current_state->count; /* Already matched */ if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); } - if (isinclass) { count++; ADD_NEW(state_offset, count); } + if (isinclass) + { + if (count > 0 && *ecode == OP_CRPOSPLUS) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + count++; + ADD_NEW(state_offset, count); + } break; case OP_CRQUERY: case OP_CRMINQUERY: + case OP_CRPOSQUERY: ADD_ACTIVE(next_state_offset + 1, 0); - if (isinclass) { ADD_NEW(next_state_offset + 1, 0); } + if (isinclass) + { + if (*ecode == OP_CRPOSQUERY) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + ADD_NEW(next_state_offset + 1, 0); + } break; case OP_CRRANGE: case OP_CRMINRANGE: + case OP_CRPOSRANGE: count = current_state->count; /* Already matched */ - if (count >= GET2(ecode, 1)) + if (count >= (int)GET2(ecode, 1)) { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); } if (isinclass) { - int max = GET2(ecode, 1 + IMM2_SIZE); + int max = (int)GET2(ecode, 1 + IMM2_SIZE); + if (*ecode == OP_CRPOSRANGE) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } if (++count >= max && max != 0) /* Max 0 => no limit */ { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); } else @@ -2662,10 +2705,12 @@ for (;;) cb.version = 1; /* Version 1 of the callout block */ cb.callout_number = code[LINK_SIZE+2]; cb.offset_vector = offsets; -#ifdef COMPILE_PCRE8 +#if defined COMPILE_PCRE8 cb.subject = (PCRE_SPTR)start_subject; -#else +#elif defined COMPILE_PCRE16 cb.subject = (PCRE_SPTR16)start_subject; +#elif defined COMPILE_PCRE32 + cb.subject = (PCRE_SPTR32)start_subject; #endif cb.subject_length = (int)(end_subject - start_subject); cb.start_match = (int)(current_subject - start_subject); @@ -2684,9 +2729,11 @@ for (;;) condcode = code[LINK_SIZE+1]; - /* Back reference conditions are not supported */ + /* Back reference conditions and duplicate named recursion conditions + are not supported */ - if (condcode == OP_CREF || condcode == OP_NCREF) + if (condcode == OP_CREF || condcode == OP_DNCREF || + condcode == OP_DNRREF) return PCRE_ERROR_DFA_UCOND; /* The DEFINE condition is always false */ @@ -2698,7 +2745,7 @@ for (;;) which means "test if in any recursion". We can't test for specifically recursed groups. */ - else if (condcode == OP_RREF || condcode == OP_NRREF) + else if (condcode == OP_RREF) { int value = GET2(code, LINK_SIZE + 2); if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND; @@ -2796,7 +2843,7 @@ for (;;) for (rc = rc*2 - 2; rc >= 0; rc -= 2) { int charcount = local_offsets[rc+1] - local_offsets[rc]; -#ifdef SUPPORT_UTF +#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 if (utf) { const pcre_uchar *p = start_subject + local_offsets[rc]; @@ -2900,7 +2947,7 @@ for (;;) const pcre_uchar *p = ptr; const pcre_uchar *pp = local_ptr; charcount = (int)(pp - p); -#ifdef SUPPORT_UTF +#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; #endif ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1)); @@ -2982,7 +3029,7 @@ for (;;) } else { -#ifdef SUPPORT_UTF +#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 if (utf) { const pcre_uchar *p = start_subject + local_offsets[0]; @@ -3011,10 +3058,12 @@ for (;;) cb.version = 1; /* Version 1 of the callout block */ cb.callout_number = code[1]; cb.offset_vector = offsets; -#ifdef COMPILE_PCRE8 +#if defined COMPILE_PCRE8 cb.subject = (PCRE_SPTR)start_subject; -#else +#elif defined COMPILE_PCRE16 cb.subject = (PCRE_SPTR16)start_subject; +#elif defined COMPILE_PCRE32 + cb.subject = (PCRE_SPTR32)start_subject; #endif cb.subject_length = (int)(end_subject - start_subject); cb.start_match = (int)(current_subject - start_subject); @@ -3072,15 +3121,7 @@ for (;;) ptr > md->start_used_ptr) /* Inspected non-empty string */ ) ) - { - if (offsetcount >= 2) - { - offsets[0] = (int)(md->start_used_ptr - start_subject); - offsets[1] = (int)(end_subject - start_subject); - } match_count = PCRE_ERROR_PARTIAL; - } - DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count, rlevel*2-2, SP)); @@ -3130,16 +3171,21 @@ Returns: > 0 => number of match offset pairs < -1 => some kind of unexpected problem */ -#ifdef COMPILE_PCRE8 +#if defined COMPILE_PCRE8 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data, const char *subject, int length, int start_offset, int options, int *offsets, int offsetcount, int *workspace, int wscount) -#else +#elif defined COMPILE_PCRE16 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data, PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets, int offsetcount, int *workspace, int wscount) +#elif defined COMPILE_PCRE32 +PCRE_EXP_DEFN int PCRE_CALL_CONVENTION +pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data, + PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets, + int offsetcount, int *workspace, int wscount) #endif { REAL_PCRE *re = (REAL_PCRE *)argument_re; @@ -3166,6 +3212,7 @@ if (re == NULL || subject == NULL || workspace == NULL (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE; +if (length < 0) return PCRE_ERROR_BADLENGTH; if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET; /* Check that the first field in the block is the magic number. If it is not, @@ -3214,7 +3261,7 @@ end_subject = (const pcre_uchar *)subject + length; req_char_ptr = current_subject - 1; #ifdef SUPPORT_UTF -/* PCRE_UTF16 has the same value as PCRE_UTF8. */ +/* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */ utf = (re->options & PCRE_UTF8) != 0; #else utf = FALSE; @@ -3300,12 +3347,21 @@ if (utf && (options & PCRE_NO_UTF8_CHECK) == 0) offsets[0] = erroroffset; offsets[1] = errorcode; } - return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)? +#if defined COMPILE_PCRE8 + return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ? PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8; +#elif defined COMPILE_PCRE16 + return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ? + PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16; +#elif defined COMPILE_PCRE32 + return PCRE_ERROR_BADUTF32; +#endif } +#if defined COMPILE_PCRE8 || defined COMPILE_PCRE16 if (start_offset > 0 && start_offset < length && NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset])) return PCRE_ERROR_BADUTF8_OFFSET; +#endif } #endif @@ -3415,12 +3471,15 @@ for (;;) if (has_first_char) { if (first_char != first_char2) + { + pcre_uchar csc; while (current_subject < end_subject && - *current_subject != first_char && *current_subject != first_char2) + (csc = RAWUCHARTEST(current_subject)) != first_char && csc != first_char2) current_subject++; + } else while (current_subject < end_subject && - *current_subject != first_char) + RAWUCHARTEST(current_subject) != first_char) current_subject++; } @@ -3450,10 +3509,10 @@ for (;;) ANYCRLF, and we are now at a LF, advance the match position by one more character. */ - if (current_subject[-1] == CHAR_CR && + if (RAWUCHARTEST(current_subject - 1) == CHAR_CR && (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && current_subject < end_subject && - *current_subject == CHAR_NL) + RAWUCHARTEST(current_subject) == CHAR_NL) current_subject++; } } @@ -3464,7 +3523,7 @@ for (;;) { while (current_subject < end_subject) { - register unsigned int c = *current_subject; + register pcre_uint32 c = RAWUCHARTEST(current_subject); #ifndef COMPILE_PCRE8 if (c > 255) c = 255; #endif @@ -3530,7 +3589,7 @@ for (;;) { while (p < end_subject) { - register int pp = *p++; + register pcre_uint32 pp = RAWUCHARINCTEST(p); if (pp == req_char || pp == req_char2) { p--; break; } } } @@ -3538,7 +3597,7 @@ for (;;) { while (p < end_subject) { - if (*p++ == req_char) { p--; break; } + if (RAWUCHARINCTEST(p) == req_char) { p--; break; } } } @@ -3576,7 +3635,17 @@ for (;;) /* Anything other than "no match" means we are done, always; otherwise, carry on only if not anchored. */ - if (rc != PCRE_ERROR_NOMATCH || anchored) return rc; + if (rc != PCRE_ERROR_NOMATCH || anchored) + { + if (rc == PCRE_ERROR_PARTIAL && offsetcount >= 2) + { + offsets[0] = (int)(md->start_used_ptr - (PCRE_PUCHAR)subject); + offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject); + if (offsetcount > 2) + offsets[2] = (int)(current_subject - (PCRE_PUCHAR)subject); + } + return rc; + } /* Advance to the next subject character unless we are at the end of a line and firstline is set. */ @@ -3596,9 +3665,9 @@ for (;;) not contain any explicit matches for \r or \n, and the newline option is CRLF or ANY or ANYCRLF, advance the match position by one more character. */ - if (current_subject[-1] == CHAR_CR && + if (RAWUCHARTEST(current_subject - 1) == CHAR_CR && current_subject < end_subject && - *current_subject == CHAR_NL && + RAWUCHARTEST(current_subject) == CHAR_NL && (re->flags & PCRE_HASCRORLF) == 0 && (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF ||