--- embedaddon/pcre/pcre_dfa_exec.c 2012/10/09 09:19:17 1.1.1.3 +++ embedaddon/pcre/pcre_dfa_exec.c 2013/07/22 08:25:55 1.1.1.4 @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the below for why this module is different). Written by Philip Hazel - Copyright (c) 1997-2012 University of Cambridge + Copyright (c) 1997-2013 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -302,13 +302,13 @@ Returns: nothing static void pchars(const pcre_uchar *p, int length, FILE *f) { -int c; +pcre_uint32 c; while (length-- > 0) { if (isprint(c = *(p++))) fprintf(f, "%c", c); else - fprintf(f, "\\x%02x", c); + fprintf(f, "\\x{%02x}", c); } } #endif @@ -571,7 +571,7 @@ for (;;) { int i, j; int clen, dlen; - unsigned int c, d; + pcre_uint32 c, d; int forced_fail = 0; BOOL partial_newline = FALSE; BOOL could_continue = reset_could_continue; @@ -613,9 +613,10 @@ for (;;) { clen = 1; /* Number of data items in the character */ #ifdef SUPPORT_UTF - if (utf) { GETCHARLEN(c, ptr, clen); } else -#endif /* SUPPORT_UTF */ + GETCHARLENTEST(c, ptr, clen); +#else c = *ptr; +#endif /* SUPPORT_UTF */ } else { @@ -634,7 +635,8 @@ for (;;) BOOL caseless = FALSE; const pcre_uchar *code; int state_offset = current_state->offset; - int count, codevalue, rrc; + int codevalue, rrc; + int count; #ifdef PCRE_DEBUG printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset); @@ -1007,7 +1009,7 @@ for (;;) { const pcre_uchar *temp = ptr - 1; if (temp < md->start_used_ptr) md->start_used_ptr = temp; -#ifdef SUPPORT_UTF +#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 if (utf) { BACKCHAR(temp); } #endif GETCHARTEST(d, temp); @@ -1060,6 +1062,7 @@ for (;;) if (clen > 0) { BOOL OK; + const pcre_uint32 *cp; const ucd_record * prop = GET_UCD(c); switch(code[1]) { @@ -1108,6 +1111,21 @@ for (;;) c == CHAR_UNDERSCORE; break; + case PT_CLIST: + cp = PRIV(ucd_caseless_sets) + code[2]; + for (;;) + { + if (c < *cp) { OK = FALSE; break; } + if (c == *cp++) { OK = TRUE; break; } + } + break; + + case PT_UCNC: + OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000; + break; + /* Should never occur, but keep compilers from grumbling. */ default: @@ -1237,7 +1255,7 @@ for (;;) (d != OP_ANY || !IS_NEWLINE(ptr)) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) { - if (++count >= GET2(code, 1)) + if (++count >= (int)GET2(code, 1)) { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); } else { ADD_NEW(state_offset, count); } @@ -1271,7 +1289,7 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - if (++count >= GET2(code, 1)) + if (++count >= (int)GET2(code, 1)) { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); } else { ADD_NEW(state_offset, count); } @@ -1294,6 +1312,7 @@ for (;;) if (clen > 0) { BOOL OK; + const pcre_uint32 *cp; const ucd_record * prop = GET_UCD(c); switch(code[2]) { @@ -1342,6 +1361,21 @@ for (;;) c == CHAR_UNDERSCORE; break; + case PT_CLIST: + cp = PRIV(ucd_caseless_sets) + code[3]; + for (;;) + { + if (c < *cp) { OK = FALSE; break; } + if (c == *cp++) { OK = TRUE; break; } + } + break; + + case PT_UCNC: + OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000; + break; + /* Should never occur, but keep compilers from grumbling. */ default: @@ -1368,8 +1402,9 @@ for (;;) case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS: count = current_state->count; /* Already matched */ if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } - if (clen > 0 && UCD_CATEGORY(c) != ucp_M) + if (clen > 0) { + int lgb, rgb; const pcre_uchar *nptr = ptr + clen; int ncount = 0; if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS) @@ -1377,14 +1412,16 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } + lgb = UCD_GRAPHBREAK(c); while (nptr < end_subject) { - int nd; - int ndlen = 1; - GETCHARLEN(nd, nptr, ndlen); - if (UCD_CATEGORY(nd) != ucp_M) break; + dlen = 1; + if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } + rgb = UCD_GRAPHBREAK(d); + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; ncount++; - nptr += ndlen; + lgb = rgb; + nptr += dlen; } count++; ADD_NEW_DATA(-state_offset, count, ncount); @@ -1403,20 +1440,22 @@ for (;;) int ncount = 0; switch (c) { - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; goto ANYNL01; - case 0x000d: - if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; + case CHAR_CR: + if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1; /* Fall through */ ANYNL01: - case 0x000a: + case CHAR_LF: if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS) { active_count--; /* Remove non-match possibility */ @@ -1443,13 +1482,7 @@ for (;;) BOOL OK; switch (c) { - case 0x000a: - case 0x000b: - case 0x000c: - case 0x000d: - case 0x0085: - case 0x2028: - case 0x2029: + VSPACE_CASES: OK = TRUE; break; @@ -1482,25 +1515,7 @@ for (;;) BOOL OK; switch (c) { - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ + HSPACE_CASES: OK = TRUE; break; @@ -1541,6 +1556,7 @@ for (;;) if (clen > 0) { BOOL OK; + const pcre_uint32 *cp; const ucd_record * prop = GET_UCD(c); switch(code[2]) { @@ -1589,6 +1605,21 @@ for (;;) c == CHAR_UNDERSCORE; break; + case PT_CLIST: + cp = PRIV(ucd_caseless_sets) + code[3]; + for (;;) + { + if (c < *cp) { OK = FALSE; break; } + if (c == *cp++) { OK = TRUE; break; } + } + break; + + case PT_UCNC: + OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000; + break; + /* Should never occur, but keep compilers from grumbling. */ default: @@ -1624,8 +1655,9 @@ for (;;) QS2: ADD_ACTIVE(state_offset + 2, 0); - if (clen > 0 && UCD_CATEGORY(c) != ucp_M) + if (clen > 0) { + int lgb, rgb; const pcre_uchar *nptr = ptr + clen; int ncount = 0; if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR || @@ -1634,14 +1666,16 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } + lgb = UCD_GRAPHBREAK(c); while (nptr < end_subject) { - int nd; - int ndlen = 1; - GETCHARLEN(nd, nptr, ndlen); - if (UCD_CATEGORY(nd) != ucp_M) break; + dlen = 1; + if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } + rgb = UCD_GRAPHBREAK(d); + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; ncount++; - nptr += ndlen; + lgb = rgb; + nptr += dlen; } ADD_NEW_DATA(-(state_offset + count), 0, ncount); } @@ -1667,27 +1701,29 @@ for (;;) int ncount = 0; switch (c) { - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; goto ANYNL02; - case 0x000d: - if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; + case CHAR_CR: + if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1; /* Fall through */ ANYNL02: - case 0x000a: + case CHAR_LF: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR || codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY) { active_count--; /* Remove non-match possibility */ next_active_state--; } - ADD_NEW_DATA(-(state_offset + count), 0, ncount); + ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount); break; default: @@ -1715,13 +1751,7 @@ for (;;) BOOL OK; switch (c) { - case 0x000a: - case 0x000b: - case 0x000c: - case 0x000d: - case 0x0085: - case 0x2028: - case 0x2029: + VSPACE_CASES: OK = TRUE; break; @@ -1737,7 +1767,7 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - ADD_NEW_DATA(-(state_offset + count), 0, 0); + ADD_NEW_DATA(-(state_offset + (int)count), 0, 0); } } break; @@ -1761,25 +1791,7 @@ for (;;) BOOL OK; switch (c) { - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ + HSPACE_CASES: OK = TRUE; break; @@ -1796,7 +1808,7 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - ADD_NEW_DATA(-(state_offset + count), 0, 0); + ADD_NEW_DATA(-(state_offset + (int)count), 0, 0); } } break; @@ -1813,6 +1825,7 @@ for (;;) if (clen > 0) { BOOL OK; + const pcre_uint32 *cp; const ucd_record * prop = GET_UCD(c); switch(code[1 + IMM2_SIZE + 1]) { @@ -1861,6 +1874,21 @@ for (;;) c == CHAR_UNDERSCORE; break; + case PT_CLIST: + cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2]; + for (;;) + { + if (c < *cp) { OK = FALSE; break; } + if (c == *cp++) { OK = TRUE; break; } + } + break; + + case PT_UCNC: + OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000; + break; + /* Should never occur, but keep compilers from grumbling. */ default: @@ -1875,7 +1903,7 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - if (++count >= GET2(code, 1)) + if (++count >= (int)GET2(code, 1)) { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); } else { ADD_NEW(state_offset, count); } @@ -1891,8 +1919,9 @@ for (;;) if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT) { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } count = current_state->count; /* Number already matched */ - if (clen > 0 && UCD_CATEGORY(c) != ucp_M) + if (clen > 0) { + int lgb, rgb; const pcre_uchar *nptr = ptr + clen; int ncount = 0; if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO) @@ -1900,18 +1929,20 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } + lgb = UCD_GRAPHBREAK(c); while (nptr < end_subject) { - int nd; - int ndlen = 1; - GETCHARLEN(nd, nptr, ndlen); - if (UCD_CATEGORY(nd) != ucp_M) break; + dlen = 1; + if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } + rgb = UCD_GRAPHBREAK(d); + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; ncount++; - nptr += ndlen; + lgb = rgb; + nptr += dlen; } if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0) reset_could_continue = TRUE; - if (++count >= GET2(code, 1)) + if (++count >= (int)GET2(code, 1)) { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); } else { ADD_NEW_DATA(-state_offset, count, ncount); } @@ -1932,26 +1963,28 @@ for (;;) int ncount = 0; switch (c) { - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; goto ANYNL03; - case 0x000d: - if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; + case CHAR_CR: + if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1; /* Fall through */ ANYNL03: - case 0x000a: + case CHAR_LF: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO) { active_count--; /* Remove non-match possibility */ next_active_state--; } - if (++count >= GET2(code, 1)) + if (++count >= (int)GET2(code, 1)) { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); } else { ADD_NEW_DATA(-state_offset, count, ncount); } @@ -1976,13 +2009,7 @@ for (;;) BOOL OK; switch (c) { - case 0x000a: - case 0x000b: - case 0x000c: - case 0x000d: - case 0x0085: - case 0x2028: - case 0x2029: + VSPACE_CASES: OK = TRUE; break; @@ -1997,7 +2024,7 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - if (++count >= GET2(code, 1)) + if (++count >= (int)GET2(code, 1)) { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); } else { ADD_NEW_DATA(-state_offset, count, 0); } @@ -2018,25 +2045,7 @@ for (;;) BOOL OK; switch (c) { - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ + HSPACE_CASES: OK = TRUE; break; @@ -2052,7 +2061,7 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - if (++count >= GET2(code, 1)) + if (++count >= (int)GET2(code, 1)) { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); } else { ADD_NEW_DATA(-state_offset, count, 0); } @@ -2112,17 +2121,21 @@ for (;;) to wait for them to pass before continuing. */ case OP_EXTUNI: - if (clen > 0 && UCD_CATEGORY(c) != ucp_M) + if (clen > 0) { + int lgb, rgb; const pcre_uchar *nptr = ptr + clen; int ncount = 0; + lgb = UCD_GRAPHBREAK(c); while (nptr < end_subject) { - int nclen = 1; - GETCHARLEN(c, nptr, nclen); - if (UCD_CATEGORY(c) != ucp_M) break; + dlen = 1; + if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } + rgb = UCD_GRAPHBREAK(d); + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; ncount++; - nptr += nclen; + lgb = rgb; + nptr += dlen; } if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0) reset_could_continue = TRUE; @@ -2139,25 +2152,27 @@ for (;;) case OP_ANYNL: if (clen > 0) switch(c) { - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; - case 0x000a: + case CHAR_LF: ADD_NEW(state_offset + 1, 0); break; - case 0x000d: + case CHAR_CR: if (ptr + 1 >= end_subject) { ADD_NEW(state_offset + 1, 0); if ((md->moptions & PCRE_PARTIAL_HARD) != 0) reset_could_continue = TRUE; } - else if (ptr[1] == 0x0a) + else if (RAWUCHARTEST(ptr + 1) == CHAR_LF) { ADD_NEW_DATA(-(state_offset + 1), 0, 1); } @@ -2173,13 +2188,7 @@ for (;;) case OP_NOT_VSPACE: if (clen > 0) switch(c) { - case 0x000a: - case 0x000b: - case 0x000c: - case 0x000d: - case 0x0085: - case 0x2028: - case 0x2029: + VSPACE_CASES: break; default: @@ -2192,17 +2201,12 @@ for (;;) case OP_VSPACE: if (clen > 0) switch(c) { - case 0x000a: - case 0x000b: - case 0x000c: - case 0x000d: - case 0x0085: - case 0x2028: - case 0x2029: + VSPACE_CASES: ADD_NEW(state_offset + 1, 0); break; - default: break; + default: + break; } break; @@ -2210,25 +2214,7 @@ for (;;) case OP_NOT_HSPACE: if (clen > 0) switch(c) { - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ + HSPACE_CASES: break; default: @@ -2241,27 +2227,12 @@ for (;;) case OP_HSPACE: if (clen > 0) switch(c) { - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ + HSPACE_CASES: ADD_NEW(state_offset + 1, 0); break; + + default: + break; } break; @@ -2315,7 +2286,7 @@ for (;;) if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); } if (clen > 0) { - unsigned int otherd = NOTACHAR; + pcre_uint32 otherd = NOTACHAR; if (caseless) { #ifdef SUPPORT_UTF @@ -2362,7 +2333,7 @@ for (;;) ADD_ACTIVE(state_offset + dlen + 1, 0); if (clen > 0) { - unsigned int otherd = NOTACHAR; + pcre_uint32 otherd = NOTACHAR; if (caseless) { #ifdef SUPPORT_UTF @@ -2407,7 +2378,7 @@ for (;;) ADD_ACTIVE(state_offset + dlen + 1, 0); if (clen > 0) { - unsigned int otherd = NOTACHAR; + pcre_uint32 otherd = NOTACHAR; if (caseless) { #ifdef SUPPORT_UTF @@ -2444,7 +2415,7 @@ for (;;) count = current_state->count; /* Number already matched */ if (clen > 0) { - unsigned int otherd = NOTACHAR; + pcre_uint32 otherd = NOTACHAR; if (caseless) { #ifdef SUPPORT_UTF @@ -2460,7 +2431,7 @@ for (;;) } if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) { - if (++count >= GET2(code, 1)) + if (++count >= (int)GET2(code, 1)) { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); } else { ADD_NEW(state_offset, count); } @@ -2488,7 +2459,7 @@ for (;;) count = current_state->count; /* Number already matched */ if (clen > 0) { - unsigned int otherd = NOTACHAR; + pcre_uint32 otherd = NOTACHAR; if (caseless) { #ifdef SUPPORT_UTF @@ -2509,7 +2480,7 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - if (++count >= GET2(code, 1)) + if (++count >= (int)GET2(code, 1)) { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); } else { ADD_NEW(state_offset, count); } @@ -2582,11 +2553,11 @@ for (;;) case OP_CRRANGE: case OP_CRMINRANGE: count = current_state->count; /* Already matched */ - if (count >= GET2(ecode, 1)) + if (count >= (int)GET2(ecode, 1)) { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); } if (isinclass) { - int max = GET2(ecode, 1 + IMM2_SIZE); + int max = (int)GET2(ecode, 1 + IMM2_SIZE); if (++count >= max && max != 0) /* Max 0 => no limit */ { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); } else @@ -2662,10 +2633,12 @@ for (;;) cb.version = 1; /* Version 1 of the callout block */ cb.callout_number = code[LINK_SIZE+2]; cb.offset_vector = offsets; -#ifdef COMPILE_PCRE8 +#if defined COMPILE_PCRE8 cb.subject = (PCRE_SPTR)start_subject; -#else +#elif defined COMPILE_PCRE16 cb.subject = (PCRE_SPTR16)start_subject; +#elif defined COMPILE_PCRE32 + cb.subject = (PCRE_SPTR32)start_subject; #endif cb.subject_length = (int)(end_subject - start_subject); cb.start_match = (int)(current_subject - start_subject); @@ -2796,7 +2769,7 @@ for (;;) for (rc = rc*2 - 2; rc >= 0; rc -= 2) { int charcount = local_offsets[rc+1] - local_offsets[rc]; -#ifdef SUPPORT_UTF +#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 if (utf) { const pcre_uchar *p = start_subject + local_offsets[rc]; @@ -2900,7 +2873,7 @@ for (;;) const pcre_uchar *p = ptr; const pcre_uchar *pp = local_ptr; charcount = (int)(pp - p); -#ifdef SUPPORT_UTF +#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; #endif ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1)); @@ -2982,7 +2955,7 @@ for (;;) } else { -#ifdef SUPPORT_UTF +#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 if (utf) { const pcre_uchar *p = start_subject + local_offsets[0]; @@ -3011,10 +2984,12 @@ for (;;) cb.version = 1; /* Version 1 of the callout block */ cb.callout_number = code[1]; cb.offset_vector = offsets; -#ifdef COMPILE_PCRE8 +#if defined COMPILE_PCRE8 cb.subject = (PCRE_SPTR)start_subject; -#else +#elif defined COMPILE_PCRE16 cb.subject = (PCRE_SPTR16)start_subject; +#elif defined COMPILE_PCRE32 + cb.subject = (PCRE_SPTR32)start_subject; #endif cb.subject_length = (int)(end_subject - start_subject); cb.start_match = (int)(current_subject - start_subject); @@ -3072,15 +3047,7 @@ for (;;) ptr > md->start_used_ptr) /* Inspected non-empty string */ ) ) - { - if (offsetcount >= 2) - { - offsets[0] = (int)(md->start_used_ptr - start_subject); - offsets[1] = (int)(end_subject - start_subject); - } match_count = PCRE_ERROR_PARTIAL; - } - DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count, rlevel*2-2, SP)); @@ -3130,16 +3097,21 @@ Returns: > 0 => number of match offset pairs < -1 => some kind of unexpected problem */ -#ifdef COMPILE_PCRE8 +#if defined COMPILE_PCRE8 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data, const char *subject, int length, int start_offset, int options, int *offsets, int offsetcount, int *workspace, int wscount) -#else +#elif defined COMPILE_PCRE16 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data, PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets, int offsetcount, int *workspace, int wscount) +#elif defined COMPILE_PCRE32 +PCRE_EXP_DEFN int PCRE_CALL_CONVENTION +pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data, + PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets, + int offsetcount, int *workspace, int wscount) #endif { REAL_PCRE *re = (REAL_PCRE *)argument_re; @@ -3166,6 +3138,7 @@ if (re == NULL || subject == NULL || workspace == NULL (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE; +if (length < 0) return PCRE_ERROR_BADLENGTH; if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET; /* Check that the first field in the block is the magic number. If it is not, @@ -3214,7 +3187,7 @@ end_subject = (const pcre_uchar *)subject + length; req_char_ptr = current_subject - 1; #ifdef SUPPORT_UTF -/* PCRE_UTF16 has the same value as PCRE_UTF8. */ +/* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */ utf = (re->options & PCRE_UTF8) != 0; #else utf = FALSE; @@ -3300,12 +3273,21 @@ if (utf && (options & PCRE_NO_UTF8_CHECK) == 0) offsets[0] = erroroffset; offsets[1] = errorcode; } - return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)? +#if defined COMPILE_PCRE8 + return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ? PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8; +#elif defined COMPILE_PCRE16 + return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ? + PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16; +#elif defined COMPILE_PCRE32 + return PCRE_ERROR_BADUTF32; +#endif } +#if defined COMPILE_PCRE8 || defined COMPILE_PCRE16 if (start_offset > 0 && start_offset < length && NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset])) return PCRE_ERROR_BADUTF8_OFFSET; +#endif } #endif @@ -3415,12 +3397,15 @@ for (;;) if (has_first_char) { if (first_char != first_char2) + { + pcre_uchar csc; while (current_subject < end_subject && - *current_subject != first_char && *current_subject != first_char2) + (csc = RAWUCHARTEST(current_subject)) != first_char && csc != first_char2) current_subject++; + } else while (current_subject < end_subject && - *current_subject != first_char) + RAWUCHARTEST(current_subject) != first_char) current_subject++; } @@ -3450,10 +3435,10 @@ for (;;) ANYCRLF, and we are now at a LF, advance the match position by one more character. */ - if (current_subject[-1] == CHAR_CR && + if (RAWUCHARTEST(current_subject - 1) == CHAR_CR && (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && current_subject < end_subject && - *current_subject == CHAR_NL) + RAWUCHARTEST(current_subject) == CHAR_NL) current_subject++; } } @@ -3464,7 +3449,7 @@ for (;;) { while (current_subject < end_subject) { - register unsigned int c = *current_subject; + register pcre_uint32 c = RAWUCHARTEST(current_subject); #ifndef COMPILE_PCRE8 if (c > 255) c = 255; #endif @@ -3530,7 +3515,7 @@ for (;;) { while (p < end_subject) { - register int pp = *p++; + register pcre_uint32 pp = RAWUCHARINCTEST(p); if (pp == req_char || pp == req_char2) { p--; break; } } } @@ -3538,7 +3523,7 @@ for (;;) { while (p < end_subject) { - if (*p++ == req_char) { p--; break; } + if (RAWUCHARINCTEST(p) == req_char) { p--; break; } } } @@ -3576,7 +3561,17 @@ for (;;) /* Anything other than "no match" means we are done, always; otherwise, carry on only if not anchored. */ - if (rc != PCRE_ERROR_NOMATCH || anchored) return rc; + if (rc != PCRE_ERROR_NOMATCH || anchored) + { + if (rc == PCRE_ERROR_PARTIAL && offsetcount >= 2) + { + offsets[0] = (int)(md->start_used_ptr - (PCRE_PUCHAR)subject); + offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject); + if (offsetcount > 2) + offsets[2] = (int)(current_subject - (PCRE_PUCHAR)subject); + } + return rc; + } /* Advance to the next subject character unless we are at the end of a line and firstline is set. */ @@ -3596,9 +3591,9 @@ for (;;) not contain any explicit matches for \r or \n, and the newline option is CRLF or ANY or ANYCRLF, advance the match position by one more character. */ - if (current_subject[-1] == CHAR_CR && + if (RAWUCHARTEST(current_subject - 1) == CHAR_CR && current_subject < end_subject && - *current_subject == CHAR_NL && + RAWUCHARTEST(current_subject) == CHAR_NL && (re->flags & PCRE_HASCRORLF) == 0 && (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF ||