version 1.1.1.3, 2012/10/09 09:19:17
|
version 1.1.1.5, 2014/06/15 19:46:03
|
Line 7
|
Line 7
|
and semantics are as close as possible to those of the Perl 5 language. |
and semantics are as close as possible to those of the Perl 5 language. |
|
|
Written by Philip Hazel |
Written by Philip Hazel |
Copyright (c) 1997-2012 University of Cambridge | Copyright (c) 1997-2013 University of Cambridge |
|
|
----------------------------------------------------------------------------- |
----------------------------------------------------------------------------- |
Redistribution and use in source and binary forms, with or without |
Redistribution and use in source and binary forms, with or without |
Line 40 POSSIBILITY OF SUCH DAMAGE.
|
Line 40 POSSIBILITY OF SUCH DAMAGE.
|
|
|
/* This header contains definitions that are shared between the different |
/* This header contains definitions that are shared between the different |
modules, but which are not relevant to the exported API. This includes some |
modules, but which are not relevant to the exported API. This includes some |
functions whose names all begin with "_pcre_" or "_pcre16_" depending on | functions whose names all begin with "_pcre_", "_pcre16_" or "_pcre32_" |
the PRIV macro. */ | depending on the PRIV macro. */ |
|
|
#ifndef PCRE_INTERNAL_H |
#ifndef PCRE_INTERNAL_H |
#define PCRE_INTERNAL_H |
#define PCRE_INTERNAL_H |
Line 53 the PRIV macro. */
|
Line 53 the PRIV macro. */
|
#endif |
#endif |
|
|
/* PCRE is compiled as an 8 bit library if it is not requested otherwise. */ |
/* PCRE is compiled as an 8 bit library if it is not requested otherwise. */ |
#ifndef COMPILE_PCRE16 | |
| #if !defined COMPILE_PCRE16 && !defined COMPILE_PCRE32 |
#define COMPILE_PCRE8 |
#define COMPILE_PCRE8 |
#endif |
#endif |
|
|
Line 78 Until then we define it if SUPPORT_UTF is defined. */
|
Line 79 Until then we define it if SUPPORT_UTF is defined. */
|
#define SUPPORT_UTF8 1 |
#define SUPPORT_UTF8 1 |
#endif |
#endif |
|
|
/* We do not support both EBCDIC and UTF-8/16 at the same time. The "configure" | /* We do not support both EBCDIC and UTF-8/16/32 at the same time. The "configure" |
script prevents both being selected, but not everybody uses "configure". */ |
script prevents both being selected, but not everybody uses "configure". */ |
|
|
#if defined EBCDIC && defined SUPPORT_UTF |
#if defined EBCDIC && defined SUPPORT_UTF |
#error The use of both EBCDIC and SUPPORT_UTF8/16 is not supported. | #error The use of both EBCDIC and SUPPORT_UTF is not supported. |
#endif |
#endif |
|
|
/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef |
/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef |
Line 111 setjmp and stdarg are used is when NO_RECURSE is set.
|
Line 112 setjmp and stdarg are used is when NO_RECURSE is set.
|
#include <stdlib.h> |
#include <stdlib.h> |
#include <string.h> |
#include <string.h> |
|
|
|
/* Valgrind (memcheck) support */ |
|
|
|
#ifdef SUPPORT_VALGRIND |
|
#include <valgrind/memcheck.h> |
|
#endif |
|
|
/* When compiling a DLL for Windows, the exported symbols have to be declared |
/* When compiling a DLL for Windows, the exported symbols have to be declared |
using some MS magic. I found some useful information on this web page: |
using some MS magic. I found some useful information on this web page: |
http://msdn2.microsoft.com/en-us/library/y4h7bcy6(VS.80).aspx. According to the |
http://msdn2.microsoft.com/en-us/library/y4h7bcy6(VS.80).aspx. According to the |
Line 187 preprocessor time in standard C environments. */
|
Line 194 preprocessor time in standard C environments. */
|
typedef unsigned char pcre_uint8; |
typedef unsigned char pcre_uint8; |
|
|
#if USHRT_MAX == 65535 |
#if USHRT_MAX == 65535 |
typedef unsigned short pcre_uint16; | typedef unsigned short pcre_uint16; |
typedef short pcre_int16; | typedef short pcre_int16; |
| #define PCRE_UINT16_MAX USHRT_MAX |
| #define PCRE_INT16_MAX SHRT_MAX |
#elif UINT_MAX == 65535 |
#elif UINT_MAX == 65535 |
typedef unsigned int pcre_uint16; | typedef unsigned int pcre_uint16; |
typedef int pcre_int16; | typedef int pcre_int16; |
| #define PCRE_UINT16_MAX UINT_MAX |
| #define PCRE_INT16_MAX INT_MAX |
#else |
#else |
#error Cannot determine a type for 16-bit unsigned integers | #error Cannot determine a type for 16-bit integers |
#endif |
#endif |
|
|
#if UINT_MAX == 4294967295 | #if UINT_MAX == 4294967295U |
typedef unsigned int pcre_uint32; | typedef unsigned int pcre_uint32; |
typedef int pcre_int32; | typedef int pcre_int32; |
#elif ULONG_MAX == 4294967295 | #define PCRE_UINT32_MAX UINT_MAX |
typedef unsigned long int pcre_uint32; | #define PCRE_INT32_MAX INT_MAX |
typedef long int pcre_int32; | #elif ULONG_MAX == 4294967295UL |
| typedef unsigned long int pcre_uint32; |
| typedef long int pcre_int32; |
| #define PCRE_UINT32_MAX ULONG_MAX |
| #define PCRE_INT32_MAX LONG_MAX |
#else |
#else |
#error Cannot determine a type for 32-bit unsigned integers | #error Cannot determine a type for 32-bit integers |
#endif |
#endif |
|
|
/* When checking for integer overflow in pcre_compile(), we need to handle |
/* When checking for integer overflow in pcre_compile(), we need to handle |
Line 214 stdint.h is available, include it; it may define INT64
|
Line 229 stdint.h is available, include it; it may define INT64
|
have stdint.h (e.g. Solaris) may have inttypes.h. The macro int64_t may be set |
have stdint.h (e.g. Solaris) may have inttypes.h. The macro int64_t may be set |
by "configure". */ |
by "configure". */ |
|
|
#if HAVE_STDINT_H | #if defined HAVE_STDINT_H |
#include <stdint.h> |
#include <stdint.h> |
#elif HAVE_INTTYPES_H | #elif defined HAVE_INTTYPES_H |
#include <inttypes.h> |
#include <inttypes.h> |
#endif |
#endif |
|
|
Line 243 exactly 256 items. When the character is able to conta
|
Line 258 exactly 256 items. When the character is able to conta
|
items, some check is needed before accessing these tables. |
items, some check is needed before accessing these tables. |
*/ |
*/ |
|
|
#ifdef COMPILE_PCRE8 | #if defined COMPILE_PCRE8 |
|
|
typedef unsigned char pcre_uchar; |
typedef unsigned char pcre_uchar; |
#define IN_UCHARS(x) (x) |
#define IN_UCHARS(x) (x) |
#define MAX_255(c) 1 |
#define MAX_255(c) 1 |
#define TABLE_GET(c, table, default) ((table)[c]) |
#define TABLE_GET(c, table, default) ((table)[c]) |
|
|
#else | #elif defined COMPILE_PCRE16 |
|
|
#ifdef COMPILE_PCRE16 |
|
#if USHRT_MAX != 65535 |
#if USHRT_MAX != 65535 |
/* This is a warning message. Change PCRE_UCHAR16 to a 16 bit data type in |
/* This is a warning message. Change PCRE_UCHAR16 to a 16 bit data type in |
pcre.h(.in) and disable (comment out) this message. */ |
pcre.h(.in) and disable (comment out) this message. */ |
Line 260 pcre.h(.in) and disable (comment out) this message. */
|
Line 274 pcre.h(.in) and disable (comment out) this message. */
|
#endif |
#endif |
|
|
typedef pcre_uint16 pcre_uchar; |
typedef pcre_uint16 pcre_uchar; |
#define IN_UCHARS(x) ((x) << 1) | #define UCHAR_SHIFT (1) |
| #define IN_UCHARS(x) ((x) << UCHAR_SHIFT) |
#define MAX_255(c) ((c) <= 255u) |
#define MAX_255(c) ((c) <= 255u) |
#define TABLE_GET(c, table, default) (MAX_255(c)? ((table)[c]):(default)) |
#define TABLE_GET(c, table, default) (MAX_255(c)? ((table)[c]):(default)) |
|
|
|
#elif defined COMPILE_PCRE32 |
|
|
|
typedef pcre_uint32 pcre_uchar; |
|
#define UCHAR_SHIFT (2) |
|
#define IN_UCHARS(x) ((x) << UCHAR_SHIFT) |
|
#define MAX_255(c) ((c) <= 255u) |
|
#define TABLE_GET(c, table, default) (MAX_255(c)? ((table)[c]):(default)) |
|
|
#else |
#else |
#error Unsupported compiling mode |
#error Unsupported compiling mode |
#endif /* COMPILE_PCRE16 */ | #endif /* COMPILE_PCRE[8|16|32] */ |
|
|
#endif /* COMPILE_PCRE8 */ |
|
|
|
/* This is an unsigned int value that no character can ever have. UTF-8 |
/* This is an unsigned int value that no character can ever have. UTF-8 |
characters only go up to 0x7fffffff (though Unicode doesn't go beyond |
characters only go up to 0x7fffffff (though Unicode doesn't go beyond |
0x0010ffff). */ |
0x0010ffff). */ |
Line 295 start/end of string field names are. */
|
Line 316 start/end of string field names are. */
|
&(NLBLOCK->nllen), utf)) \ |
&(NLBLOCK->nllen), utf)) \ |
: \ |
: \ |
((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \ |
((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \ |
(p)[0] == NLBLOCK->nl[0] && \ | RAWUCHARTEST(p) == NLBLOCK->nl[0] && \ |
(NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1]) \ | (NLBLOCK->nllen == 1 || RAWUCHARTEST(p+1) == NLBLOCK->nl[1]) \ |
) \ |
) \ |
) |
) |
|
|
Line 309 start/end of string field names are. */
|
Line 330 start/end of string field names are. */
|
&(NLBLOCK->nllen), utf)) \ |
&(NLBLOCK->nllen), utf)) \ |
: \ |
: \ |
((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \ |
((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \ |
(p)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \ | RAWUCHARTEST(p - NLBLOCK->nllen) == NLBLOCK->nl[0] && \ |
(NLBLOCK->nllen == 1 || (p)[-NLBLOCK->nllen+1] == NLBLOCK->nl[1]) \ | (NLBLOCK->nllen == 1 || RAWUCHARTEST(p - NLBLOCK->nllen + 1) == NLBLOCK->nl[1]) \ |
) \ |
) \ |
) |
) |
|
|
Line 335 values. */
|
Line 356 values. */
|
#include "pcre.h" |
#include "pcre.h" |
#include "ucp.h" |
#include "ucp.h" |
|
|
|
#ifdef COMPILE_PCRE32 |
|
/* Assert that the public PCRE_UCHAR32 is a 32-bit type */ |
|
typedef int __assert_pcre_uchar32_size[sizeof(PCRE_UCHAR32) == 4 ? 1 : -1]; |
|
#endif |
|
|
/* When compiling for use with the Virtual Pascal compiler, these functions |
/* When compiling for use with the Virtual Pascal compiler, these functions |
need to have their names changed. PCRE must be compiled with the -DVPCOMPAT |
need to have their names changed. PCRE must be compiled with the -DVPCOMPAT |
option on the command line. */ |
option on the command line. */ |
Line 396 The macros are controlled by the value of LINK_SIZE. T
|
Line 422 The macros are controlled by the value of LINK_SIZE. T
|
the config.h file, but can be overridden by using -D on the command line. This |
the config.h file, but can be overridden by using -D on the command line. This |
is automated on Unix systems via the "configure" command. */ |
is automated on Unix systems via the "configure" command. */ |
|
|
#ifdef COMPILE_PCRE8 | #if defined COMPILE_PCRE8 |
|
|
#if LINK_SIZE == 2 |
#if LINK_SIZE == 2 |
|
|
Line 441 is automated on Unix systems via the "configure" comma
|
Line 467 is automated on Unix systems via the "configure" comma
|
#error LINK_SIZE must be either 2, 3, or 4 |
#error LINK_SIZE must be either 2, 3, or 4 |
#endif |
#endif |
|
|
#else /* COMPILE_PCRE8 */ | #elif defined COMPILE_PCRE16 |
|
|
#ifdef COMPILE_PCRE16 |
|
|
|
#if LINK_SIZE == 2 |
#if LINK_SIZE == 2 |
|
|
|
/* Redefine LINK_SIZE as a multiple of sizeof(pcre_uchar) */ |
#undef LINK_SIZE |
#undef LINK_SIZE |
#define LINK_SIZE 1 |
#define LINK_SIZE 1 |
|
|
Line 460 is automated on Unix systems via the "configure" comma
|
Line 485 is automated on Unix systems via the "configure" comma
|
|
|
#elif LINK_SIZE == 3 || LINK_SIZE == 4 |
#elif LINK_SIZE == 3 || LINK_SIZE == 4 |
|
|
|
/* Redefine LINK_SIZE as a multiple of sizeof(pcre_uchar) */ |
#undef LINK_SIZE |
#undef LINK_SIZE |
#define LINK_SIZE 2 |
#define LINK_SIZE 2 |
|
|
Line 477 is automated on Unix systems via the "configure" comma
|
Line 503 is automated on Unix systems via the "configure" comma
|
#error LINK_SIZE must be either 2, 3, or 4 |
#error LINK_SIZE must be either 2, 3, or 4 |
#endif |
#endif |
|
|
|
#elif defined COMPILE_PCRE32 |
|
|
|
/* The only supported LINK_SIZE is 4 */ |
|
/* Redefine LINK_SIZE as a multiple of sizeof(pcre_uchar) */ |
|
#undef LINK_SIZE |
|
#define LINK_SIZE 1 |
|
|
|
#define PUT(a,n,d) \ |
|
(a[n] = (d)) |
|
|
|
#define GET(a,n) \ |
|
(a[n]) |
|
|
|
/* Keep it positive */ |
|
#define MAX_PATTERN_SIZE (1 << 30) |
|
|
#else |
#else |
#error Unsupported compiling mode |
#error Unsupported compiling mode |
#endif /* COMPILE_PCRE16 */ | #endif /* COMPILE_PCRE[8|16|32] */ |
|
|
#endif /* COMPILE_PCRE8 */ |
|
|
|
/* Convenience macro defined in terms of the others */ |
/* Convenience macro defined in terms of the others */ |
|
|
#define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE |
#define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE |
Line 492 is automated on Unix systems via the "configure" comma
|
Line 532 is automated on Unix systems via the "configure" comma
|
offsets changes. These are used for repeat counts and for other things such as |
offsets changes. These are used for repeat counts and for other things such as |
capturing parenthesis numbers in back references. */ |
capturing parenthesis numbers in back references. */ |
|
|
#ifdef COMPILE_PCRE8 | #if defined COMPILE_PCRE8 |
|
|
#define IMM2_SIZE 2 |
#define IMM2_SIZE 2 |
|
|
Line 500 capturing parenthesis numbers in back references. */
|
Line 540 capturing parenthesis numbers in back references. */
|
a[n] = (d) >> 8; \ |
a[n] = (d) >> 8; \ |
a[(n)+1] = (d) & 255 |
a[(n)+1] = (d) & 255 |
|
|
|
/* For reasons that I do not understand, the expression in this GET2 macro is |
|
treated by gcc as a signed expression, even when a is declared as unsigned. It |
|
seems that any kind of arithmetic results in a signed value. */ |
|
|
#define GET2(a,n) \ |
#define GET2(a,n) \ |
(((a)[n] << 8) | (a)[(n)+1]) | (unsigned int)(((a)[n] << 8) | (a)[(n)+1]) |
|
|
#else /* COMPILE_PCRE8 */ | #elif defined COMPILE_PCRE16 |
|
|
#ifdef COMPILE_PCRE16 | #define IMM2_SIZE 1 |
|
|
|
#define PUT2(a,n,d) \ |
|
a[n] = d |
|
|
|
#define GET2(a,n) \ |
|
a[n] |
|
|
|
#elif defined COMPILE_PCRE32 |
|
|
#define IMM2_SIZE 1 |
#define IMM2_SIZE 1 |
|
|
#define PUT2(a,n,d) \ |
#define PUT2(a,n,d) \ |
Line 517 capturing parenthesis numbers in back references. */
|
Line 569 capturing parenthesis numbers in back references. */
|
|
|
#else |
#else |
#error Unsupported compiling mode |
#error Unsupported compiling mode |
#endif /* COMPILE_PCRE16 */ | #endif /* COMPILE_PCRE[8|16|32] */ |
|
|
#endif /* COMPILE_PCRE8 */ |
|
|
|
#define PUT2INC(a,n,d) PUT2(a,n,d), a += IMM2_SIZE |
#define PUT2INC(a,n,d) PUT2(a,n,d), a += IMM2_SIZE |
|
|
/* The maximum length of a MARK name is currently one data unit; it may be |
/* The maximum length of a MARK name is currently one data unit; it may be |
changed in future to be a fixed number of bytes or to depend on LINK_SIZE. */ |
changed in future to be a fixed number of bytes or to depend on LINK_SIZE. */ |
|
|
#define MAX_MARK ((1 << (sizeof(pcre_uchar)*8)) - 1) | #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 |
| #define MAX_MARK ((1u << 16) - 1) |
| #else |
| #define MAX_MARK ((1u << 8) - 1) |
| #endif |
|
|
/* When UTF encoding is being used, a character is no longer just a single |
/* When UTF encoding is being used, a character is no longer just a single |
character. The macros for character handling generate simple sequences when | byte. The macros for character handling generate simple sequences when used in |
used in character-mode, and more complicated ones for UTF characters. | character-mode, and more complicated ones for UTF characters. GETCHARLENTEST |
GETCHARLENTEST and other macros are not used when UTF is not supported, | and other macros are not used when UTF is not supported, so they are not |
so they are not defined. To make sure they can never even appear when | defined. To make sure they can never even appear when UTF support is omitted, |
UTF support is omitted, we don't even define them. */ | we don't even define them. */ |
|
|
#ifndef SUPPORT_UTF |
#ifndef SUPPORT_UTF |
|
|
Line 546 UTF support is omitted, we don't even define them. */
|
Line 600 UTF support is omitted, we don't even define them. */
|
#define GETCHARINC(c, eptr) c = *eptr++; |
#define GETCHARINC(c, eptr) c = *eptr++; |
#define GETCHARINCTEST(c, eptr) c = *eptr++; |
#define GETCHARINCTEST(c, eptr) c = *eptr++; |
#define GETCHARLEN(c, eptr, len) c = *eptr; |
#define GETCHARLEN(c, eptr, len) c = *eptr; |
|
#define RAWUCHAR(eptr) (*(eptr)) |
|
#define RAWUCHARINC(eptr) (*(eptr)++) |
|
#define RAWUCHARTEST(eptr) (*(eptr)) |
|
#define RAWUCHARINCTEST(eptr) (*(eptr)++) |
/* #define GETCHARLENTEST(c, eptr, len) */ |
/* #define GETCHARLENTEST(c, eptr, len) */ |
/* #define BACKCHAR(eptr) */ |
/* #define BACKCHAR(eptr) */ |
/* #define FORWARDCHAR(eptr) */ |
/* #define FORWARDCHAR(eptr) */ |
Line 553 UTF support is omitted, we don't even define them. */
|
Line 611 UTF support is omitted, we don't even define them. */
|
|
|
#else /* SUPPORT_UTF */ |
#else /* SUPPORT_UTF */ |
|
|
#ifdef COMPILE_PCRE8 |
|
|
|
/* These macros were originally written in the form of loops that used data |
|
from the tables whose names start with PRIV(utf8_table). They were rewritten by |
|
a user so as not to use loops, because in some environments this gives a |
|
significant performance advantage, and it seems never to do any harm. */ |
|
|
|
/* Tells the biggest code point which can be encoded as a single character. */ |
|
|
|
#define MAX_VALUE_FOR_SINGLE_CHAR 127 |
|
|
|
/* Tests whether the code point needs extra characters to decode. */ |
/* Tests whether the code point needs extra characters to decode. */ |
|
|
#define HAS_EXTRALEN(c) ((c) >= 0xc0) | #define HASUTF8EXTRALEN(c) ((c) >= 0xc0) |
|
|
/* Returns the number of additional characters if HAS_EXTRALEN(c) is TRUE. |
|
Otherwise the behaviour is undefined. */ |
|
|
|
#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3f]) |
|
|
|
/* Returns TRUE, if the given character is not the first character |
|
of a UTF sequence. */ |
|
|
|
#define NOT_FIRSTCHAR(c) (((c) & 0xc0) == 0x80) |
|
|
|
/* Base macro to pick up the remaining bytes of a UTF-8 character, not |
/* Base macro to pick up the remaining bytes of a UTF-8 character, not |
advancing the pointer. */ |
advancing the pointer. */ |
|
|
Line 600 advancing the pointer. */
|
Line 637 advancing the pointer. */
|
((eptr[4] & 0x3f) << 6) | (eptr[5] & 0x3f); \ |
((eptr[4] & 0x3f) << 6) | (eptr[5] & 0x3f); \ |
} |
} |
|
|
/* Get the next UTF-8 character, not advancing the pointer. This is called when |
|
we know we are in UTF-8 mode. */ |
|
|
|
#define GETCHAR(c, eptr) \ |
|
c = *eptr; \ |
|
if (c >= 0xc0) GETUTF8(c, eptr); |
|
|
|
/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the |
|
pointer. */ |
|
|
|
#define GETCHARTEST(c, eptr) \ |
|
c = *eptr; \ |
|
if (utf && c >= 0xc0) GETUTF8(c, eptr); |
|
|
|
/* Base macro to pick up the remaining bytes of a UTF-8 character, advancing |
/* Base macro to pick up the remaining bytes of a UTF-8 character, advancing |
the pointer. */ |
the pointer. */ |
|
|
Line 648 the pointer. */
|
Line 671 the pointer. */
|
} \ |
} \ |
} |
} |
|
|
|
#if defined COMPILE_PCRE8 |
|
|
|
/* These macros were originally written in the form of loops that used data |
|
from the tables whose names start with PRIV(utf8_table). They were rewritten by |
|
a user so as not to use loops, because in some environments this gives a |
|
significant performance advantage, and it seems never to do any harm. */ |
|
|
|
/* Tells the biggest code point which can be encoded as a single character. */ |
|
|
|
#define MAX_VALUE_FOR_SINGLE_CHAR 127 |
|
|
|
/* Tests whether the code point needs extra characters to decode. */ |
|
|
|
#define HAS_EXTRALEN(c) ((c) >= 0xc0) |
|
|
|
/* Returns the number of additional characters if HAS_EXTRALEN(c) is TRUE. |
|
Otherwise the behaviour is undefined. */ |
|
|
|
#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3f]) |
|
|
|
/* Returns TRUE, if the given character is not the first character |
|
of a UTF sequence. */ |
|
|
|
#define NOT_FIRSTCHAR(c) (((c) & 0xc0) == 0x80) |
|
|
|
/* Get the next UTF-8 character, not advancing the pointer. This is called when |
|
we know we are in UTF-8 mode. */ |
|
|
|
#define GETCHAR(c, eptr) \ |
|
c = *eptr; \ |
|
if (c >= 0xc0) GETUTF8(c, eptr); |
|
|
|
/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the |
|
pointer. */ |
|
|
|
#define GETCHARTEST(c, eptr) \ |
|
c = *eptr; \ |
|
if (utf && c >= 0xc0) GETUTF8(c, eptr); |
|
|
/* Get the next UTF-8 character, advancing the pointer. This is called when we |
/* Get the next UTF-8 character, advancing the pointer. This is called when we |
know we are in UTF-8 mode. */ |
know we are in UTF-8 mode. */ |
|
|
Line 714 do not know if we are in UTF-8 mode. */
|
Line 776 do not know if we are in UTF-8 mode. */
|
c = *eptr; \ |
c = *eptr; \ |
if (utf && c >= 0xc0) GETUTF8LEN(c, eptr, len); |
if (utf && c >= 0xc0) GETUTF8LEN(c, eptr, len); |
|
|
|
/* Returns the next uchar, not advancing the pointer. This is called when |
|
we know we are in UTF mode. */ |
|
|
|
#define RAWUCHAR(eptr) \ |
|
(*(eptr)) |
|
|
|
/* Returns the next uchar, advancing the pointer. This is called when |
|
we know we are in UTF mode. */ |
|
|
|
#define RAWUCHARINC(eptr) \ |
|
(*((eptr)++)) |
|
|
|
/* Returns the next uchar, testing for UTF mode, and not advancing the |
|
pointer. */ |
|
|
|
#define RAWUCHARTEST(eptr) \ |
|
(*(eptr)) |
|
|
|
/* Returns the next uchar, testing for UTF mode, advancing the |
|
pointer. */ |
|
|
|
#define RAWUCHARINCTEST(eptr) \ |
|
(*((eptr)++)) |
|
|
/* If the pointer is not at the start of a character, move it back until |
/* If the pointer is not at the start of a character, move it back until |
it is. This is called only in UTF-8 mode - we don't put a test within the macro |
it is. This is called only in UTF-8 mode - we don't put a test within the macro |
because almost all calls are already within a block of UTF-8 only code. */ |
because almost all calls are already within a block of UTF-8 only code. */ |
Line 727 because almost all calls are already within a block of
|
Line 813 because almost all calls are already within a block of
|
#define ACROSSCHAR(condition, eptr, action) \ |
#define ACROSSCHAR(condition, eptr, action) \ |
while((condition) && ((eptr) & 0xc0) == 0x80) action |
while((condition) && ((eptr) & 0xc0) == 0x80) action |
|
|
#else /* COMPILE_PCRE8 */ | #elif defined COMPILE_PCRE16 |
|
|
#ifdef COMPILE_PCRE16 |
|
|
|
/* Tells the biggest code point which can be encoded as a single character. */ |
/* Tells the biggest code point which can be encoded as a single character. */ |
|
|
#define MAX_VALUE_FOR_SINGLE_CHAR 65535 |
#define MAX_VALUE_FOR_SINGLE_CHAR 65535 |
Line 811 we do not know if we are in UTF-16 mode. */
|
Line 895 we do not know if we are in UTF-16 mode. */
|
c = *eptr; \ |
c = *eptr; \ |
if (utf && (c & 0xfc00) == 0xd800) GETUTF16LEN(c, eptr, len); |
if (utf && (c & 0xfc00) == 0xd800) GETUTF16LEN(c, eptr, len); |
|
|
|
/* Returns the next uchar, not advancing the pointer. This is called when |
|
we know we are in UTF mode. */ |
|
|
|
#define RAWUCHAR(eptr) \ |
|
(*(eptr)) |
|
|
|
/* Returns the next uchar, advancing the pointer. This is called when |
|
we know we are in UTF mode. */ |
|
|
|
#define RAWUCHARINC(eptr) \ |
|
(*((eptr)++)) |
|
|
|
/* Returns the next uchar, testing for UTF mode, and not advancing the |
|
pointer. */ |
|
|
|
#define RAWUCHARTEST(eptr) \ |
|
(*(eptr)) |
|
|
|
/* Returns the next uchar, testing for UTF mode, advancing the |
|
pointer. */ |
|
|
|
#define RAWUCHARINCTEST(eptr) \ |
|
(*((eptr)++)) |
|
|
/* If the pointer is not at the start of a character, move it back until |
/* If the pointer is not at the start of a character, move it back until |
it is. This is called only in UTF-16 mode - we don't put a test within the |
it is. This is called only in UTF-16 mode - we don't put a test within the |
macro because almost all calls are already within a block of UTF-16 only |
macro because almost all calls are already within a block of UTF-16 only |
Line 825 code. */
|
Line 933 code. */
|
#define ACROSSCHAR(condition, eptr, action) \ |
#define ACROSSCHAR(condition, eptr, action) \ |
if ((condition) && ((eptr) & 0xfc00) == 0xdc00) action |
if ((condition) && ((eptr) & 0xfc00) == 0xdc00) action |
|
|
#endif | #elif defined COMPILE_PCRE32 |
|
|
#endif /* COMPILE_PCRE8 */ | /* These are trivial for the 32-bit library, since all UTF-32 characters fit |
| into one pcre_uchar unit. */ |
| #define MAX_VALUE_FOR_SINGLE_CHAR (0x10ffffu) |
| #define HAS_EXTRALEN(c) (0) |
| #define GET_EXTRALEN(c) (0) |
| #define NOT_FIRSTCHAR(c) (0) |
|
|
|
/* Get the next UTF-32 character, not advancing the pointer. This is called when |
|
we know we are in UTF-32 mode. */ |
|
|
|
#define GETCHAR(c, eptr) \ |
|
c = *(eptr); |
|
|
|
/* Get the next UTF-32 character, testing for UTF-32 mode, and not advancing the |
|
pointer. */ |
|
|
|
#define GETCHARTEST(c, eptr) \ |
|
c = *(eptr); |
|
|
|
/* Get the next UTF-32 character, advancing the pointer. This is called when we |
|
know we are in UTF-32 mode. */ |
|
|
|
#define GETCHARINC(c, eptr) \ |
|
c = *((eptr)++); |
|
|
|
/* Get the next character, testing for UTF-32 mode, and advancing the pointer. |
|
This is called when we don't know if we are in UTF-32 mode. */ |
|
|
|
#define GETCHARINCTEST(c, eptr) \ |
|
c = *((eptr)++); |
|
|
|
/* Get the next UTF-32 character, not advancing the pointer, not incrementing |
|
length (since all UTF-32 is of length 1). This is called when we know we are in |
|
UTF-32 mode. */ |
|
|
|
#define GETCHARLEN(c, eptr, len) \ |
|
GETCHAR(c, eptr) |
|
|
|
/* Get the next UTF-32 character, testing for UTF-32 mode, not advancing the |
|
pointer, not incrementing the length (since all UTF-32 is of length 1). |
|
This is called when we do not know if we are in UTF-32 mode. */ |
|
|
|
#define GETCHARLENTEST(c, eptr, len) \ |
|
GETCHARTEST(c, eptr) |
|
|
|
/* Returns the next uchar, not advancing the pointer. This is called when |
|
we know we are in UTF mode. */ |
|
|
|
#define RAWUCHAR(eptr) \ |
|
(*(eptr)) |
|
|
|
/* Returns the next uchar, advancing the pointer. This is called when |
|
we know we are in UTF mode. */ |
|
|
|
#define RAWUCHARINC(eptr) \ |
|
(*((eptr)++)) |
|
|
|
/* Returns the next uchar, testing for UTF mode, and not advancing the |
|
pointer. */ |
|
|
|
#define RAWUCHARTEST(eptr) \ |
|
(*(eptr)) |
|
|
|
/* Returns the next uchar, testing for UTF mode, advancing the |
|
pointer. */ |
|
|
|
#define RAWUCHARINCTEST(eptr) \ |
|
(*((eptr)++)) |
|
|
|
/* If the pointer is not at the start of a character, move it back until |
|
it is. This is called only in UTF-32 mode - we don't put a test within the |
|
macro because almost all calls are already within a block of UTF-32 only |
|
code. |
|
These are all no-ops since all UTF-32 characters fit into one pcre_uchar. */ |
|
|
|
#define BACKCHAR(eptr) do { } while (0) |
|
|
|
/* Same as above, just in the other direction. */ |
|
#define FORWARDCHAR(eptr) do { } while (0) |
|
|
|
/* Same as above, but it allows a fully customizable form. */ |
|
#define ACROSSCHAR(condition, eptr, action) do { } while (0) |
|
|
|
#else |
|
#error Unsupported compiling mode |
|
#endif /* COMPILE_PCRE[8|16|32] */ |
|
|
#endif /* SUPPORT_UTF */ |
#endif /* SUPPORT_UTF */ |
|
|
|
/* Tests for Unicode horizontal and vertical whitespace characters must check a |
|
number of different values. Using a switch statement for this generates the |
|
fastest code (no loop, no memory access), and there are several places in the |
|
interpreter code where this happens. In order to ensure that all the case lists |
|
remain in step, we use macros so that there is only one place where the lists |
|
are defined. |
|
|
/* In case there is no definition of offsetof() provided - though any proper | These values are also required as lists in pcre_compile.c when processing \h, |
Standard C system should have one. */ | \H, \v and \V in a character class. The lists are defined in pcre_tables.c, but |
| macros that define the values are here so that all the definitions are |
| together. The lists must be in ascending character order, terminated by |
| NOTACHAR (which is 0xffffffff). |
|
|
#ifndef offsetof | Any changes should ensure that the various macros are kept in step with each |
#define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field)) | other. NOTE: The values also appear in pcre_jit_compile.c. */ |
| |
| /* ------ ASCII/Unicode environments ------ */ |
| |
| #ifndef EBCDIC |
| |
| #define HSPACE_LIST \ |
| CHAR_HT, CHAR_SPACE, 0xa0, \ |
| 0x1680, 0x180e, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, \ |
| 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202f, 0x205f, 0x3000, \ |
| NOTACHAR |
| |
| #define HSPACE_MULTIBYTE_CASES \ |
| case 0x1680: /* OGHAM SPACE MARK */ \ |
| case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ \ |
| case 0x2000: /* EN QUAD */ \ |
| case 0x2001: /* EM QUAD */ \ |
| case 0x2002: /* EN SPACE */ \ |
| case 0x2003: /* EM SPACE */ \ |
| case 0x2004: /* THREE-PER-EM SPACE */ \ |
| case 0x2005: /* FOUR-PER-EM SPACE */ \ |
| case 0x2006: /* SIX-PER-EM SPACE */ \ |
| case 0x2007: /* FIGURE SPACE */ \ |
| case 0x2008: /* PUNCTUATION SPACE */ \ |
| case 0x2009: /* THIN SPACE */ \ |
| case 0x200A: /* HAIR SPACE */ \ |
| case 0x202f: /* NARROW NO-BREAK SPACE */ \ |
| case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ \ |
| case 0x3000 /* IDEOGRAPHIC SPACE */ |
| |
| #define HSPACE_BYTE_CASES \ |
| case CHAR_HT: \ |
| case CHAR_SPACE: \ |
| case 0xa0 /* NBSP */ |
| |
| #define HSPACE_CASES \ |
| HSPACE_BYTE_CASES: \ |
| HSPACE_MULTIBYTE_CASES |
| |
| #define VSPACE_LIST \ |
| CHAR_LF, CHAR_VT, CHAR_FF, CHAR_CR, CHAR_NEL, 0x2028, 0x2029, NOTACHAR |
| |
| #define VSPACE_MULTIBYTE_CASES \ |
| case 0x2028: /* LINE SEPARATOR */ \ |
| case 0x2029 /* PARAGRAPH SEPARATOR */ |
| |
| #define VSPACE_BYTE_CASES \ |
| case CHAR_LF: \ |
| case CHAR_VT: \ |
| case CHAR_FF: \ |
| case CHAR_CR: \ |
| case CHAR_NEL |
| |
| #define VSPACE_CASES \ |
| VSPACE_BYTE_CASES: \ |
| VSPACE_MULTIBYTE_CASES |
| |
| /* ------ EBCDIC environments ------ */ |
| |
| #else |
| #define HSPACE_LIST CHAR_HT, CHAR_SPACE |
| |
| #define HSPACE_BYTE_CASES \ |
| case CHAR_HT: \ |
| case CHAR_SPACE |
| |
| #define HSPACE_CASES HSPACE_BYTE_CASES |
| |
| #ifdef EBCDIC_NL25 |
| #define VSPACE_LIST \ |
| CHAR_VT, CHAR_FF, CHAR_CR, CHAR_NEL, CHAR_LF, NOTACHAR |
| #else |
| #define VSPACE_LIST \ |
| CHAR_VT, CHAR_FF, CHAR_CR, CHAR_LF, CHAR_NEL, NOTACHAR |
#endif |
#endif |
|
|
|
#define VSPACE_BYTE_CASES \ |
|
case CHAR_LF: \ |
|
case CHAR_VT: \ |
|
case CHAR_FF: \ |
|
case CHAR_CR: \ |
|
case CHAR_NEL |
|
|
|
#define VSPACE_CASES VSPACE_BYTE_CASES |
|
#endif /* EBCDIC */ |
|
|
|
/* ------ End of whitespace macros ------ */ |
|
|
|
|
|
|
/* Private flags containing information about the compiled regex. They used to |
/* Private flags containing information about the compiled regex. They used to |
live at the top end of the options word, but that got almost full, so now they | live at the top end of the options word, but that got almost full, so they were |
are in a 16-bit flags word. From release 8.00, PCRE_NOPARTIAL is unused, as | moved to a 16-bit flags word - which got almost full, so now they are in a |
the restrictions on partial matching have been lifted. It remains for backwards | 32-bit flags word. From release 8.00, PCRE_NOPARTIAL is unused, as the |
| restrictions on partial matching have been lifted. It remains for backwards |
compatibility. */ |
compatibility. */ |
|
|
#ifdef COMPILE_PCRE8 | #define PCRE_MODE8 0x00000001 /* compiled in 8 bit mode */ |
#define PCRE_MODE 0x0001 /* compiled in 8 bit mode */ | #define PCRE_MODE16 0x00000002 /* compiled in 16 bit mode */ |
| #define PCRE_MODE32 0x00000004 /* compiled in 32 bit mode */ |
| #define PCRE_FIRSTSET 0x00000010 /* first_char is set */ |
| #define PCRE_FCH_CASELESS 0x00000020 /* caseless first char */ |
| #define PCRE_REQCHSET 0x00000040 /* req_byte is set */ |
| #define PCRE_RCH_CASELESS 0x00000080 /* caseless requested char */ |
| #define PCRE_STARTLINE 0x00000100 /* start after \n for multiline */ |
| #define PCRE_NOPARTIAL 0x00000200 /* can't use partial with this regex */ |
| #define PCRE_JCHANGED 0x00000400 /* j option used in regex */ |
| #define PCRE_HASCRORLF 0x00000800 /* explicit \r or \n in pattern */ |
| #define PCRE_HASTHEN 0x00001000 /* pattern contains (*THEN) */ |
| #define PCRE_MLSET 0x00002000 /* match limit set by regex */ |
| #define PCRE_RLSET 0x00004000 /* recursion limit set by regex */ |
| #define PCRE_MATCH_EMPTY 0x00008000 /* pattern can match empty string */ |
| |
| #if defined COMPILE_PCRE8 |
| #define PCRE_MODE PCRE_MODE8 |
| #elif defined COMPILE_PCRE16 |
| #define PCRE_MODE PCRE_MODE16 |
| #elif defined COMPILE_PCRE32 |
| #define PCRE_MODE PCRE_MODE32 |
#endif |
#endif |
#ifdef COMPILE_PCRE16 | #define PCRE_MODE_MASK (PCRE_MODE8 | PCRE_MODE16 | PCRE_MODE32) |
#define PCRE_MODE 0x0002 /* compiled in 16 bit mode */ | |
#endif | |
#define PCRE_FIRSTSET 0x0010 /* first_char is set */ | |
#define PCRE_FCH_CASELESS 0x0020 /* caseless first char */ | |
#define PCRE_REQCHSET 0x0040 /* req_byte is set */ | |
#define PCRE_RCH_CASELESS 0x0080 /* caseless requested char */ | |
#define PCRE_STARTLINE 0x0100 /* start after \n for multiline */ | |
#define PCRE_NOPARTIAL 0x0200 /* can't use partial with this regex */ | |
#define PCRE_JCHANGED 0x0400 /* j option used in regex */ | |
#define PCRE_HASCRORLF 0x0800 /* explicit \r or \n in pattern */ | |
#define PCRE_HASTHEN 0x1000 /* pattern contains (*THEN) */ | |
|
|
/* Flags for the "extra" block produced by pcre_study(). */ |
/* Flags for the "extra" block produced by pcre_study(). */ |
|
|
Line 876 time, run time, or study time, respectively. */
|
Line 1174 time, run time, or study time, respectively. */
|
#define PUBLIC_COMPILE_OPTIONS \ |
#define PUBLIC_COMPILE_OPTIONS \ |
(PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \ |
(PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \ |
PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \ |
PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \ |
PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \ | PCRE_NO_AUTO_CAPTURE|PCRE_NO_AUTO_POSSESS| \ |
| PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \ |
PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \ |
PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \ |
PCRE_JAVASCRIPT_COMPAT|PCRE_UCP|PCRE_NO_START_OPTIMIZE) | PCRE_JAVASCRIPT_COMPAT|PCRE_UCP|PCRE_NO_START_OPTIMIZE|PCRE_NEVER_UTF) |
|
|
#define PUBLIC_EXEC_OPTIONS \ |
#define PUBLIC_EXEC_OPTIONS \ |
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \ |
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \ |
Line 893 time, run time, or study time, respectively. */
|
Line 1192 time, run time, or study time, respectively. */
|
|
|
#define PUBLIC_STUDY_OPTIONS \ |
#define PUBLIC_STUDY_OPTIONS \ |
(PCRE_STUDY_JIT_COMPILE|PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE| \ |
(PCRE_STUDY_JIT_COMPILE|PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE| \ |
PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) | PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE|PCRE_STUDY_EXTRA_NEEDED) |
|
|
|
#define PUBLIC_JIT_EXEC_OPTIONS \ |
|
(PCRE_NO_UTF8_CHECK|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|\ |
|
PCRE_NOTEMPTY_ATSTART|PCRE_PARTIAL_SOFT|PCRE_PARTIAL_HARD) |
|
|
/* Magic number to provide a small check against being handed junk. */ |
/* Magic number to provide a small check against being handed junk. */ |
|
|
#define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */ |
#define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */ |
Line 904 in different endianness. */
|
Line 1207 in different endianness. */
|
|
|
#define REVERSED_MAGIC_NUMBER 0x45524350UL /* 'ERCP' */ |
#define REVERSED_MAGIC_NUMBER 0x45524350UL /* 'ERCP' */ |
|
|
/* Negative values for the firstchar and reqchar variables */ |
|
|
|
#define REQ_UNSET (-2) |
|
#define REQ_NONE (-1) |
|
|
|
/* The maximum remaining length of subject we are prepared to search for a |
/* The maximum remaining length of subject we are prepared to search for a |
req_byte match. */ |
req_byte match. */ |
|
|
Line 945 macros to give the functions distinct names. */
|
Line 1243 macros to give the functions distinct names. */
|
#ifndef SUPPORT_UTF |
#ifndef SUPPORT_UTF |
|
|
/* UTF-8 support is not enabled; use the platform-dependent character literals |
/* UTF-8 support is not enabled; use the platform-dependent character literals |
so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */ | so that PCRE works in both ASCII and EBCDIC environments, but only in non-UTF |
| mode. Newline characters are problematic in EBCDIC. Though it has CR and LF |
| characters, a common practice has been to use its NL (0x15) character as the |
| line terminator in C-like processing environments. However, sometimes the LF |
| (0x25) character is used instead, according to this Unicode document: |
|
|
|
http://unicode.org/standard/reports/tr13/tr13-5.html |
|
|
|
PCRE defaults EBCDIC NL to 0x15, but has a build-time option to select 0x25 |
|
instead. Whichever is *not* chosen is defined as NEL. |
|
|
|
In both ASCII and EBCDIC environments, CHAR_NL and CHAR_LF are synonyms for the |
|
same code point. */ |
|
|
|
#ifdef EBCDIC |
|
|
|
#ifndef EBCDIC_NL25 |
|
#define CHAR_NL '\x15' |
|
#define CHAR_NEL '\x25' |
|
#define STR_NL "\x15" |
|
#define STR_NEL "\x25" |
|
#else |
|
#define CHAR_NL '\x25' |
|
#define CHAR_NEL '\x15' |
|
#define STR_NL "\x25" |
|
#define STR_NEL "\x15" |
|
#endif |
|
|
|
#define CHAR_LF CHAR_NL |
|
#define STR_LF STR_NL |
|
|
|
#define CHAR_ESC '\047' |
|
#define CHAR_DEL '\007' |
|
#define STR_ESC "\047" |
|
#define STR_DEL "\007" |
|
|
|
#else /* Not EBCDIC */ |
|
|
|
/* In ASCII/Unicode, linefeed is '\n' and we equate this to NL for |
|
compatibility. NEL is the Unicode newline character; make sure it is |
|
a positive value. */ |
|
|
|
#define CHAR_LF '\n' |
|
#define CHAR_NL CHAR_LF |
|
#define CHAR_NEL ((unsigned char)'\x85') |
|
#define CHAR_ESC '\033' |
|
#define CHAR_DEL '\177' |
|
|
|
#define STR_LF "\n" |
|
#define STR_NL STR_LF |
|
#define STR_NEL "\x85" |
|
#define STR_ESC "\033" |
|
#define STR_DEL "\177" |
|
|
|
#endif /* EBCDIC */ |
|
|
|
/* The remaining definitions work in both environments. */ |
|
|
|
#define CHAR_NULL '\0' |
#define CHAR_HT '\t' |
#define CHAR_HT '\t' |
#define CHAR_VT '\v' |
#define CHAR_VT '\v' |
#define CHAR_FF '\f' |
#define CHAR_FF '\f' |
#define CHAR_CR '\r' |
#define CHAR_CR '\r' |
#define CHAR_NL '\n' |
|
#define CHAR_BS '\b' |
#define CHAR_BS '\b' |
#define CHAR_BEL '\a' |
#define CHAR_BEL '\a' |
#ifdef EBCDIC |
|
#define CHAR_ESC '\047' |
|
#define CHAR_DEL '\007' |
|
#else |
|
#define CHAR_ESC '\033' |
|
#define CHAR_DEL '\177' |
|
#endif |
|
|
|
#define CHAR_SPACE ' ' |
#define CHAR_SPACE ' ' |
#define CHAR_EXCLAMATION_MARK '!' |
#define CHAR_EXCLAMATION_MARK '!' |
Line 1062 so that PCRE works on both ASCII and EBCDIC platforms,
|
Line 1409 so that PCRE works on both ASCII and EBCDIC platforms,
|
#define STR_VT "\v" |
#define STR_VT "\v" |
#define STR_FF "\f" |
#define STR_FF "\f" |
#define STR_CR "\r" |
#define STR_CR "\r" |
#define STR_NL "\n" |
|
#define STR_BS "\b" |
#define STR_BS "\b" |
#define STR_BEL "\a" |
#define STR_BEL "\a" |
#ifdef EBCDIC |
|
#define STR_ESC "\047" |
|
#define STR_DEL "\007" |
|
#else |
|
#define STR_ESC "\033" |
|
#define STR_DEL "\177" |
|
#endif |
|
|
|
#define STR_SPACE " " |
#define STR_SPACE " " |
#define STR_EXCLAMATION_MARK "!" |
#define STR_EXCLAMATION_MARK "!" |
Line 1194 so that PCRE works on both ASCII and EBCDIC platforms,
|
Line 1533 so that PCRE works on both ASCII and EBCDIC platforms,
|
#define STRING_xdigit "xdigit" |
#define STRING_xdigit "xdigit" |
|
|
#define STRING_DEFINE "DEFINE" |
#define STRING_DEFINE "DEFINE" |
|
#define STRING_WEIRD_STARTWORD "[:<:]]" |
|
#define STRING_WEIRD_ENDWORD "[:>:]]" |
|
|
#define STRING_CR_RIGHTPAR "CR)" | #define STRING_CR_RIGHTPAR "CR)" |
#define STRING_LF_RIGHTPAR "LF)" | #define STRING_LF_RIGHTPAR "LF)" |
#define STRING_CRLF_RIGHTPAR "CRLF)" | #define STRING_CRLF_RIGHTPAR "CRLF)" |
#define STRING_ANY_RIGHTPAR "ANY)" | #define STRING_ANY_RIGHTPAR "ANY)" |
#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)" | #define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)" |
#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)" | #define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)" |
#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)" | #define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)" |
#ifdef COMPILE_PCRE8 | #define STRING_UTF8_RIGHTPAR "UTF8)" |
#define STRING_UTF_RIGHTPAR "UTF8)" | #define STRING_UTF16_RIGHTPAR "UTF16)" |
#endif | #define STRING_UTF32_RIGHTPAR "UTF32)" |
#ifdef COMPILE_PCRE16 | #define STRING_UTF_RIGHTPAR "UTF)" |
#define STRING_UTF_RIGHTPAR "UTF16)" | #define STRING_UCP_RIGHTPAR "UCP)" |
#endif | #define STRING_NO_AUTO_POSSESS_RIGHTPAR "NO_AUTO_POSSESS)" |
#define STRING_UCP_RIGHTPAR "UCP)" | #define STRING_NO_START_OPT_RIGHTPAR "NO_START_OPT)" |
#define STRING_NO_START_OPT_RIGHTPAR "NO_START_OPT)" | #define STRING_LIMIT_MATCH_EQ "LIMIT_MATCH=" |
| #define STRING_LIMIT_RECURSION_EQ "LIMIT_RECURSION=" |
|
|
#else /* SUPPORT_UTF */ |
#else /* SUPPORT_UTF */ |
|
|
Line 1221 only. */
|
Line 1563 only. */
|
#define CHAR_VT '\013' |
#define CHAR_VT '\013' |
#define CHAR_FF '\014' |
#define CHAR_FF '\014' |
#define CHAR_CR '\015' |
#define CHAR_CR '\015' |
#define CHAR_NL '\012' | #define CHAR_LF '\012' |
| #define CHAR_NL CHAR_LF |
| #define CHAR_NEL ((unsigned char)'\x85') |
#define CHAR_BS '\010' |
#define CHAR_BS '\010' |
#define CHAR_BEL '\007' |
#define CHAR_BEL '\007' |
#define CHAR_ESC '\033' |
#define CHAR_ESC '\033' |
#define CHAR_DEL '\177' |
#define CHAR_DEL '\177' |
|
|
|
#define CHAR_NULL '\0' |
#define CHAR_SPACE '\040' |
#define CHAR_SPACE '\040' |
#define CHAR_EXCLAMATION_MARK '\041' |
#define CHAR_EXCLAMATION_MARK '\041' |
#define CHAR_QUOTATION_MARK '\042' |
#define CHAR_QUOTATION_MARK '\042' |
Line 1454 only. */
|
Line 1799 only. */
|
#define STRING_xdigit STR_x STR_d STR_i STR_g STR_i STR_t |
#define STRING_xdigit STR_x STR_d STR_i STR_g STR_i STR_t |
|
|
#define STRING_DEFINE STR_D STR_E STR_F STR_I STR_N STR_E |
#define STRING_DEFINE STR_D STR_E STR_F STR_I STR_N STR_E |
|
#define STRING_WEIRD_STARTWORD STR_LEFT_SQUARE_BRACKET STR_COLON STR_LESS_THAN_SIGN STR_COLON STR_RIGHT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET |
|
#define STRING_WEIRD_ENDWORD STR_LEFT_SQUARE_BRACKET STR_COLON STR_GREATER_THAN_SIGN STR_COLON STR_RIGHT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET |
|
|
#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS | #define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS |
#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS | #define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS |
#define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS | #define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS |
#define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS | #define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS |
#define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS | #define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS |
#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS | #define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS |
#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS | #define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS |
#ifdef COMPILE_PCRE8 | #define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS |
#define STRING_UTF_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS | #define STRING_UTF16_RIGHTPAR STR_U STR_T STR_F STR_1 STR_6 STR_RIGHT_PARENTHESIS |
#endif | #define STRING_UTF32_RIGHTPAR STR_U STR_T STR_F STR_3 STR_2 STR_RIGHT_PARENTHESIS |
#ifdef COMPILE_PCRE16 | #define STRING_UTF_RIGHTPAR STR_U STR_T STR_F STR_RIGHT_PARENTHESIS |
#define STRING_UTF_RIGHTPAR STR_U STR_T STR_F STR_1 STR_6 STR_RIGHT_PARENTHESIS | #define STRING_UCP_RIGHTPAR STR_U STR_C STR_P STR_RIGHT_PARENTHESIS |
#endif | #define STRING_NO_AUTO_POSSESS_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_A STR_U STR_T STR_O STR_UNDERSCORE STR_P STR_O STR_S STR_S STR_E STR_S STR_S STR_RIGHT_PARENTHESIS |
#define STRING_UCP_RIGHTPAR STR_U STR_C STR_P STR_RIGHT_PARENTHESIS | #define STRING_NO_START_OPT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS |
#define STRING_NO_START_OPT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS | #define STRING_LIMIT_MATCH_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_M STR_A STR_T STR_C STR_H STR_EQUALS_SIGN |
| #define STRING_LIMIT_RECURSION_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_R STR_E STR_C STR_U STR_R STR_S STR_I STR_O STR_N STR_EQUALS_SIGN |
|
|
#endif /* SUPPORT_UTF */ |
#endif /* SUPPORT_UTF */ |
|
|
Line 1484 only. */
|
Line 1832 only. */
|
#endif |
#endif |
|
|
#ifndef ESC_n |
#ifndef ESC_n |
#define ESC_n CHAR_NL | #define ESC_n CHAR_LF |
#endif |
#endif |
|
|
#ifndef ESC_r |
#ifndef ESC_r |
Line 1509 only. */
|
Line 1857 only. */
|
#define PT_SPACE 6 /* Perl space - Z plus 9,10,12,13 */ |
#define PT_SPACE 6 /* Perl space - Z plus 9,10,12,13 */ |
#define PT_PXSPACE 7 /* POSIX space - Z plus 9,10,11,12,13 */ |
#define PT_PXSPACE 7 /* POSIX space - Z plus 9,10,11,12,13 */ |
#define PT_WORD 8 /* Word - L plus N plus underscore */ |
#define PT_WORD 8 /* Word - L plus N plus underscore */ |
|
#define PT_CLIST 9 /* Pseudo-property: match character list */ |
|
#define PT_UCNC 10 /* Universal Character nameable character */ |
|
#define PT_TABSIZE 11 /* Size of square table for autopossessify tests */ |
|
|
|
/* The following special properties are used only in XCLASS items, when POSIX |
|
classes are specified and PCRE_UCP is set - in other words, for Unicode |
|
handling of these classes. They are not available via the \p or \P escapes like |
|
those in the above list, and so they do not take part in the autopossessifying |
|
table. */ |
|
|
|
#define PT_PXGRAPH 11 /* [:graph:] - characters that mark the paper */ |
|
#define PT_PXPRINT 12 /* [:print:] - [:graph:] plus non-control spaces */ |
|
#define PT_PXPUNCT 13 /* [:punct:] - punctuation characters */ |
|
|
/* Flag bits and data types for the extended class (OP_XCLASS) for classes that |
/* Flag bits and data types for the extended class (OP_XCLASS) for classes that |
contain characters with values greater than 255. */ |
contain characters with values greater than 255. */ |
|
|
Line 1523 contain characters with values greater than 255. */
|
Line 1884 contain characters with values greater than 255. */
|
#define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */ |
#define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */ |
|
|
/* These are escaped items that aren't just an encoding of a particular data |
/* These are escaped items that aren't just an encoding of a particular data |
value such as \n. They must have non-zero values, as check_escape() returns | value such as \n. They must have non-zero values, as check_escape() returns 0 |
their negation. Also, they must appear in the same order as in the opcode | for a data character. Also, they must appear in the same order as in the |
definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it | opcode definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it |
corresponds to "." in DOTALL mode rather than an escape sequence. It is also |
corresponds to "." in DOTALL mode rather than an escape sequence. It is also |
used for [^] in JavaScript compatibility mode, and for \C in non-utf mode. In |
used for [^] in JavaScript compatibility mode, and for \C in non-utf mode. In |
non-DOTALL mode, "." behaves like \N. |
non-DOTALL mode, "." behaves like \N. |
|
|
The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc. |
The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc. |
when PCRE_UCP is set, when replacement of \d etc by \p sequences is required. | when PCRE_UCP is set and replacement of \d etc by \p sequences is required. |
They must be contiguous, and remain in order so that the replacements can be |
They must be contiguous, and remain in order so that the replacements can be |
looked up from a table. |
looked up from a table. |
|
|
The final escape must be ESC_REF as subsequent values are used for | Negative numbers are used to encode a backreference (\1, \2, \3, etc.) in |
backreferences (\1, \2, \3, etc). There are two tests in the code for an escape | check_escape(). There are two tests in the code for an escape |
greater than ESC_b and less than ESC_Z to detect the types that may be |
greater than ESC_b and less than ESC_Z to detect the types that may be |
repeated. These are the types that consume characters. If any new escapes are |
repeated. These are the types that consume characters. If any new escapes are |
put in between that don't consume a character, that code will have to change. |
put in between that don't consume a character, that code will have to change. |
Line 1546 enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, E
|
Line 1907 enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, E
|
ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H, |
ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H, |
ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, |
ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, |
ESC_E, ESC_Q, ESC_g, ESC_k, |
ESC_E, ESC_Q, ESC_g, ESC_k, |
ESC_DU, ESC_du, ESC_SU, ESC_su, ESC_WU, ESC_wu, | ESC_DU, ESC_du, ESC_SU, ESC_su, ESC_WU, ESC_wu }; |
ESC_REF }; | |
|
|
/* Opcode table: Starting from 1 (i.e. after OP_END), the values up to |
|
OP_EOD must correspond in order to the list of escapes immediately above. |
|
|
|
*** NOTE NOTE NOTE *** Whenever this list is updated, the two macro definitions | /********************** Opcode definitions ******************/ |
that follow must also be updated to match. There are also tables called | |
"coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */ | |
|
|
|
/****** NOTE NOTE NOTE ****** |
|
|
|
Starting from 1 (i.e. after OP_END), the values up to OP_EOD must correspond in |
|
order to the list of escapes immediately above. Furthermore, values up to |
|
OP_DOLLM must not be changed without adjusting the table called autoposstab in |
|
pcre_compile.c |
|
|
|
Whenever this list is updated, the two macro definitions that follow must be |
|
updated to match. The possessification table called "opcode_possessify" in |
|
pcre_compile.c must also be updated, and also the tables called "coptable" |
|
and "poptable" in pcre_dfa_exec.c. |
|
|
|
****** NOTE NOTE NOTE ******/ |
|
|
|
|
|
/* The values between FIRST_AUTOTAB_OP and LAST_AUTOTAB_RIGHT_OP, inclusive, |
|
are used in a table for deciding whether a repeated character type can be |
|
auto-possessified. */ |
|
|
|
#define FIRST_AUTOTAB_OP OP_NOT_DIGIT |
|
#define LAST_AUTOTAB_LEFT_OP OP_EXTUNI |
|
#define LAST_AUTOTAB_RIGHT_OP OP_DOLLM |
|
|
enum { |
enum { |
OP_END, /* 0 End of pattern */ |
OP_END, /* 0 End of pattern */ |
|
|
Line 1573 enum {
|
Line 1952 enum {
|
OP_NOT_WORDCHAR, /* 10 \W */ |
OP_NOT_WORDCHAR, /* 10 \W */ |
OP_WORDCHAR, /* 11 \w */ |
OP_WORDCHAR, /* 11 \w */ |
|
|
OP_ANY, /* 12 Match any character except newline */ | OP_ANY, /* 12 Match any character except newline (\N) */ |
OP_ALLANY, /* 13 Match any character */ |
OP_ALLANY, /* 13 Match any character */ |
OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */ |
OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */ |
OP_NOTPROP, /* 15 \P (not Unicode property) */ |
OP_NOTPROP, /* 15 \P (not Unicode property) */ |
Line 1584 enum {
|
Line 1963 enum {
|
OP_NOT_VSPACE, /* 20 \V (not vertical whitespace) */ |
OP_NOT_VSPACE, /* 20 \V (not vertical whitespace) */ |
OP_VSPACE, /* 21 \v (vertical whitespace) */ |
OP_VSPACE, /* 21 \v (vertical whitespace) */ |
OP_EXTUNI, /* 22 \X (extended Unicode sequence) */ |
OP_EXTUNI, /* 22 \X (extended Unicode sequence) */ |
OP_EODN, /* 23 End of data or \n at end of data: \Z. */ | OP_EODN, /* 23 End of data or \n at end of data (\Z) */ |
OP_EOD, /* 24 End of data: \z */ | OP_EOD, /* 24 End of data (\z) */ |
|
|
OP_CIRC, /* 25 Start of line - not multiline */ | /* Line end assertions */ |
OP_CIRCM, /* 26 Start of line - multiline */ | |
OP_DOLL, /* 27 End of line - not multiline */ | OP_DOLL, /* 25 End of line - not multiline */ |
OP_DOLLM, /* 28 End of line - multiline */ | OP_DOLLM, /* 26 End of line - multiline */ |
| OP_CIRC, /* 27 Start of line - not multiline */ |
| OP_CIRCM, /* 28 Start of line - multiline */ |
| |
| /* Single characters; caseful must precede the caseless ones */ |
| |
OP_CHAR, /* 29 Match one character, casefully */ |
OP_CHAR, /* 29 Match one character, casefully */ |
OP_CHARI, /* 30 Match one character, caselessly */ |
OP_CHARI, /* 30 Match one character, caselessly */ |
OP_NOT, /* 31 Match one character, not the given one, casefully */ |
OP_NOT, /* 31 Match one character, not the given one, casefully */ |
Line 1599 enum {
|
Line 1983 enum {
|
/* The following sets of 13 opcodes must always be kept in step because |
/* The following sets of 13 opcodes must always be kept in step because |
the offset from the first one is used to generate the others. */ |
the offset from the first one is used to generate the others. */ |
|
|
/**** Single characters, caseful, must precede the caseless ones ****/ | /* Repeated characters; caseful must precede the caseless ones */ |
|
|
OP_STAR, /* 33 The maximizing and minimizing versions of */ |
OP_STAR, /* 33 The maximizing and minimizing versions of */ |
OP_MINSTAR, /* 34 these six opcodes must come in pairs, with */ |
OP_MINSTAR, /* 34 these six opcodes must come in pairs, with */ |
Line 1617 enum {
|
Line 2001 enum {
|
OP_POSQUERY, /* 44 Possessified query, caseful */ |
OP_POSQUERY, /* 44 Possessified query, caseful */ |
OP_POSUPTO, /* 45 Possessified upto, caseful */ |
OP_POSUPTO, /* 45 Possessified upto, caseful */ |
|
|
/**** Single characters, caseless, must follow the caseful ones */ | /* Repeated characters; caseless must follow the caseful ones */ |
|
|
OP_STARI, /* 46 */ |
OP_STARI, /* 46 */ |
OP_MINSTARI, /* 47 */ |
OP_MINSTARI, /* 47 */ |
Line 1635 enum {
|
Line 2019 enum {
|
OP_POSQUERYI, /* 57 Possessified query, caseless */ |
OP_POSQUERYI, /* 57 Possessified query, caseless */ |
OP_POSUPTOI, /* 58 Possessified upto, caseless */ |
OP_POSUPTOI, /* 58 Possessified upto, caseless */ |
|
|
/**** The negated ones must follow the non-negated ones, and match them ****/ | /* The negated ones must follow the non-negated ones, and match them */ |
/**** Negated single character, caseful; must precede the caseless ones ****/ | /* Negated repeated character, caseful; must precede the caseless ones */ |
|
|
OP_NOTSTAR, /* 59 The maximizing and minimizing versions of */ |
OP_NOTSTAR, /* 59 The maximizing and minimizing versions of */ |
OP_NOTMINSTAR, /* 60 these six opcodes must come in pairs, with */ |
OP_NOTMINSTAR, /* 60 these six opcodes must come in pairs, with */ |
Line 1654 enum {
|
Line 2038 enum {
|
OP_NOTPOSQUERY, /* 70 */ |
OP_NOTPOSQUERY, /* 70 */ |
OP_NOTPOSUPTO, /* 71 */ |
OP_NOTPOSUPTO, /* 71 */ |
|
|
/**** Negated single character, caseless; must follow the caseful ones ****/ | /* Negated repeated character, caseless; must follow the caseful ones */ |
|
|
OP_NOTSTARI, /* 72 */ |
OP_NOTSTARI, /* 72 */ |
OP_NOTMINSTARI, /* 73 */ |
OP_NOTMINSTARI, /* 73 */ |
Line 1672 enum {
|
Line 2056 enum {
|
OP_NOTPOSQUERYI, /* 83 */ |
OP_NOTPOSQUERYI, /* 83 */ |
OP_NOTPOSUPTOI, /* 84 */ |
OP_NOTPOSUPTOI, /* 84 */ |
|
|
/**** Character types ****/ | /* Character types */ |
|
|
OP_TYPESTAR, /* 85 The maximizing and minimizing versions of */ |
OP_TYPESTAR, /* 85 The maximizing and minimizing versions of */ |
OP_TYPEMINSTAR, /* 86 these six opcodes must come in pairs, with */ |
OP_TYPEMINSTAR, /* 86 these six opcodes must come in pairs, with */ |
Line 1703 enum {
|
Line 2087 enum {
|
OP_CRRANGE, /* 104 These are different to the three sets above. */ |
OP_CRRANGE, /* 104 These are different to the three sets above. */ |
OP_CRMINRANGE, /* 105 */ |
OP_CRMINRANGE, /* 105 */ |
|
|
|
OP_CRPOSSTAR, /* 106 Possessified versions */ |
|
OP_CRPOSPLUS, /* 107 */ |
|
OP_CRPOSQUERY, /* 108 */ |
|
OP_CRPOSRANGE, /* 109 */ |
|
|
/* End of quantifier opcodes */ |
/* End of quantifier opcodes */ |
|
|
OP_CLASS, /* 106 Match a character class, chars < 256 only */ | OP_CLASS, /* 110 Match a character class, chars < 256 only */ |
OP_NCLASS, /* 107 Same, but the bitmap was created from a negative | OP_NCLASS, /* 111 Same, but the bitmap was created from a negative |
class - the difference is relevant only when a |
class - the difference is relevant only when a |
character > 255 is encountered. */ |
character > 255 is encountered. */ |
OP_XCLASS, /* 108 Extended class for handling > 255 chars within the | OP_XCLASS, /* 112 Extended class for handling > 255 chars within the |
class. This does both positive and negative. */ |
class. This does both positive and negative. */ |
OP_REF, /* 109 Match a back reference, casefully */ | OP_REF, /* 113 Match a back reference, casefully */ |
OP_REFI, /* 110 Match a back reference, caselessly */ | OP_REFI, /* 114 Match a back reference, caselessly */ |
OP_RECURSE, /* 111 Match a numbered subpattern (possibly recursive) */ | OP_DNREF, /* 115 Match a duplicate name backref, casefully */ |
OP_CALLOUT, /* 112 Call out to external function if provided */ | OP_DNREFI, /* 116 Match a duplicate name backref, caselessly */ |
| OP_RECURSE, /* 117 Match a numbered subpattern (possibly recursive) */ |
| OP_CALLOUT, /* 118 Call out to external function if provided */ |
|
|
OP_ALT, /* 113 Start of alternation */ | OP_ALT, /* 119 Start of alternation */ |
OP_KET, /* 114 End of group that doesn't have an unbounded repeat */ | OP_KET, /* 120 End of group that doesn't have an unbounded repeat */ |
OP_KETRMAX, /* 115 These two must remain together and in this */ | OP_KETRMAX, /* 121 These two must remain together and in this */ |
OP_KETRMIN, /* 116 order. They are for groups the repeat for ever. */ | OP_KETRMIN, /* 122 order. They are for groups the repeat for ever. */ |
OP_KETRPOS, /* 117 Possessive unlimited repeat. */ | OP_KETRPOS, /* 123 Possessive unlimited repeat. */ |
|
|
/* The assertions must come before BRA, CBRA, ONCE, and COND, and the four |
/* The assertions must come before BRA, CBRA, ONCE, and COND, and the four |
asserts must remain in order. */ |
asserts must remain in order. */ |
|
|
OP_REVERSE, /* 118 Move pointer back - used in lookbehind assertions */ | OP_REVERSE, /* 124 Move pointer back - used in lookbehind assertions */ |
OP_ASSERT, /* 119 Positive lookahead */ | OP_ASSERT, /* 125 Positive lookahead */ |
OP_ASSERT_NOT, /* 120 Negative lookahead */ | OP_ASSERT_NOT, /* 126 Negative lookahead */ |
OP_ASSERTBACK, /* 121 Positive lookbehind */ | OP_ASSERTBACK, /* 127 Positive lookbehind */ |
OP_ASSERTBACK_NOT, /* 122 Negative lookbehind */ | OP_ASSERTBACK_NOT, /* 128 Negative lookbehind */ |
|
|
/* ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come immediately |
/* ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come immediately |
after the assertions, with ONCE first, as there's a test for >= ONCE for a |
after the assertions, with ONCE first, as there's a test for >= ONCE for a |
subpattern that isn't an assertion. The POS versions must immediately follow |
subpattern that isn't an assertion. The POS versions must immediately follow |
the non-POS versions in each case. */ |
the non-POS versions in each case. */ |
|
|
OP_ONCE, /* 123 Atomic group, contains captures */ | OP_ONCE, /* 129 Atomic group, contains captures */ |
OP_ONCE_NC, /* 124 Atomic group containing no captures */ | OP_ONCE_NC, /* 130 Atomic group containing no captures */ |
OP_BRA, /* 125 Start of non-capturing bracket */ | OP_BRA, /* 131 Start of non-capturing bracket */ |
OP_BRAPOS, /* 126 Ditto, with unlimited, possessive repeat */ | OP_BRAPOS, /* 132 Ditto, with unlimited, possessive repeat */ |
OP_CBRA, /* 127 Start of capturing bracket */ | OP_CBRA, /* 133 Start of capturing bracket */ |
OP_CBRAPOS, /* 128 Ditto, with unlimited, possessive repeat */ | OP_CBRAPOS, /* 134 Ditto, with unlimited, possessive repeat */ |
OP_COND, /* 129 Conditional group */ | OP_COND, /* 135 Conditional group */ |
|
|
/* These five must follow the previous five, in the same order. There's a |
/* These five must follow the previous five, in the same order. There's a |
check for >= SBRA to distinguish the two sets. */ |
check for >= SBRA to distinguish the two sets. */ |
|
|
OP_SBRA, /* 130 Start of non-capturing bracket, check empty */ | OP_SBRA, /* 136 Start of non-capturing bracket, check empty */ |
OP_SBRAPOS, /* 131 Ditto, with unlimited, possessive repeat */ | OP_SBRAPOS, /* 137 Ditto, with unlimited, possessive repeat */ |
OP_SCBRA, /* 132 Start of capturing bracket, check empty */ | OP_SCBRA, /* 138 Start of capturing bracket, check empty */ |
OP_SCBRAPOS, /* 133 Ditto, with unlimited, possessive repeat */ | OP_SCBRAPOS, /* 139 Ditto, with unlimited, possessive repeat */ |
OP_SCOND, /* 134 Conditional group, check empty */ | OP_SCOND, /* 140 Conditional group, check empty */ |
|
|
/* The next two pairs must (respectively) be kept together. */ |
/* The next two pairs must (respectively) be kept together. */ |
|
|
OP_CREF, /* 135 Used to hold a capture number as condition */ | OP_CREF, /* 141 Used to hold a capture number as condition */ |
OP_NCREF, /* 136 Same, but generated by a name reference*/ | OP_DNCREF, /* 142 Used to point to duplicate names as a condition */ |
OP_RREF, /* 137 Used to hold a recursion number as condition */ | OP_RREF, /* 143 Used to hold a recursion number as condition */ |
OP_NRREF, /* 138 Same, but generated by a name reference*/ | OP_DNRREF, /* 144 Used to point to duplicate names as a condition */ |
OP_DEF, /* 139 The DEFINE condition */ | OP_DEF, /* 145 The DEFINE condition */ |
|
|
OP_BRAZERO, /* 140 These two must remain together and in this */ | OP_BRAZERO, /* 146 These two must remain together and in this */ |
OP_BRAMINZERO, /* 141 order. */ | OP_BRAMINZERO, /* 147 order. */ |
OP_BRAPOSZERO, /* 142 */ | OP_BRAPOSZERO, /* 148 */ |
|
|
/* These are backtracking control verbs */ |
/* These are backtracking control verbs */ |
|
|
OP_MARK, /* 143 always has an argument */ | OP_MARK, /* 149 always has an argument */ |
OP_PRUNE, /* 144 */ | OP_PRUNE, /* 150 */ |
OP_PRUNE_ARG, /* 145 same, but with argument */ | OP_PRUNE_ARG, /* 151 same, but with argument */ |
OP_SKIP, /* 146 */ | OP_SKIP, /* 152 */ |
OP_SKIP_ARG, /* 147 same, but with argument */ | OP_SKIP_ARG, /* 153 same, but with argument */ |
OP_THEN, /* 148 */ | OP_THEN, /* 154 */ |
OP_THEN_ARG, /* 149 same, but with argument */ | OP_THEN_ARG, /* 155 same, but with argument */ |
OP_COMMIT, /* 150 */ | OP_COMMIT, /* 156 */ |
|
|
/* These are forced failure and success verbs */ |
/* These are forced failure and success verbs */ |
|
|
OP_FAIL, /* 151 */ | OP_FAIL, /* 157 */ |
OP_ACCEPT, /* 152 */ | OP_ACCEPT, /* 158 */ |
OP_ASSERT_ACCEPT, /* 153 Used inside assertions */ | OP_ASSERT_ACCEPT, /* 159 Used inside assertions */ |
OP_CLOSE, /* 154 Used before OP_ACCEPT to close open captures */ | OP_CLOSE, /* 160 Used before OP_ACCEPT to close open captures */ |
|
|
/* This is used to skip a subpattern with a {0} quantifier */ |
/* This is used to skip a subpattern with a {0} quantifier */ |
|
|
OP_SKIPZERO, /* 155 */ | OP_SKIPZERO, /* 161 */ |
|
|
/* This is not an opcode, but is used to check that tables indexed by opcode |
/* This is not an opcode, but is used to check that tables indexed by opcode |
are the correct length, in order to catch updating errors - there have been |
are the correct length, in order to catch updating errors - there have been |
Line 1796 enum {
|
Line 2187 enum {
|
|
|
/* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro |
/* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro |
definitions that follow must also be updated to match. There are also tables |
definitions that follow must also be updated to match. There are also tables |
called "coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */ | called "opcode_possessify" in pcre_compile.c and "coptable" and "poptable" in |
| pcre_dfa_exec.c that must be updated. */ |
|
|
|
|
/* This macro defines textual names for all the opcodes. These are used only |
/* This macro defines textual names for all the opcodes. These are used only |
Line 1809 some cases doesn't actually use these names at all). *
|
Line 2201 some cases doesn't actually use these names at all). *
|
"\\S", "\\s", "\\W", "\\w", "Any", "AllAny", "Anybyte", \ |
"\\S", "\\s", "\\W", "\\w", "Any", "AllAny", "Anybyte", \ |
"notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v", \ |
"notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v", \ |
"extuni", "\\Z", "\\z", \ |
"extuni", "\\Z", "\\z", \ |
"^", "^", "$", "$", "char", "chari", "not", "noti", \ | "$", "$", "^", "^", "char", "chari", "not", "noti", \ |
"*", "*?", "+", "+?", "?", "??", \ |
"*", "*?", "+", "+?", "?", "??", \ |
"{", "{", "{", \ |
"{", "{", "{", \ |
"*+","++", "?+", "{", \ |
"*+","++", "?+", "{", \ |
Line 1825 some cases doesn't actually use these names at all). *
|
Line 2217 some cases doesn't actually use these names at all). *
|
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ |
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ |
"*+","++", "?+", "{", \ |
"*+","++", "?+", "{", \ |
"*", "*?", "+", "+?", "?", "??", "{", "{", \ |
"*", "*?", "+", "+?", "?", "??", "{", "{", \ |
"class", "nclass", "xclass", "Ref", "Refi", \ | "*+","++", "?+", "{", \ |
| "class", "nclass", "xclass", "Ref", "Refi", "DnRef", "DnRefi", \ |
"Recurse", "Callout", \ |
"Recurse", "Callout", \ |
"Alt", "Ket", "KetRmax", "KetRmin", "KetRpos", \ |
"Alt", "Ket", "KetRmax", "KetRmin", "KetRpos", \ |
"Reverse", "Assert", "Assert not", "AssertB", "AssertB not", \ |
"Reverse", "Assert", "Assert not", "AssertB", "AssertB not", \ |
Line 1834 some cases doesn't actually use these names at all). *
|
Line 2227 some cases doesn't actually use these names at all). *
|
"Cond", \ |
"Cond", \ |
"SBra", "SBraPos", "SCBra", "SCBraPos", \ |
"SBra", "SBraPos", "SCBra", "SCBraPos", \ |
"SCond", \ |
"SCond", \ |
"Cond ref", "Cond nref", "Cond rec", "Cond nrec", "Cond def", \ | "Cond ref", "Cond dnref", "Cond rec", "Cond dnrec", "Cond def", \ |
"Brazero", "Braminzero", "Braposzero", \ |
"Brazero", "Braminzero", "Braposzero", \ |
"*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \ |
"*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \ |
"*THEN", "*THEN", "*COMMIT", "*FAIL", \ |
"*THEN", "*THEN", "*COMMIT", "*FAIL", \ |
Line 1859 in UTF-8 mode. The code that uses this table must know
|
Line 2252 in UTF-8 mode. The code that uses this table must know
|
3, 3, /* \P, \p */ \ |
3, 3, /* \P, \p */ \ |
1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ \ |
1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ \ |
1, /* \X */ \ |
1, /* \X */ \ |
1, 1, 1, 1, 1, 1, /* \Z, \z, ^, ^M, $, $M */ \ | 1, 1, 1, 1, 1, 1, /* \Z, \z, $, $M ^, ^M */ \ |
2, /* Char - the minimum length */ \ |
2, /* Char - the minimum length */ \ |
2, /* Chari - the minimum length */ \ |
2, /* Chari - the minimum length */ \ |
2, /* not */ \ |
2, /* not */ \ |
Line 1890 in UTF-8 mode. The code that uses this table must know
|
Line 2283 in UTF-8 mode. The code that uses this table must know
|
/* Character class & ref repeats */ \ |
/* Character class & ref repeats */ \ |
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \ |
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \ |
1+2*IMM2_SIZE, 1+2*IMM2_SIZE, /* CRRANGE, CRMINRANGE */ \ |
1+2*IMM2_SIZE, 1+2*IMM2_SIZE, /* CRRANGE, CRMINRANGE */ \ |
|
1, 1, 1, 1+2*IMM2_SIZE, /* Possessive *+, ++, ?+, CRPOSRANGE */ \ |
1+(32/sizeof(pcre_uchar)), /* CLASS */ \ |
1+(32/sizeof(pcre_uchar)), /* CLASS */ \ |
1+(32/sizeof(pcre_uchar)), /* NCLASS */ \ |
1+(32/sizeof(pcre_uchar)), /* NCLASS */ \ |
0, /* XCLASS - variable length */ \ |
0, /* XCLASS - variable length */ \ |
1+IMM2_SIZE, /* REF */ \ |
1+IMM2_SIZE, /* REF */ \ |
1+IMM2_SIZE, /* REFI */ \ |
1+IMM2_SIZE, /* REFI */ \ |
|
1+2*IMM2_SIZE, /* DNREF */ \ |
|
1+2*IMM2_SIZE, /* DNREFI */ \ |
1+LINK_SIZE, /* RECURSE */ \ |
1+LINK_SIZE, /* RECURSE */ \ |
2+2*LINK_SIZE, /* CALLOUT */ \ |
2+2*LINK_SIZE, /* CALLOUT */ \ |
1+LINK_SIZE, /* Alt */ \ |
1+LINK_SIZE, /* Alt */ \ |
Line 1919 in UTF-8 mode. The code that uses this table must know
|
Line 2315 in UTF-8 mode. The code that uses this table must know
|
1+LINK_SIZE+IMM2_SIZE, /* SCBRA */ \ |
1+LINK_SIZE+IMM2_SIZE, /* SCBRA */ \ |
1+LINK_SIZE+IMM2_SIZE, /* SCBRAPOS */ \ |
1+LINK_SIZE+IMM2_SIZE, /* SCBRAPOS */ \ |
1+LINK_SIZE, /* SCOND */ \ |
1+LINK_SIZE, /* SCOND */ \ |
1+IMM2_SIZE, 1+IMM2_SIZE, /* CREF, NCREF */ \ | 1+IMM2_SIZE, 1+2*IMM2_SIZE, /* CREF, DNCREF */ \ |
1+IMM2_SIZE, 1+IMM2_SIZE, /* RREF, NRREF */ \ | 1+IMM2_SIZE, 1+2*IMM2_SIZE, /* RREF, DNRREF */ \ |
1, /* DEF */ \ |
1, /* DEF */ \ |
1, 1, 1, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ \ |
1, 1, 1, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ \ |
3, 1, 3, /* MARK, PRUNE, PRUNE_ARG */ \ |
3, 1, 3, /* MARK, PRUNE, PRUNE_ARG */ \ |
Line 1929 in UTF-8 mode. The code that uses this table must know
|
Line 2325 in UTF-8 mode. The code that uses this table must know
|
1, 1, 1, 1, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */ \ |
1, 1, 1, 1, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */ \ |
1+IMM2_SIZE, 1 /* CLOSE, SKIPZERO */ |
1+IMM2_SIZE, 1 /* CLOSE, SKIPZERO */ |
|
|
/* A magic value for OP_RREF and OP_NRREF to indicate the "any recursion" | /* A magic value for OP_RREF to indicate the "any recursion" condition. */ |
condition. */ | |
|
|
#define RREF_ANY 0xffff |
#define RREF_ANY 0xffff |
|
|
Line 1945 enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6,
|
Line 2340 enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6,
|
ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, |
ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, |
ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, |
ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, |
ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, |
ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, |
ERR70, ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERRCOUNT }; | ERR70, ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, |
| ERR80, ERR81, ERR82, ERR83, ERR84, ERRCOUNT }; |
|
|
/* JIT compiling modes. The function list is indexed by them. */ |
/* JIT compiling modes. The function list is indexed by them. */ |
|
|
enum { JIT_COMPILE, JIT_PARTIAL_SOFT_COMPILE, JIT_PARTIAL_HARD_COMPILE, |
enum { JIT_COMPILE, JIT_PARTIAL_SOFT_COMPILE, JIT_PARTIAL_HARD_COMPILE, |
JIT_NUMBER_OF_COMPILE_MODES }; |
JIT_NUMBER_OF_COMPILE_MODES }; |
|
|
Line 1955 enum { JIT_COMPILE, JIT_PARTIAL_SOFT_COMPILE, JIT_PART
|
Line 2352 enum { JIT_COMPILE, JIT_PARTIAL_SOFT_COMPILE, JIT_PART
|
code vector run on as long as necessary after the end. We store an explicit |
code vector run on as long as necessary after the end. We store an explicit |
offset to the name table so that if a regex is compiled on one host, saved, and |
offset to the name table so that if a regex is compiled on one host, saved, and |
then run on another where the size of pointers is different, all might still |
then run on another where the size of pointers is different, all might still |
be well. For the case of compiled-on-4 and run-on-8, we include an extra | be well. |
pointer that is always NULL. For future-proofing, a few dummy fields were | |
originally included - even though you can never get this planning right - but | |
there is only one left now. | |
|
|
NOTE NOTE NOTE: | The size of the structure must be a multiple of 8 bytes. For the case of |
Because people can now save and re-use compiled patterns, any additions to this | compiled-on-4 and run-on-8, we include an extra pointer that is always NULL so |
structure should be made at the end, and something earlier (e.g. a new | that there are an even number of pointers which therefore are a multiple of 8 |
flag in the options or one of the dummy fields) should indicate that the new | bytes. |
fields are present. Currently PCRE always sets the dummy fields to zero. | |
NOTE NOTE NOTE | It is necessary to fork the struct for the 32 bit library, since it needs to |
| use pcre_uint32 for first_char and req_char. We can't put an ifdef inside the |
| typedef because pcretest needs access to the struct of the 8-, 16- and 32-bit |
| variants. |
| |
| *** WARNING *** |
| When new fields are added to these structures, remember to adjust the code in |
| pcre_byte_order.c that is concerned with swapping the byte order of the fields |
| when a compiled regex is reloaded on a host with different endianness. |
| *** WARNING *** |
| There is also similar byte-flipping code in pcretest.c, which is used for |
| testing the byte-flipping features. It must also be kept in step. |
| *** WARNING *** |
*/ |
*/ |
|
|
#ifdef COMPILE_PCRE8 | typedef struct real_pcre8_or_16 { |
#define REAL_PCRE real_pcre | pcre_uint32 magic_number; |
#else | pcre_uint32 size; /* Total that was malloced */ |
#define REAL_PCRE real_pcre16 | pcre_uint32 options; /* Public options */ |
#endif | pcre_uint32 flags; /* Private flags */ |
| pcre_uint32 limit_match; /* Limit set from regex */ |
| pcre_uint32 limit_recursion; /* Limit set from regex */ |
| pcre_uint16 first_char; /* Starting character */ |
| pcre_uint16 req_char; /* This character must be seen */ |
| pcre_uint16 max_lookbehind; /* Longest lookbehind (characters) */ |
| pcre_uint16 top_bracket; /* Highest numbered group */ |
| pcre_uint16 top_backref; /* Highest numbered back reference */ |
| pcre_uint16 name_table_offset; /* Offset to name table that follows */ |
| pcre_uint16 name_entry_size; /* Size of any name items */ |
| pcre_uint16 name_count; /* Number of name items */ |
| pcre_uint16 ref_count; /* Reference count */ |
| pcre_uint16 dummy1; /* To ensure size is a multiple of 8 */ |
| pcre_uint16 dummy2; /* To ensure size is a multiple of 8 */ |
| pcre_uint16 dummy3; /* To ensure size is a multiple of 8 */ |
| const pcre_uint8 *tables; /* Pointer to tables or NULL for std */ |
| void *nullpad; /* NULL padding */ |
| } real_pcre8_or_16; |
|
|
typedef struct REAL_PCRE { | typedef struct real_pcre8_or_16 real_pcre; |
| typedef struct real_pcre8_or_16 real_pcre16; |
| |
| typedef struct real_pcre32 { |
pcre_uint32 magic_number; |
pcre_uint32 magic_number; |
pcre_uint32 size; /* Total that was malloced */ |
pcre_uint32 size; /* Total that was malloced */ |
pcre_uint32 options; /* Public options */ |
pcre_uint32 options; /* Public options */ |
pcre_uint16 flags; /* Private flags */ | pcre_uint32 flags; /* Private flags */ |
| pcre_uint32 limit_match; /* Limit set from regex */ |
| pcre_uint32 limit_recursion; /* Limit set from regex */ |
| pcre_uint32 first_char; /* Starting character */ |
| pcre_uint32 req_char; /* This character must be seen */ |
pcre_uint16 max_lookbehind; /* Longest lookbehind (characters) */ |
pcre_uint16 max_lookbehind; /* Longest lookbehind (characters) */ |
pcre_uint16 top_bracket; /* Highest numbered group */ |
pcre_uint16 top_bracket; /* Highest numbered group */ |
pcre_uint16 top_backref; /* Highest numbered back reference */ |
pcre_uint16 top_backref; /* Highest numbered back reference */ |
pcre_uint16 first_char; /* Starting character */ |
|
pcre_uint16 req_char; /* This character must be seen */ |
|
pcre_uint16 name_table_offset; /* Offset to name table that follows */ |
pcre_uint16 name_table_offset; /* Offset to name table that follows */ |
pcre_uint16 name_entry_size; /* Size of any name items */ |
pcre_uint16 name_entry_size; /* Size of any name items */ |
pcre_uint16 name_count; /* Number of name items */ |
pcre_uint16 name_count; /* Number of name items */ |
pcre_uint16 ref_count; /* Reference count */ |
pcre_uint16 ref_count; /* Reference count */ |
|
pcre_uint16 dummy; /* To ensure size is a multiple of 8 */ |
const pcre_uint8 *tables; /* Pointer to tables or NULL for std */ |
const pcre_uint8 *tables; /* Pointer to tables or NULL for std */ |
const pcre_uint8 *nullpad; /* NULL padding */ | void *nullpad; /* NULL padding */ |
} REAL_PCRE; | } real_pcre32; |
|
|
|
#if defined COMPILE_PCRE8 |
|
#define REAL_PCRE real_pcre |
|
#elif defined COMPILE_PCRE16 |
|
#define REAL_PCRE real_pcre16 |
|
#elif defined COMPILE_PCRE32 |
|
#define REAL_PCRE real_pcre32 |
|
#endif |
|
|
|
/* Assert that the size of REAL_PCRE is divisible by 8 */ |
|
typedef int __assert_real_pcre_size_divisible_8[(sizeof(REAL_PCRE) % 8) == 0 ? 1 : -1]; |
|
|
|
/* Needed in pcretest to access some fields in the real_pcre* structures |
|
* directly. They're unified for 8/16/32 bits since the structs only differ |
|
* after these fields; if that ever changes, need to fork those defines into |
|
* 8/16 and 32 bit versions. */ |
|
#define REAL_PCRE_MAGIC(re) (((REAL_PCRE*)re)->magic_number) |
|
#define REAL_PCRE_SIZE(re) (((REAL_PCRE*)re)->size) |
|
#define REAL_PCRE_OPTIONS(re) (((REAL_PCRE*)re)->options) |
|
#define REAL_PCRE_FLAGS(re) (((REAL_PCRE*)re)->flags) |
|
|
/* The format of the block used to store data from pcre_study(). The same |
/* The format of the block used to store data from pcre_study(). The same |
remark (see NOTE above) about extending this structure applies. */ |
remark (see NOTE above) about extending this structure applies. */ |
|
|
Line 2013 typedef struct open_capitem {
|
Line 2462 typedef struct open_capitem {
|
pcre_uint16 flag; /* Set TRUE if recursive back ref */ |
pcre_uint16 flag; /* Set TRUE if recursive back ref */ |
} open_capitem; |
} open_capitem; |
|
|
|
/* Structure for building a list of named groups during the first pass of |
|
compiling. */ |
|
|
|
typedef struct named_group { |
|
const pcre_uchar *name; /* Points to the name in the pattern */ |
|
int length; /* Length of the name */ |
|
pcre_uint32 number; /* Group number */ |
|
} named_group; |
|
|
/* Structure for passing "static" information around between the functions |
/* Structure for passing "static" information around between the functions |
doing the compiling, so that they are thread-safe. */ |
doing the compiling, so that they are thread-safe. */ |
|
|
Line 2025 typedef struct compile_data {
|
Line 2483 typedef struct compile_data {
|
const pcre_uchar *start_code; /* The start of the compiled code */ |
const pcre_uchar *start_code; /* The start of the compiled code */ |
const pcre_uchar *start_pattern; /* The start of the pattern */ |
const pcre_uchar *start_pattern; /* The start of the pattern */ |
const pcre_uchar *end_pattern; /* The end of the pattern */ |
const pcre_uchar *end_pattern; /* The end of the pattern */ |
open_capitem *open_caps; /* Chain of open capture items */ |
|
pcre_uchar *hwm; /* High watermark of workspace */ |
pcre_uchar *hwm; /* High watermark of workspace */ |
|
open_capitem *open_caps; /* Chain of open capture items */ |
|
named_group *named_groups; /* Points to vector in pre-compile */ |
pcre_uchar *name_table; /* The name/number table */ |
pcre_uchar *name_table; /* The name/number table */ |
int names_found; /* Number of entries so far */ |
int names_found; /* Number of entries so far */ |
int name_entry_size; /* Size of each entry */ |
int name_entry_size; /* Size of each entry */ |
|
int named_group_list_size; /* Number of entries in the list */ |
int workspace_size; /* Size of workspace */ |
int workspace_size; /* Size of workspace */ |
int bracount; /* Count of capturing parens as we compile */ | unsigned int bracount; /* Count of capturing parens as we compile */ |
int final_bracount; /* Saved value after first pass */ |
int final_bracount; /* Saved value after first pass */ |
int max_lookbehind; /* Maximum lookbehind (characters) */ |
int max_lookbehind; /* Maximum lookbehind (characters) */ |
int top_backref; /* Maximum back reference */ |
int top_backref; /* Maximum back reference */ |
unsigned int backref_map; /* Bitmap of low back refs */ |
unsigned int backref_map; /* Bitmap of low back refs */ |
|
unsigned int namedrefcount; /* Number of backreferences by name */ |
|
int parens_depth; /* Depth of nested parentheses */ |
int assert_depth; /* Depth of nested assertions */ |
int assert_depth; /* Depth of nested assertions */ |
int external_options; /* External (initial) options */ | pcre_uint32 external_options; /* External (initial) options */ |
int external_flags; /* External flag bits to be set */ | pcre_uint32 external_flags; /* External flag bits to be set */ |
int req_varyopt; /* "After variable item" flag for reqbyte */ |
int req_varyopt; /* "After variable item" flag for reqbyte */ |
BOOL had_accept; /* (*ACCEPT) encountered */ |
BOOL had_accept; /* (*ACCEPT) encountered */ |
|
BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */ |
BOOL check_lookbehind; /* Lookbehinds need later checking */ |
BOOL check_lookbehind; /* Lookbehinds need later checking */ |
|
BOOL dupnames; /* Duplicate names exist */ |
int nltype; /* Newline type */ |
int nltype; /* Newline type */ |
int nllen; /* Newline string length */ |
int nllen; /* Newline string length */ |
pcre_uchar nl[4]; /* Newline string when fixed length */ |
pcre_uchar nl[4]; /* Newline string when fixed length */ |
Line 2060 call within the pattern; used by pcre_exec(). */
|
Line 2524 call within the pattern; used by pcre_exec(). */
|
|
|
typedef struct recursion_info { |
typedef struct recursion_info { |
struct recursion_info *prevrec; /* Previous recursion record (or NULL) */ |
struct recursion_info *prevrec; /* Previous recursion record (or NULL) */ |
int group_num; /* Number of group that was called */ | unsigned int group_num; /* Number of group that was called */ |
int *offset_save; /* Pointer to start of saved offsets */ |
int *offset_save; /* Pointer to start of saved offsets */ |
int saved_max; /* Number of saved offsets */ |
int saved_max; /* Number of saved offsets */ |
|
int saved_capture_last; /* Last capture number */ |
PCRE_PUCHAR subject_position; /* Position at start of recursion */ |
PCRE_PUCHAR subject_position; /* Position at start of recursion */ |
} recursion_info; |
} recursion_info; |
|
|
Line 2099 typedef struct match_data {
|
Line 2564 typedef struct match_data {
|
int nllen; /* Newline string length */ |
int nllen; /* Newline string length */ |
int name_count; /* Number of names in name table */ |
int name_count; /* Number of names in name table */ |
int name_entry_size; /* Size of entry in names table */ |
int name_entry_size; /* Size of entry in names table */ |
|
unsigned int skip_arg_count; /* For counting SKIP_ARGs */ |
|
unsigned int ignore_skip_arg; /* For re-run when SKIP arg name not found */ |
pcre_uchar *name_table; /* Table of names */ |
pcre_uchar *name_table; /* Table of names */ |
pcre_uchar nl[4]; /* Newline string when fixed */ |
pcre_uchar nl[4]; /* Newline string when fixed */ |
const pcre_uint8 *lcc; /* Points to lower casing table */ |
const pcre_uint8 *lcc; /* Points to lower casing table */ |
const pcre_uint8 *fcc; /* Points to case-flipping table */ |
const pcre_uint8 *fcc; /* Points to case-flipping table */ |
const pcre_uint8 *ctypes; /* Points to table of type maps */ |
const pcre_uint8 *ctypes; /* Points to table of type maps */ |
BOOL offset_overflow; /* Set if too many extractions */ |
|
BOOL notbol; /* NOTBOL flag */ |
BOOL notbol; /* NOTBOL flag */ |
BOOL noteol; /* NOTEOL flag */ |
BOOL noteol; /* NOTEOL flag */ |
BOOL utf; /* UTF-8 / UTF-16 flag */ |
BOOL utf; /* UTF-8 / UTF-16 flag */ |
Line 2116 typedef struct match_data {
|
Line 2582 typedef struct match_data {
|
BOOL hitend; /* Hit the end of the subject at some point */ |
BOOL hitend; /* Hit the end of the subject at some point */ |
BOOL bsr_anycrlf; /* \R is just any CRLF, not full Unicode */ |
BOOL bsr_anycrlf; /* \R is just any CRLF, not full Unicode */ |
BOOL hasthen; /* Pattern contains (*THEN) */ |
BOOL hasthen; /* Pattern contains (*THEN) */ |
BOOL ignore_skip_arg; /* For re-run when SKIP name not found */ |
|
const pcre_uchar *start_code; /* For use when recursing */ |
const pcre_uchar *start_code; /* For use when recursing */ |
PCRE_PUCHAR start_subject; /* Start of the subject string */ |
PCRE_PUCHAR start_subject; /* Start of the subject string */ |
PCRE_PUCHAR end_subject; /* End of the subject string */ |
PCRE_PUCHAR end_subject; /* End of the subject string */ |
Line 2125 typedef struct match_data {
|
Line 2590 typedef struct match_data {
|
PCRE_PUCHAR start_used_ptr; /* Earliest consulted character */ |
PCRE_PUCHAR start_used_ptr; /* Earliest consulted character */ |
int partial; /* PARTIAL options */ |
int partial; /* PARTIAL options */ |
int end_offset_top; /* Highwater mark at end of match */ |
int end_offset_top; /* Highwater mark at end of match */ |
int capture_last; /* Most recent capture number */ | pcre_int32 capture_last; /* Most recent capture number + overflow flag */ |
int start_offset; /* The start offset value */ |
int start_offset; /* The start offset value */ |
int match_function_type; /* Set for certain special calls of MATCH() */ |
int match_function_type; /* Set for certain special calls of MATCH() */ |
eptrblock *eptrchain; /* Chain of eptrblocks for tail recursions */ |
eptrblock *eptrchain; /* Chain of eptrblocks for tail recursions */ |
Line 2194 total length. */
|
Line 2659 total length. */
|
|
|
/* Internal function and data prefixes. */ |
/* Internal function and data prefixes. */ |
|
|
#ifdef COMPILE_PCRE8 | #if defined COMPILE_PCRE8 |
#ifndef PUBL |
#ifndef PUBL |
#define PUBL(name) pcre_##name |
#define PUBL(name) pcre_##name |
#endif |
#endif |
#ifndef PRIV |
#ifndef PRIV |
#define PRIV(name) _pcre_##name |
#define PRIV(name) _pcre_##name |
#endif |
#endif |
#else /* COMPILE_PCRE8 */ | #elif defined COMPILE_PCRE16 |
#ifdef COMPILE_PCRE16 | |
#ifndef PUBL |
#ifndef PUBL |
#define PUBL(name) pcre16_##name |
#define PUBL(name) pcre16_##name |
#endif |
#endif |
#ifndef PRIV |
#ifndef PRIV |
#define PRIV(name) _pcre16_##name |
#define PRIV(name) _pcre16_##name |
#endif |
#endif |
|
#elif defined COMPILE_PCRE32 |
|
#ifndef PUBL |
|
#define PUBL(name) pcre32_##name |
|
#endif |
|
#ifndef PRIV |
|
#define PRIV(name) _pcre32_##name |
|
#endif |
#else |
#else |
#error Unsupported compiling mode |
#error Unsupported compiling mode |
#endif /* COMPILE_PCRE16 */ | #endif /* COMPILE_PCRE[8|16|32] */ |
#endif /* COMPILE_PCRE8 */ | |
|
|
/* Layout of the UCP type table that translates property names into types and |
/* Layout of the UCP type table that translates property names into types and |
codes. Each entry used to point directly to a name, but to reduce the number of |
codes. Each entry used to point directly to a name, but to reduce the number of |
Line 2232 but are not part of the PCRE public API. The data for
|
Line 2702 but are not part of the PCRE public API. The data for
|
pcre_tables.c module. */ |
pcre_tables.c module. */ |
|
|
#ifdef COMPILE_PCRE8 |
#ifdef COMPILE_PCRE8 |
|
|
extern const int PRIV(utf8_table1)[]; |
extern const int PRIV(utf8_table1)[]; |
extern const int PRIV(utf8_table1_size); |
extern const int PRIV(utf8_table1_size); |
extern const int PRIV(utf8_table2)[]; |
extern const int PRIV(utf8_table2)[]; |
extern const int PRIV(utf8_table3)[]; |
extern const int PRIV(utf8_table3)[]; |
extern const pcre_uint8 PRIV(utf8_table4)[]; |
extern const pcre_uint8 PRIV(utf8_table4)[]; |
|
|
#endif /* COMPILE_PCRE8 */ |
#endif /* COMPILE_PCRE8 */ |
|
|
extern const char PRIV(utt_names)[]; |
extern const char PRIV(utt_names)[]; |
extern const ucp_type_table PRIV(utt)[]; |
extern const ucp_type_table PRIV(utt)[]; |
extern const int PRIV(utt_size); |
extern const int PRIV(utt_size); |
|
|
|
extern const pcre_uint8 PRIV(OP_lengths)[]; |
extern const pcre_uint8 PRIV(default_tables)[]; |
extern const pcre_uint8 PRIV(default_tables)[]; |
|
|
extern const pcre_uint8 PRIV(OP_lengths)[]; | extern const pcre_uint32 PRIV(hspace_list)[]; |
| extern const pcre_uint32 PRIV(vspace_list)[]; |
|
|
|
|
/* Internal shared functions. These are functions that are used by more than |
/* Internal shared functions. These are functions that are used by more than |
Line 2255 one of the exported public functions. They have to be
|
Line 2725 one of the exported public functions. They have to be
|
sense, but are not part of the PCRE public API. */ |
sense, but are not part of the PCRE public API. */ |
|
|
/* String comparison functions. */ |
/* String comparison functions. */ |
#ifdef COMPILE_PCRE8 | #if defined COMPILE_PCRE8 |
|
|
#define STRCMP_UC_UC(str1, str2) \ |
#define STRCMP_UC_UC(str1, str2) \ |
strcmp((char *)(str1), (char *)(str2)) |
strcmp((char *)(str1), (char *)(str2)) |
Line 2267 sense, but are not part of the PCRE public API. */
|
Line 2737 sense, but are not part of the PCRE public API. */
|
strncmp((char *)(str1), (str2), (num)) |
strncmp((char *)(str1), (str2), (num)) |
#define STRLEN_UC(str) strlen((const char *)str) |
#define STRLEN_UC(str) strlen((const char *)str) |
|
|
#else | #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32 |
|
|
extern int PRIV(strcmp_uc_uc)(const pcre_uchar *, |
extern int PRIV(strcmp_uc_uc)(const pcre_uchar *, |
const pcre_uchar *); |
const pcre_uchar *); |
Line 2289 extern unsigned int PRIV(strlen_uc)(const pcre_uc
|
Line 2759 extern unsigned int PRIV(strlen_uc)(const pcre_uc
|
PRIV(strncmp_uc_c8)((str1), (str2), (num)) |
PRIV(strncmp_uc_c8)((str1), (str2), (num)) |
#define STRLEN_UC(str) PRIV(strlen_uc)(str) |
#define STRLEN_UC(str) PRIV(strlen_uc)(str) |
|
|
#endif /* COMPILE_PCRE8 */ | #endif /* COMPILE_PCRE[8|16|32] */ |
|
|
|
#if defined COMPILE_PCRE8 || defined COMPILE_PCRE16 |
|
|
|
#define STRCMP_UC_UC_TEST(str1, str2) STRCMP_UC_UC(str1, str2) |
|
#define STRCMP_UC_C8_TEST(str1, str2) STRCMP_UC_C8(str1, str2) |
|
|
|
#elif defined COMPILE_PCRE32 |
|
|
|
extern int PRIV(strcmp_uc_uc_utf)(const pcre_uchar *, |
|
const pcre_uchar *); |
|
extern int PRIV(strcmp_uc_c8_utf)(const pcre_uchar *, |
|
const char *); |
|
|
|
#define STRCMP_UC_UC_TEST(str1, str2) \ |
|
(utf ? PRIV(strcmp_uc_uc_utf)((str1), (str2)) : PRIV(strcmp_uc_uc)((str1), (str2))) |
|
#define STRCMP_UC_C8_TEST(str1, str2) \ |
|
(utf ? PRIV(strcmp_uc_c8_utf)((str1), (str2)) : PRIV(strcmp_uc_c8)((str1), (str2))) |
|
|
|
#endif /* COMPILE_PCRE[8|16|32] */ |
|
|
extern const pcre_uchar *PRIV(find_bracket)(const pcre_uchar *, BOOL, int); |
extern const pcre_uchar *PRIV(find_bracket)(const pcre_uchar *, BOOL, int); |
extern BOOL PRIV(is_newline)(PCRE_PUCHAR, int, PCRE_PUCHAR, |
extern BOOL PRIV(is_newline)(PCRE_PUCHAR, int, PCRE_PUCHAR, |
int *, BOOL); |
int *, BOOL); |
extern int PRIV(ord2utf)(pcre_uint32, pcre_uchar *); | extern unsigned int PRIV(ord2utf)(pcre_uint32, pcre_uchar *); |
extern int PRIV(valid_utf)(PCRE_PUCHAR, int, int *); |
extern int PRIV(valid_utf)(PCRE_PUCHAR, int, int *); |
extern BOOL PRIV(was_newline)(PCRE_PUCHAR, int, PCRE_PUCHAR, |
extern BOOL PRIV(was_newline)(PCRE_PUCHAR, int, PCRE_PUCHAR, |
int *, BOOL); |
int *, BOOL); |
extern BOOL PRIV(xclass)(int, const pcre_uchar *, BOOL); | extern BOOL PRIV(xclass)(pcre_uint32, const pcre_uchar *, BOOL); |
|
|
#ifdef SUPPORT_JIT |
#ifdef SUPPORT_JIT |
extern void PRIV(jit_compile)(const REAL_PCRE *, |
extern void PRIV(jit_compile)(const REAL_PCRE *, |
PUBL(extra) *, int); |
PUBL(extra) *, int); |
extern int PRIV(jit_exec)(const REAL_PCRE *, const PUBL(extra) *, | extern int PRIV(jit_exec)(const PUBL(extra) *, |
const pcre_uchar *, int, int, int, int *, int); |
const pcre_uchar *, int, int, int, int *, int); |
extern void PRIV(jit_free)(void *); |
extern void PRIV(jit_free)(void *); |
extern int PRIV(jit_get_size)(void *); |
extern int PRIV(jit_get_size)(void *); |
Line 2313 extern const char* PRIV(jit_get_target)(void);
|
Line 2802 extern const char* PRIV(jit_get_target)(void);
|
/* Unicode character database (UCD) */ |
/* Unicode character database (UCD) */ |
|
|
typedef struct { |
typedef struct { |
pcre_uint8 script; | pcre_uint8 script; /* ucp_Arabic, etc. */ |
pcre_uint8 chartype; | pcre_uint8 chartype; /* ucp_Cc, etc. (general categories) */ |
pcre_int32 other_case; | pcre_uint8 gbprop; /* ucp_gbControl, etc. (grapheme break property) */ |
| pcre_uint8 caseset; /* offset to multichar other cases or zero */ |
| pcre_int32 other_case; /* offset to other case, or zero if none */ |
} ucd_record; |
} ucd_record; |
|
|
|
extern const pcre_uint32 PRIV(ucd_caseless_sets)[]; |
extern const ucd_record PRIV(ucd_records)[]; |
extern const ucd_record PRIV(ucd_records)[]; |
extern const pcre_uint8 PRIV(ucd_stage1)[]; |
extern const pcre_uint8 PRIV(ucd_stage1)[]; |
extern const pcre_uint16 PRIV(ucd_stage2)[]; |
extern const pcre_uint16 PRIV(ucd_stage2)[]; |
extern const int PRIV(ucp_gentype)[]; | extern const pcre_uint32 PRIV(ucp_gentype)[]; |
| extern const pcre_uint32 PRIV(ucp_gbtable)[]; |
#ifdef SUPPORT_JIT |
#ifdef SUPPORT_JIT |
extern const int PRIV(ucp_typerange)[]; |
extern const int PRIV(ucp_typerange)[]; |
#endif |
#endif |
Line 2331 extern const int PRIV(ucp_typerange)[];
|
Line 2824 extern const int PRIV(ucp_typerange)[];
|
|
|
#define UCD_BLOCK_SIZE 128 |
#define UCD_BLOCK_SIZE 128 |
#define GET_UCD(ch) (PRIV(ucd_records) + \ |
#define GET_UCD(ch) (PRIV(ucd_records) + \ |
PRIV(ucd_stage2)[PRIV(ucd_stage1)[(ch) / UCD_BLOCK_SIZE] * \ | PRIV(ucd_stage2)[PRIV(ucd_stage1)[(int)(ch) / UCD_BLOCK_SIZE] * \ |
UCD_BLOCK_SIZE + (ch) % UCD_BLOCK_SIZE]) | UCD_BLOCK_SIZE + (int)(ch) % UCD_BLOCK_SIZE]) |
|
|
#define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype | #define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype |
#define UCD_SCRIPT(ch) GET_UCD(ch)->script | #define UCD_SCRIPT(ch) GET_UCD(ch)->script |
#define UCD_CATEGORY(ch) PRIV(ucp_gentype)[UCD_CHARTYPE(ch)] | #define UCD_CATEGORY(ch) PRIV(ucp_gentype)[UCD_CHARTYPE(ch)] |
#define UCD_OTHERCASE(ch) (ch + GET_UCD(ch)->other_case) | #define UCD_GRAPHBREAK(ch) GET_UCD(ch)->gbprop |
| #define UCD_CASESET(ch) GET_UCD(ch)->caseset |
| #define UCD_OTHERCASE(ch) ((pcre_uint32)((int)ch + (int)(GET_UCD(ch)->other_case))) |
|
|
#endif /* SUPPORT_UCP */ |
#endif /* SUPPORT_UCP */ |
|
|