Annotation of embedaddon/php/ext/pcre/pcrelib/pcre_internal.h, revision 1.1
1.1 ! misho 1: /*************************************************
! 2: * Perl-Compatible Regular Expressions *
! 3: *************************************************/
! 4:
! 5:
! 6: /* PCRE is a library of functions to support regular expressions whose syntax
! 7: and semantics are as close as possible to those of the Perl 5 language.
! 8:
! 9: Written by Philip Hazel
! 10: Copyright (c) 1997-2010 University of Cambridge
! 11:
! 12: -----------------------------------------------------------------------------
! 13: Redistribution and use in source and binary forms, with or without
! 14: modification, are permitted provided that the following conditions are met:
! 15:
! 16: * Redistributions of source code must retain the above copyright notice,
! 17: this list of conditions and the following disclaimer.
! 18:
! 19: * Redistributions in binary form must reproduce the above copyright
! 20: notice, this list of conditions and the following disclaimer in the
! 21: documentation and/or other materials provided with the distribution.
! 22:
! 23: * Neither the name of the University of Cambridge nor the names of its
! 24: contributors may be used to endorse or promote products derived from
! 25: this software without specific prior written permission.
! 26:
! 27: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
! 28: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
! 29: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
! 30: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
! 31: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! 32: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! 33: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! 34: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! 35: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! 36: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! 37: POSSIBILITY OF SUCH DAMAGE.
! 38: -----------------------------------------------------------------------------
! 39: */
! 40:
! 41: /* This header contains definitions that are shared between the different
! 42: modules, but which are not relevant to the exported API. This includes some
! 43: functions whose names all begin with "_pcre_". */
! 44:
! 45: #ifndef PCRE_INTERNAL_H
! 46: #define PCRE_INTERNAL_H
! 47:
! 48: /* Define PCRE_DEBUG to get debugging output on stdout. */
! 49:
! 50: #if 0
! 51: #define PCRE_DEBUG
! 52: #endif
! 53:
! 54: /* We do not support both EBCDIC and UTF-8 at the same time. The "configure"
! 55: script prevents both being selected, but not everybody uses "configure". */
! 56:
! 57: #if defined EBCDIC && defined SUPPORT_UTF8
! 58: #error The use of both EBCDIC and SUPPORT_UTF8 is not supported.
! 59: #endif
! 60:
! 61: /* If SUPPORT_UCP is defined, SUPPORT_UTF8 must also be defined. The
! 62: "configure" script ensures this, but not everybody uses "configure". */
! 63:
! 64: #if defined SUPPORT_UCP && !defined SUPPORT_UTF8
! 65: #define SUPPORT_UTF8 1
! 66: #endif
! 67:
! 68: /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
! 69: inline, and there are *still* stupid compilers about that don't like indented
! 70: pre-processor statements, or at least there were when I first wrote this. After
! 71: all, it had only been about 10 years then...
! 72:
! 73: It turns out that the Mac Debugging.h header also defines the macro DPRINTF, so
! 74: be absolutely sure we get our version. */
! 75:
/* Drop any DPRINTF inherited from a system header, then supply our own: it
forwards a parenthesized argument list to printf when PCRE_DEBUG is defined and
expands to nothing otherwise. */

#undef DPRINTF
#ifndef PCRE_DEBUG
#define DPRINTF(args) /* Nothing */
#else
#define DPRINTF(args) printf args
#endif
! 82:
! 83:
! 84: /* Standard C headers plus the external interface definition. The only time
! 85: setjmp and stdarg are used is when NO_RECURSE is set. */
! 86:
! 87: #include <ctype.h>
! 88: #include <limits.h>
! 89: #include <stddef.h>
! 90: #include <stdio.h>
! 91: #include <stdlib.h>
! 92: #include <string.h>
! 93:
! 94: /* When compiling a DLL for Windows, the exported symbols have to be declared
! 95: using some MS magic. I found some useful information on this web page:
! 96: http://msdn2.microsoft.com/en-us/library/y4h7bcy6(VS.80).aspx. According to the
! 97: information there, using __declspec(dllexport) without "extern" we have a
! 98: definition; with "extern" we have a declaration. The settings here override the
! 99: setting in pcre.h (which is included below); it defines only PCRE_EXP_DECL,
! 100: which is all that is needed for applications (they just import the symbols). We
! 101: use:
! 102:
! 103: PCRE_EXP_DECL for declarations
! 104: PCRE_EXP_DEFN for definitions of exported functions
! 105: PCRE_EXP_DATA_DEFN for definitions of exported variables
! 106:
! 107: The reason for the two DEFN macros is that in non-Windows environments, one
! 108: does not want to have "extern" before variable definitions because it leads to
! 109: compiler warnings. So we distinguish between functions and variables. In
! 110: Windows, the two should always be the same.
! 111:
! 112: The reason for wrapping this in #ifndef PCRE_EXP_DECL is so that pcretest,
! 113: which is an application, but needs to import this file in order to "peek" at
! 114: internals, can #include pcre.h first to get an application's-eye view.
! 115:
! 116: In principle, people compiling for non-Windows, non-Unix-like (i.e. uncommon,
! 117: special-purpose environments) might want to stick other stuff in front of
! 118: exported symbols. That's why, in the non-Windows case, we set PCRE_EXP_DEFN and
! 119: PCRE_EXP_DATA_DEFN only if they are not already set. */
! 120:
/* Select the export decorations. Nothing here is touched when PCRE_EXP_DECL
has already been defined (for example by including pcre.h first). */

#ifndef PCRE_EXP_DECL
#  if defined _WIN32 && !defined PCRE_STATIC
     /* Windows, building a DLL: export every public symbol. */
#    define PCRE_EXP_DECL       extern __declspec(dllexport)
#    define PCRE_EXP_DEFN       __declspec(dllexport)
#    define PCRE_EXP_DATA_DEFN  __declspec(dllexport)
#  elif defined _WIN32
     /* Windows, static library: no decoration is needed. */
#    define PCRE_EXP_DECL       extern
#    define PCRE_EXP_DEFN
#    define PCRE_EXP_DATA_DEFN
#  else
     /* Everything else: plain extern (with C linkage under C++). The two DEFN
     macros are set only if the builder has not supplied them already. */
#    ifdef __cplusplus
#      define PCRE_EXP_DECL     extern "C"
#    else
#      define PCRE_EXP_DECL     extern
#    endif
#    ifndef PCRE_EXP_DEFN
#      define PCRE_EXP_DEFN     PCRE_EXP_DECL
#    endif
#    ifndef PCRE_EXP_DATA_DEFN
#      define PCRE_EXP_DATA_DEFN
#    endif
#  endif
#endif
! 146:
! 147: /* When compiling with the MSVC compiler, it is sometimes necessary to include
! 148: a "calling convention" before exported function names. (This is secondhand
! 149: information; I know nothing about MSVC myself). For example, something like
! 150:
! 151: void __cdecl function(....)
! 152:
! 153: might be needed. In order to make this easy, all the exported functions have
! 154: PCRE_CALL_CONVENTION just before their names. It is rarely needed; if not
! 155: set, we ensure here that it has no effect. */
! 156:
! 157: #ifndef PCRE_CALL_CONVENTION
! 158: #define PCRE_CALL_CONVENTION
! 159: #endif
! 160:
! 161: /* We need to have types that specify unsigned 16-bit and 32-bit integers. We
! 162: cannot determine these outside the compilation (e.g. by running a program as
! 163: part of "configure") because PCRE is often cross-compiled for use on other
! 164: systems. Instead we make use of the maximum sizes that are available at
! 165: preprocessor time in standard C environments. */
! 166:
/* 16-bit types: the first of short / int whose unsigned maximum is 0xffff. */

#if USHRT_MAX == 0xffff
typedef short pcre_int16;
typedef unsigned short pcre_uint16;
#elif UINT_MAX == 0xffff
typedef int pcre_int16;
typedef unsigned int pcre_uint16;
#else
#error Cannot determine a type for 16-bit unsigned integers
#endif

/* 32-bit types: the first of int / long whose unsigned maximum is
0xffffffff. */

#if UINT_MAX == 0xffffffff
typedef int pcre_int32;
typedef unsigned int pcre_uint32;
#elif ULONG_MAX == 0xffffffff
typedef long int pcre_int32;
typedef unsigned long int pcre_uint32;
#else
#error Cannot determine a type for 32-bit unsigned integers
#endif
! 186:
! 187: /* When checking for integer overflow in pcre_compile(), we need to handle
! 188: large integers. If a 64-bit integer type is available, we can use that.
! 189: Otherwise we have to cast to double, which of course requires floating point
! 190: arithmetic. Handle this by defining a macro for the appropriate type. If
! 191: stdint.h is available, include it; it may define INT64_MAX. Systems that do not
! 192: have stdint.h (e.g. Solaris) may have inttypes.h. The macro int64_t may be set
! 193: by "configure". */
! 194:
! 195: #ifdef PHP_WIN32
! 196: #include "win32/php_stdint.h"
! 197: #elif HAVE_STDINT_H
! 198: #include <stdint.h>
! 199: #elif HAVE_INTTYPES_H
! 200: #include <inttypes.h>
! 201: #endif
! 202:
/* Fall back to double only when neither INT64_MAX nor an int64_t macro is
defined; otherwise use the 64-bit integer type. */

#if !defined INT64_MAX && !defined int64_t
#define INT64_OR_DOUBLE double
#else
#define INT64_OR_DOUBLE int64_t
#endif
! 208:
! 209: /* All character handling must be done as unsigned characters. Otherwise there
! 210: are problems with top-bit-set characters and functions such as isspace().
! 211: However, we leave the interface to the outside world as char *, because that
! 212: should make things easier for callers. We define a short type for unsigned char
! 213: to save lots of typing. I tried "uchar", but it causes problems on Digital
! 214: Unix, where it is defined in sys/types, so use "uschar" instead. */
! 215:
! 216: typedef unsigned char uschar;
! 217:
! 218: /* This is an unsigned int value that no character can ever have. UTF-8
! 219: characters only go up to 0x7fffffff (though Unicode doesn't go beyond
! 220: 0x0010ffff). */
! 221:
! 222: #define NOTACHAR 0xffffffff
! 223:
! 224: /* PCRE is able to support several different kinds of newline (CR, LF, CRLF,
! 225: "any" and "anycrlf" at present). The following macros are used to package up
! 226: testing for newlines. NLBLOCK, PSSTART, and PSEND are defined in the various
! 227: modules to indicate in which datablock the parameters exist, and what the
! 228: start/end of string field names are. */
! 229:
! 230: #define NLTYPE_FIXED 0 /* Newline is a fixed length string */
! 231: #define NLTYPE_ANY 1 /* Newline is any Unicode line ending */
! 232: #define NLTYPE_ANYCRLF 2 /* Newline is CR, LF, or CRLF */
! 233:
! 234: /* This macro checks for a newline at the given position */
! 235:
/* True if a newline starts at ptr. For a fixed newline string the one or two
bytes in NLBLOCK->nl are compared directly, after checking that nllen bytes
remain before PSEND. For the other newline types the decision is delegated to
_pcre_is_newline(), which is also passed the address of nllen. */

#define IS_NEWLINE(ptr) \
  ((NLBLOCK->nltype == NLTYPE_FIXED)? \
    ((ptr) <= NLBLOCK->PSEND - NLBLOCK->nllen && \
     (ptr)[0] == NLBLOCK->nl[0] && \
     (NLBLOCK->nllen == 1 || (ptr)[1] == NLBLOCK->nl[1]) \
    ) \
    : \
    ((ptr) < NLBLOCK->PSEND && \
     _pcre_is_newline((ptr), NLBLOCK->nltype, NLBLOCK->PSEND, \
       &(NLBLOCK->nllen), utf8)) \
  )
! 247:
! 248: /* This macro checks for a newline immediately preceding the given position */
! 249:
/* True if a newline ends immediately before ptr. For a fixed newline string
the nllen bytes preceding ptr are compared with NLBLOCK->nl, after checking
that ptr is at least nllen bytes beyond PSSTART. For the other newline types
the decision is delegated to _pcre_was_newline(), which is also passed the
address of nllen. */

#define WAS_NEWLINE(ptr) \
  ((NLBLOCK->nltype == NLTYPE_FIXED)? \
    ((ptr) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \
     (ptr)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \
     (NLBLOCK->nllen == 1 || (ptr)[-NLBLOCK->nllen+1] == NLBLOCK->nl[1]) \
    ) \
    : \
    ((ptr) > NLBLOCK->PSSTART && \
     _pcre_was_newline((ptr), NLBLOCK->nltype, NLBLOCK->PSSTART, \
       &(NLBLOCK->nllen), utf8)) \
  )
! 261:
! 262: /* When PCRE is compiled as a C++ library, the subject pointer can be replaced
! 263: with a custom type. This makes it possible, for example, to allow pcre_exec()
! 264: to process subject strings that are discontinuous by using a smart pointer
! 265: class. It must always be possible to inspect all of the subject string in
! 266: pcre_exec() because of the way it backtracks. Two macros are required in the
! 267: normal case, for sign-unspecified and unsigned char pointers. The former is
! 268: used for the external interface and appears in pcre.h, which is why its name
! 269: must begin with PCRE_. */
! 270:
/* Subject pointer types: plain (sign-unspecified) and unsigned char pointers
by default, or the builder's CUSTOM_SUBJECT_PTR for both when it is defined. */

#ifndef CUSTOM_SUBJECT_PTR
#define PCRE_SPTR const char *
#define USPTR const unsigned char *
#else
#define PCRE_SPTR CUSTOM_SUBJECT_PTR
#define USPTR CUSTOM_SUBJECT_PTR
#endif
! 278:
! 279:
! 280:
! 281: /* Include the public PCRE header and the definitions of UCP character property
! 282: values. */
! 283:
! 284: #include "pcre.h"
! 285: #include "ucp.h"
! 286:
! 287: /* When compiling for use with the Virtual Pascal compiler, these functions
! 288: need to have their names changed. PCRE must be compiled with the -DVPCOMPAT
! 289: option on the command line. */
! 290:
! 291: #ifdef VPCOMPAT
! 292: #define strlen(s) _strlen(s)
! 293: #define strncmp(s1,s2,m) _strncmp(s1,s2,m)
! 294: #define memcmp(s,c,n) _memcmp(s,c,n)
! 295: #define memcpy(d,s,n) _memcpy(d,s,n)
! 296: #define memmove(d,s,n) _memmove(d,s,n)
! 297: #define memset(s,c,n) _memset(s,c,n)
! 298: #else /* VPCOMPAT */
! 299:
! 300: /* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
! 301: define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY
! 302: is set. Otherwise, include an emulating function for those systems that have
! 303: neither (there are some non-Unix environments where this is the case). */
! 304:
! 305: #ifndef HAVE_MEMMOVE
! 306: #undef memmove /* some systems may have a macro */
! 307: #ifdef HAVE_BCOPY
! 308: #define memmove(a, b, c) bcopy(b, a, c)
! 309: #else /* HAVE_BCOPY */
/* Emulation of memmove() for systems that have neither memmove() nor bcopy().

Arguments:
  d    destination
  s    source
  n    number of bytes to copy

Returns:     the destination pointer d

When the destination lies above the source the bytes are copied from the last
one downwards, otherwise from the first one upwards, so that an overlapping
source is read before it is overwritten. */

static void *
pcre_memmove(void *d, const void *s, size_t n)
{
  unsigned char *to = (unsigned char *)d;
  const unsigned char *from = (const unsigned char *)s;
  size_t remaining = n;

  if (to > from)
    {
    /* Start one past the end of each region and walk backwards. */
    to += n;
    from += n;
    while (remaining-- > 0) *(--to) = *(--from);
    }
  else
    {
    while (remaining-- > 0) *to++ = *from++;
    }

  return d;
}
! 329: #define memmove(a, b, c) pcre_memmove(a, b, c)
! 330: #endif /* not HAVE_BCOPY */
! 331: #endif /* not HAVE_MEMMOVE */
! 332: #endif /* not VPCOMPAT */
! 333:
! 334:
! 335: /* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored
! 336: in big-endian order) by default. These are used, for example, to link from the
! 337: start of a subpattern to its alternatives and its end. The use of 2 bytes per
! 338: offset limits the size of the compiled regex to around 64K, which is big enough
! 339: for almost everybody. However, I received a request for an even bigger limit.
! 340: For this reason, and also to make the code easier to maintain, the storing and
! 341: loading of offsets from the byte string is now handled by the macros that are
! 342: defined here.
! 343:
! 344: The macros are controlled by the value of LINK_SIZE. This defaults to 2 in
! 345: the config.h file, but can be overridden by using -D on the command line. This
! 346: is automated on Unix systems via the "configure" command. */
! 347:
! 348: #if LINK_SIZE == 2
! 349:
/* Store the link value d in the two bytes a[n] and a[n+1], most significant
byte first. The buffer argument is parenthesized so that an expression such as
"code + 1" can be passed, and the whole expansion is one parenthesized comma
expression. */

#define PUT(a,n,d) \
  ((a)[n] = (d) >> 8, \
  (a)[(n)+1] = (d) & 255)

/* Fetch the two-byte, most-significant-first link value stored at a[n]. */

#define GET(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

#define MAX_PATTERN_SIZE (1 << 16)
! 358:
! 359:
! 360: #elif LINK_SIZE == 3
! 361:
/* Store the link value d in the three bytes starting at a[n], most significant
byte first. The buffer argument is parenthesized so that an expression such as
"code + 1" can be passed, and the whole expansion is one parenthesized comma
expression. The upper bytes are not masked; assumes a[] has an 8-bit unsigned
element type so that the stores truncate - confirm against callers. */

#define PUT(a,n,d) \
  ((a)[n] = (d) >> 16, \
  (a)[(n)+1] = (d) >> 8, \
  (a)[(n)+2] = (d) & 255)

/* Fetch the three-byte, most-significant-first link value stored at a[n]. */

#define GET(a,n) \
  (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])

#define MAX_PATTERN_SIZE (1 << 24)
! 371:
! 372:
! 373: #elif LINK_SIZE == 4
! 374:
/* Store the link value d in the four bytes starting at a[n], most significant
byte first. The buffer argument is parenthesized so that an expression such as
"code + 1" can be passed, and the whole expansion is one parenthesized comma
expression. The upper bytes are not masked; assumes a[] has an 8-bit unsigned
element type so that the stores truncate - confirm against callers. */

#define PUT(a,n,d) \
  ((a)[n] = (d) >> 24, \
  (a)[(n)+1] = (d) >> 16, \
  (a)[(n)+2] = (d) >> 8, \
  (a)[(n)+3] = (d) & 255)

/* Fetch the four-byte, most-significant-first link value stored at a[n]. */

#define GET(a,n) \
  (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])

#define MAX_PATTERN_SIZE (1 << 30)   /* Keep it positive */
! 385:
! 386:
! 387: #else
! 388: #error LINK_SIZE must be either 2, 3, or 4
! 389: #endif
! 390:
! 391:
! 392: /* Convenience macro defined in terms of the others */
! 393:
/* Store a link value with PUT() and then advance the pointer a by LINK_SIZE
bytes. The expansion is a single parenthesized expression whose value is the
advanced pointer; a must be a modifiable pointer. */

#define PUTINC(a,n,d) (PUT(a,n,d), (a) += LINK_SIZE)
! 395:
! 396:
! 397: /* PCRE uses some other 2-byte quantities that do not change when the size of
! 398: offsets changes. These are used for repeat counts and for other things such as
! 399: capturing parenthesis numbers in back references. */
! 400:
/* Store the 16-bit value d in a[n] and a[n+1], most significant byte first.
The expansion is one parenthesized comma expression rather than two statements,
so both stores are governed by an enclosing unbraced if. */

#define PUT2(a,n,d) \
  ((a)[n] = (d) >> 8, \
  (a)[(n)+1] = (d) & 255)

/* Fetch the 16-bit, most-significant-first value stored at a[n]. */

#define GET2(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

/* Store a 16-bit value as for PUT2() and then advance the pointer a by two
bytes; a must be a modifiable pointer. */

#define PUT2INC(a,n,d) (PUT2(a,n,d), (a) += 2)
! 409:
! 410:
! 411: /* When UTF-8 encoding is being used, a character is no longer just a single
! 412: byte. The macros for character handling generate simple sequences when used in
! 413: byte-mode, and more complicated ones for UTF-8 characters. GETCHARLENTEST is
! 414: not used when UTF-8 is not supported, so it is not defined, and BACKCHAR should
! 415: never be called in byte mode. To make sure they can never even appear when
! 416: UTF-8 support is omitted, we don't even define them. */
! 417:
! 418: #ifndef SUPPORT_UTF8
/* Byte-mode versions: every character is exactly one byte, so each macro is a
plain fetch (with a post-increment for the INC forms) and len is never
changed. */

#define GETCHAR(ch, ptr) ch = *ptr;
#define GETCHARTEST(ch, ptr) ch = *ptr;
#define GETCHARINC(ch, ptr) ch = *ptr++;
#define GETCHARINCTEST(ch, ptr) ch = *ptr++;
#define GETCHARLEN(ch, ptr, len) ch = *ptr;
! 424: /* #define GETCHARLENTEST(c, eptr, len) */
! 425: /* #define BACKCHAR(eptr) */
! 426:
! 427: #else /* SUPPORT_UTF8 */
! 428:
! 429: /* These macros were originally written in the form of loops that used data
! 430: from the tables whose names start with _pcre_utf8_table. They were rewritten by
! 431: a user so as not to use loops, because in some environments this gives a
! 432: significant performance advantage, and it seems never to do any harm. */
! 433:
! 434: /* Base macro to pick up the remaining bytes of a UTF-8 character, not
! 435: advancing the pointer. */
! 436:
/* Finish decoding a multi-byte UTF-8 character without moving the pointer. On
entry ch holds the lead byte, which is at ptr[0]; the continuation bytes are
read from ptr[1] onwards and the assembled code point is left in ch. Sequences
of two to six bytes are handled, selected by the high bits of the lead byte. */

#define GETUTF8(ch, ptr) \
  { \
  if ((ch & 0x20) == 0) \
    ch = ((ch & 0x1f) << 6) | (ptr[1] & 0x3f); \
  else if ((ch & 0x10) == 0) \
    ch = ((ch & 0x0f) << 12) | ((ptr[1] & 0x3f) << 6) | (ptr[2] & 0x3f); \
  else if ((ch & 0x08) == 0) \
    ch = ((ch & 0x07) << 18) | ((ptr[1] & 0x3f) << 12) | \
         ((ptr[2] & 0x3f) << 6) | (ptr[3] & 0x3f); \
  else if ((ch & 0x04) == 0) \
    ch = ((ch & 0x03) << 24) | ((ptr[1] & 0x3f) << 18) | \
         ((ptr[2] & 0x3f) << 12) | ((ptr[3] & 0x3f) << 6) | \
         (ptr[4] & 0x3f); \
  else \
    ch = ((ch & 0x01) << 30) | ((ptr[1] & 0x3f) << 24) | \
         ((ptr[2] & 0x3f) << 18) | ((ptr[3] & 0x3f) << 12) | \
         ((ptr[4] & 0x3f) << 6) | (ptr[5] & 0x3f); \
  }

/* Fetch the character at ptr into ch without advancing, for code that already
knows it is in UTF-8 mode. This expands to more than one statement, so it must
not be the unbraced body of an if or a loop. */

#define GETCHAR(ch, ptr) \
  ch = *ptr; \
  if (ch >= 0xc0) GETUTF8(ch, ptr);

/* As GETCHAR, but multi-byte decoding happens only when the caller's "utf8"
variable is true. */

#define GETCHARTEST(ch, ptr) \
  ch = *ptr; \
  if (utf8 && ch >= 0xc0) GETUTF8(ch, ptr);
! 469:
! 470: /* Base macro to pick up the remaining bytes of a UTF-8 character, advancing
! 471: the pointer. */
! 472:
/* Finish decoding a multi-byte UTF-8 character, advancing the pointer. On
entry ch holds the lead byte and ptr has already been moved past it, so the
continuation bytes start at ptr[0]; on exit ch is the assembled code point and
ptr has been moved past the last continuation byte. */

#define GETUTF8INC(ch, ptr) \
  { \
  if ((ch & 0x20) == 0) \
    ch = ((ch & 0x1f) << 6) | (*ptr++ & 0x3f); \
  else if ((ch & 0x10) == 0) \
    { \
    ch = ((ch & 0x0f) << 12) | ((*ptr & 0x3f) << 6) | (ptr[1] & 0x3f); \
    ptr += 2; \
    } \
  else if ((ch & 0x08) == 0) \
    { \
    ch = ((ch & 0x07) << 18) | ((*ptr & 0x3f) << 12) | \
         ((ptr[1] & 0x3f) << 6) | (ptr[2] & 0x3f); \
    ptr += 3; \
    } \
  else if ((ch & 0x04) == 0) \
    { \
    ch = ((ch & 0x03) << 24) | ((*ptr & 0x3f) << 18) | \
         ((ptr[1] & 0x3f) << 12) | ((ptr[2] & 0x3f) << 6) | \
         (ptr[3] & 0x3f); \
    ptr += 4; \
    } \
  else \
    { \
    ch = ((ch & 0x01) << 30) | ((*ptr & 0x3f) << 24) | \
         ((ptr[1] & 0x3f) << 18) | ((ptr[2] & 0x3f) << 12) | \
         ((ptr[3] & 0x3f) << 6) | (ptr[4] & 0x3f); \
    ptr += 5; \
    } \
  }

/* Fetch the character at ptr into ch and step ptr past it, for code that
already knows it is in UTF-8 mode. This expands to more than one statement, so
it must not be the unbraced body of an if or a loop. */

#define GETCHARINC(ch, ptr) \
  ch = *ptr++; \
  if (ch >= 0xc0) GETUTF8INC(ch, ptr);

/* As GETCHARINC, but multi-byte decoding happens only when the caller's "utf8"
variable is true; otherwise exactly one byte is consumed. */

#define GETCHARINCTEST(ch, ptr) \
  ch = *ptr++; \
  if (utf8 && ch >= 0xc0) GETUTF8INC(ch, ptr);
! 517:
! 518: /* Base macro to pick up the remaining bytes of a UTF-8 character, not
! 519: advancing the pointer, incrementing the length. */
! 520:
/* Finish decoding a multi-byte UTF-8 character without moving the pointer,
adding the number of continuation bytes to len. On entry ch holds the lead
byte, which is at ptr[0]; on exit ch is the assembled code point. */

#define GETUTF8LEN(ch, ptr, len) \
  { \
  if ((ch & 0x20) == 0) \
    { \
    ch = ((ch & 0x1f) << 6) | (ptr[1] & 0x3f); \
    len++; \
    } \
  else if ((ch & 0x10) == 0) \
    { \
    ch = ((ch & 0x0f) << 12) | ((ptr[1] & 0x3f) << 6) | (ptr[2] & 0x3f); \
    len += 2; \
    } \
  else if ((ch & 0x08) == 0) \
    { \
    ch = ((ch & 0x07) << 18) | ((ptr[1] & 0x3f) << 12) | \
         ((ptr[2] & 0x3f) << 6) | (ptr[3] & 0x3f); \
    len += 3; \
    } \
  else if ((ch & 0x04) == 0) \
    { \
    ch = ((ch & 0x03) << 24) | ((ptr[1] & 0x3f) << 18) | \
         ((ptr[2] & 0x3f) << 12) | ((ptr[3] & 0x3f) << 6) | \
         (ptr[4] & 0x3f); \
    len += 4; \
    } \
  else \
    { \
    ch = ((ch & 0x01) << 30) | ((ptr[1] & 0x3f) << 24) | \
         ((ptr[2] & 0x3f) << 18) | ((ptr[3] & 0x3f) << 12) | \
         ((ptr[4] & 0x3f) << 6) | (ptr[5] & 0x3f); \
    len += 5; \
    } \
  }

/* Fetch the character at ptr into ch without advancing, increasing len by the
number of extra bytes it occupies, for code that already knows it is in UTF-8
mode. This expands to more than one statement, so it must not be the unbraced
body of an if or a loop. */

#define GETCHARLEN(ch, ptr, len) \
  ch = *ptr; \
  if (ch >= 0xc0) GETUTF8LEN(ch, ptr, len);

/* As GETCHARLEN, but multi-byte decoding (and the change to len) happens only
when the caller's "utf8" variable is true. */

#define GETCHARLENTEST(ch, ptr, len) \
  ch = *ptr; \
  if (utf8 && ch >= 0xc0) GETUTF8LEN(ch, ptr, len);
! 569:
! 570: /* If the pointer is not at the start of a character, move it back until
! 571: it is. This is called only in UTF-8 mode - we don't put a test within the macro
! 572: because almost all calls are already within a block of UTF-8 only code. */
! 573:
/* Step ptr backwards while it points at a UTF-8 continuation byte (one of the
form 10xxxxxx). No terminating semicolon is supplied. */

#define BACKCHAR(ptr) while ((*ptr & 0xc0) == 0x80) ptr--
! 575:
! 576: #endif /* SUPPORT_UTF8 */
! 577:
! 578:
! 579: /* In case there is no definition of offsetof() provided - though any proper
! 580: Standard C system should have one. */
! 581:
/* Fallback used only when the system headers have not defined offsetof(): the
address of the field within a notional object placed at address zero. */

#if !defined offsetof
#define offsetof(s_type,member) ((size_t)&(((s_type *)0)->member))
#endif
! 585:
! 586:
! 587: /* These are the public options that can change during matching. */
! 588:
! 589: #define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
! 590:
! 591: /* Private flags containing information about the compiled regex. They used to
! 592: live at the top end of the options word, but that got almost full, so now they
! 593: are in a 16-bit flags word. From release 8.00, PCRE_NOPARTIAL is unused, as
! 594: the restrictions on partial matching have been lifted. It remains for backwards
! 595: compatibility. */
! 596:
! 597: #define PCRE_NOPARTIAL 0x0001 /* can't use partial with this regex */
! 598: #define PCRE_FIRSTSET 0x0002 /* first_byte is set */
! 599: #define PCRE_REQCHSET 0x0004 /* req_byte is set */
! 600: #define PCRE_STARTLINE 0x0008 /* start after \n for multiline */
! 601: #define PCRE_JCHANGED 0x0010 /* j option used in regex */
! 602: #define PCRE_HASCRORLF 0x0020 /* explicit \r or \n in pattern */
! 603:
! 604: /* Options for the "extra" block produced by pcre_study(). */
! 605:
! 606: #define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */
! 607: #define PCRE_STUDY_MINLEN 0x02 /* a minimum length field exists */
! 608:
! 609: /* Masks for identifying the public options that are permitted at compile
! 610: time, run time, or study time, respectively. */
! 611:
! 612: #define PCRE_NEWLINE_BITS (PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_ANY| \
! 613: PCRE_NEWLINE_ANYCRLF)
! 614:
! 615: #define PUBLIC_COMPILE_OPTIONS \
! 616: (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
! 617: PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
! 618: PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
! 619: PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
! 620: PCRE_JAVASCRIPT_COMPAT|PCRE_UCP|PCRE_NO_START_OPTIMIZE)
! 621:
! 622: #define PUBLIC_EXEC_OPTIONS \
! 623: (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \
! 624: PCRE_NO_UTF8_CHECK|PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_NEWLINE_BITS| \
! 625: PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE|PCRE_NO_START_OPTIMIZE)
! 626:
! 627: #define PUBLIC_DFA_EXEC_OPTIONS \
! 628: (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \
! 629: PCRE_NO_UTF8_CHECK|PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_DFA_SHORTEST| \
! 630: PCRE_DFA_RESTART|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
! 631: PCRE_NO_START_OPTIMIZE)
! 632:
! 633: #define PUBLIC_STUDY_OPTIONS 0 /* None defined */
! 634:
! 635: /* Magic number to provide a small check against being handed junk. Also used
! 636: to detect whether a pattern was compiled on a host of different endianness. */
! 637:
! 638: #define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */
! 639:
! 640: /* Negative values for the firstchar and reqchar variables */
! 641:
! 642: #define REQ_UNSET (-2)
! 643: #define REQ_NONE (-1)
! 644:
! 645: /* The maximum remaining length of subject we are prepared to search for a
! 646: req_byte match. */
! 647:
! 648: #define REQ_BYTE_MAX 1000
! 649:
! 650: /* Flags added to firstbyte or reqbyte; a "non-literal" item is either a
! 651: variable-length repeat, or anything other than literal characters. */
! 652:
! 653: #define REQ_CASELESS 0x0100 /* indicates caselessness */
! 654: #define REQ_VARY 0x0200 /* reqbyte followed non-literal item */
! 655:
! 656: /* Miscellaneous definitions. The #ifndef is to pacify compiler warnings in
! 657: environments where these macros are defined elsewhere. Unfortunately, there
! 658: is no way to do the same for the typedef. */
! 659:
! 660: typedef int BOOL;
! 661:
! 662: #ifndef FALSE
! 663: #define FALSE 0
! 664: #define TRUE 1
! 665: #endif
! 666:
! 667: /* If PCRE is to support UTF-8 on EBCDIC platforms, we cannot use normal
! 668: character constants like '*' because the compiler would emit their EBCDIC code,
! 669: which is different from their ASCII/UTF-8 code. Instead we define macros for
! 670: the characters so that they always use the ASCII/UTF-8 code when UTF-8 support
! 671: is enabled. When UTF-8 support is not enabled, the definitions use character
! 672: literals. Both character and string versions of each character are needed, and
! 673: there are some longer strings as well.
! 674:
! 675: This means that, on EBCDIC platforms, the PCRE library can handle either
! 676: EBCDIC, or UTF-8, but not both. To support both in the same compiled library
! 677: would need different lookups depending on whether PCRE_UTF8 was set or not.
! 678: This would make it impossible to use characters in switch/case statements,
! 679: which would reduce performance. For a theoretical use (which nobody has asked
! 680: for) in a minority area (EBCDIC platforms), this is not sensible. Any
! 681: application that did need both could compile two versions of the library, using
! 682: macros to give the functions distinct names. */
! 683:
! 684: #ifndef SUPPORT_UTF8
! 685:
! 686: /* UTF-8 support is not enabled; use the platform-dependent character literals
! 687: so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */
! 688:
! 689: #define CHAR_HT '\t'
! 690: #define CHAR_VT '\v'
! 691: #define CHAR_FF '\f'
! 692: #define CHAR_CR '\r'
! 693: #define CHAR_NL '\n'
! 694: #define CHAR_BS '\b'
! 695: #define CHAR_BEL '\a'
! 696: #ifdef EBCDIC
! 697: #define CHAR_ESC '\047'
! 698: #define CHAR_DEL '\007'
! 699: #else
! 700: #define CHAR_ESC '\033'
! 701: #define CHAR_DEL '\177'
! 702: #endif
! 703:
! 704: #define CHAR_SPACE ' '
! 705: #define CHAR_EXCLAMATION_MARK '!'
! 706: #define CHAR_QUOTATION_MARK '"'
! 707: #define CHAR_NUMBER_SIGN '#'
! 708: #define CHAR_DOLLAR_SIGN '$'
! 709: #define CHAR_PERCENT_SIGN '%'
! 710: #define CHAR_AMPERSAND '&'
! 711: #define CHAR_APOSTROPHE '\''
! 712: #define CHAR_LEFT_PARENTHESIS '('
! 713: #define CHAR_RIGHT_PARENTHESIS ')'
! 714: #define CHAR_ASTERISK '*'
! 715: #define CHAR_PLUS '+'
! 716: #define CHAR_COMMA ','
! 717: #define CHAR_MINUS '-'
! 718: #define CHAR_DOT '.'
! 719: #define CHAR_SLASH '/'
! 720: #define CHAR_0 '0'
! 721: #define CHAR_1 '1'
! 722: #define CHAR_2 '2'
! 723: #define CHAR_3 '3'
! 724: #define CHAR_4 '4'
! 725: #define CHAR_5 '5'
! 726: #define CHAR_6 '6'
! 727: #define CHAR_7 '7'
! 728: #define CHAR_8 '8'
! 729: #define CHAR_9 '9'
! 730: #define CHAR_COLON ':'
! 731: #define CHAR_SEMICOLON ';'
! 732: #define CHAR_LESS_THAN_SIGN '<'
! 733: #define CHAR_EQUALS_SIGN '='
! 734: #define CHAR_GREATER_THAN_SIGN '>'
! 735: #define CHAR_QUESTION_MARK '?'
! 736: #define CHAR_COMMERCIAL_AT '@'
! 737: #define CHAR_A 'A'
! 738: #define CHAR_B 'B'
! 739: #define CHAR_C 'C'
! 740: #define CHAR_D 'D'
! 741: #define CHAR_E 'E'
! 742: #define CHAR_F 'F'
! 743: #define CHAR_G 'G'
! 744: #define CHAR_H 'H'
! 745: #define CHAR_I 'I'
! 746: #define CHAR_J 'J'
! 747: #define CHAR_K 'K'
! 748: #define CHAR_L 'L'
! 749: #define CHAR_M 'M'
! 750: #define CHAR_N 'N'
! 751: #define CHAR_O 'O'
! 752: #define CHAR_P 'P'
! 753: #define CHAR_Q 'Q'
! 754: #define CHAR_R 'R'
! 755: #define CHAR_S 'S'
! 756: #define CHAR_T 'T'
! 757: #define CHAR_U 'U'
! 758: #define CHAR_V 'V'
! 759: #define CHAR_W 'W'
! 760: #define CHAR_X 'X'
! 761: #define CHAR_Y 'Y'
! 762: #define CHAR_Z 'Z'
! 763: #define CHAR_LEFT_SQUARE_BRACKET '['
! 764: #define CHAR_BACKSLASH '\\'
! 765: #define CHAR_RIGHT_SQUARE_BRACKET ']'
! 766: #define CHAR_CIRCUMFLEX_ACCENT '^'
! 767: #define CHAR_UNDERSCORE '_'
! 768: #define CHAR_GRAVE_ACCENT '`'
! 769: #define CHAR_a 'a'
! 770: #define CHAR_b 'b'
! 771: #define CHAR_c 'c'
! 772: #define CHAR_d 'd'
! 773: #define CHAR_e 'e'
! 774: #define CHAR_f 'f'
! 775: #define CHAR_g 'g'
! 776: #define CHAR_h 'h'
! 777: #define CHAR_i 'i'
! 778: #define CHAR_j 'j'
! 779: #define CHAR_k 'k'
! 780: #define CHAR_l 'l'
! 781: #define CHAR_m 'm'
! 782: #define CHAR_n 'n'
! 783: #define CHAR_o 'o'
! 784: #define CHAR_p 'p'
! 785: #define CHAR_q 'q'
! 786: #define CHAR_r 'r'
! 787: #define CHAR_s 's'
! 788: #define CHAR_t 't'
! 789: #define CHAR_u 'u'
! 790: #define CHAR_v 'v'
! 791: #define CHAR_w 'w'
! 792: #define CHAR_x 'x'
! 793: #define CHAR_y 'y'
! 794: #define CHAR_z 'z'
! 795: #define CHAR_LEFT_CURLY_BRACKET '{'
! 796: #define CHAR_VERTICAL_LINE '|'
! 797: #define CHAR_RIGHT_CURLY_BRACKET '}'
! 798: #define CHAR_TILDE '~'
! 799:
! 800: #define STR_HT "\t"
! 801: #define STR_VT "\v"
! 802: #define STR_FF "\f"
! 803: #define STR_CR "\r"
! 804: #define STR_NL "\n"
! 805: #define STR_BS "\b"
! 806: #define STR_BEL "\a"
! 807: #ifdef EBCDIC
! 808: #define STR_ESC "\047"
! 809: #define STR_DEL "\007"
! 810: #else
! 811: #define STR_ESC "\033"
! 812: #define STR_DEL "\177"
! 813: #endif
! 814:
! 815: #define STR_SPACE " "
! 816: #define STR_EXCLAMATION_MARK "!"
! 817: #define STR_QUOTATION_MARK "\""
! 818: #define STR_NUMBER_SIGN "#"
! 819: #define STR_DOLLAR_SIGN "$"
! 820: #define STR_PERCENT_SIGN "%"
! 821: #define STR_AMPERSAND "&"
! 822: #define STR_APOSTROPHE "'"
! 823: #define STR_LEFT_PARENTHESIS "("
! 824: #define STR_RIGHT_PARENTHESIS ")"
! 825: #define STR_ASTERISK "*"
! 826: #define STR_PLUS "+"
! 827: #define STR_COMMA ","
! 828: #define STR_MINUS "-"
! 829: #define STR_DOT "."
! 830: #define STR_SLASH "/"
! 831: #define STR_0 "0"
! 832: #define STR_1 "1"
! 833: #define STR_2 "2"
! 834: #define STR_3 "3"
! 835: #define STR_4 "4"
! 836: #define STR_5 "5"
! 837: #define STR_6 "6"
! 838: #define STR_7 "7"
! 839: #define STR_8 "8"
! 840: #define STR_9 "9"
! 841: #define STR_COLON ":"
! 842: #define STR_SEMICOLON ";"
! 843: #define STR_LESS_THAN_SIGN "<"
! 844: #define STR_EQUALS_SIGN "="
! 845: #define STR_GREATER_THAN_SIGN ">"
! 846: #define STR_QUESTION_MARK "?"
! 847: #define STR_COMMERCIAL_AT "@"
! 848: #define STR_A "A"
! 849: #define STR_B "B"
! 850: #define STR_C "C"
! 851: #define STR_D "D"
! 852: #define STR_E "E"
! 853: #define STR_F "F"
! 854: #define STR_G "G"
! 855: #define STR_H "H"
! 856: #define STR_I "I"
! 857: #define STR_J "J"
! 858: #define STR_K "K"
! 859: #define STR_L "L"
! 860: #define STR_M "M"
! 861: #define STR_N "N"
! 862: #define STR_O "O"
! 863: #define STR_P "P"
! 864: #define STR_Q "Q"
! 865: #define STR_R "R"
! 866: #define STR_S "S"
! 867: #define STR_T "T"
! 868: #define STR_U "U"
! 869: #define STR_V "V"
! 870: #define STR_W "W"
! 871: #define STR_X "X"
! 872: #define STR_Y "Y"
! 873: #define STR_Z "Z"
! 874: #define STR_LEFT_SQUARE_BRACKET "["
! 875: #define STR_BACKSLASH "\\"
! 876: #define STR_RIGHT_SQUARE_BRACKET "]"
! 877: #define STR_CIRCUMFLEX_ACCENT "^"
! 878: #define STR_UNDERSCORE "_"
! 879: #define STR_GRAVE_ACCENT "`"
! 880: #define STR_a "a"
! 881: #define STR_b "b"
! 882: #define STR_c "c"
! 883: #define STR_d "d"
! 884: #define STR_e "e"
! 885: #define STR_f "f"
! 886: #define STR_g "g"
! 887: #define STR_h "h"
! 888: #define STR_i "i"
! 889: #define STR_j "j"
! 890: #define STR_k "k"
! 891: #define STR_l "l"
! 892: #define STR_m "m"
! 893: #define STR_n "n"
! 894: #define STR_o "o"
! 895: #define STR_p "p"
! 896: #define STR_q "q"
! 897: #define STR_r "r"
! 898: #define STR_s "s"
! 899: #define STR_t "t"
! 900: #define STR_u "u"
! 901: #define STR_v "v"
! 902: #define STR_w "w"
! 903: #define STR_x "x"
! 904: #define STR_y "y"
! 905: #define STR_z "z"
! 906: #define STR_LEFT_CURLY_BRACKET "{"
! 907: #define STR_VERTICAL_LINE "|"
! 908: #define STR_RIGHT_CURLY_BRACKET "}"
! 909: #define STR_TILDE "~"
! 910:
! 911: #define STRING_ACCEPT0 "ACCEPT\0"
! 912: #define STRING_COMMIT0 "COMMIT\0"
! 913: #define STRING_F0 "F\0"
! 914: #define STRING_FAIL0 "FAIL\0"
! 915: #define STRING_MARK0 "MARK\0"
! 916: #define STRING_PRUNE0 "PRUNE\0"
! 917: #define STRING_SKIP0 "SKIP\0"
! 918: #define STRING_THEN "THEN"
! 919:
! 920: #define STRING_alpha0 "alpha\0"
! 921: #define STRING_lower0 "lower\0"
! 922: #define STRING_upper0 "upper\0"
! 923: #define STRING_alnum0 "alnum\0"
! 924: #define STRING_ascii0 "ascii\0"
! 925: #define STRING_blank0 "blank\0"
! 926: #define STRING_cntrl0 "cntrl\0"
! 927: #define STRING_digit0 "digit\0"
! 928: #define STRING_graph0 "graph\0"
! 929: #define STRING_print0 "print\0"
! 930: #define STRING_punct0 "punct\0"
! 931: #define STRING_space0 "space\0"
! 932: #define STRING_word0 "word\0"
! 933: #define STRING_xdigit "xdigit"
! 934:
! 935: #define STRING_DEFINE "DEFINE"
! 936:
! 937: #define STRING_CR_RIGHTPAR "CR)"
! 938: #define STRING_LF_RIGHTPAR "LF)"
! 939: #define STRING_CRLF_RIGHTPAR "CRLF)"
! 940: #define STRING_ANY_RIGHTPAR "ANY)"
! 941: #define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)"
! 942: #define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)"
! 943: #define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)"
! 944: #define STRING_UTF8_RIGHTPAR "UTF8)"
! 945: #define STRING_UCP_RIGHTPAR "UCP)"
! 946: #define STRING_NO_START_OPT_RIGHTPAR "NO_START_OPT)"
! 947:
! 948: #else /* SUPPORT_UTF8 */
! 949:
! 950: /* UTF-8 support is enabled; always use UTF-8 (=ASCII) character codes. This
! 951: works in both modes on non-EBCDIC platforms, and on EBCDIC platforms in UTF-8 mode
! 952: only. */
! 953:
! 954: #define CHAR_HT '\011'
! 955: #define CHAR_VT '\013'
! 956: #define CHAR_FF '\014'
! 957: #define CHAR_CR '\015'
! 958: #define CHAR_NL '\012'
! 959: #define CHAR_BS '\010'
! 960: #define CHAR_BEL '\007'
! 961: #define CHAR_ESC '\033'
! 962: #define CHAR_DEL '\177'
! 963:
! 964: #define CHAR_SPACE '\040'
! 965: #define CHAR_EXCLAMATION_MARK '\041'
! 966: #define CHAR_QUOTATION_MARK '\042'
! 967: #define CHAR_NUMBER_SIGN '\043'
! 968: #define CHAR_DOLLAR_SIGN '\044'
! 969: #define CHAR_PERCENT_SIGN '\045'
! 970: #define CHAR_AMPERSAND '\046'
! 971: #define CHAR_APOSTROPHE '\047'
! 972: #define CHAR_LEFT_PARENTHESIS '\050'
! 973: #define CHAR_RIGHT_PARENTHESIS '\051'
! 974: #define CHAR_ASTERISK '\052'
! 975: #define CHAR_PLUS '\053'
! 976: #define CHAR_COMMA '\054'
! 977: #define CHAR_MINUS '\055'
! 978: #define CHAR_DOT '\056'
! 979: #define CHAR_SLASH '\057'
! 980: #define CHAR_0 '\060'
! 981: #define CHAR_1 '\061'
! 982: #define CHAR_2 '\062'
! 983: #define CHAR_3 '\063'
! 984: #define CHAR_4 '\064'
! 985: #define CHAR_5 '\065'
! 986: #define CHAR_6 '\066'
! 987: #define CHAR_7 '\067'
! 988: #define CHAR_8 '\070'
! 989: #define CHAR_9 '\071'
! 990: #define CHAR_COLON '\072'
! 991: #define CHAR_SEMICOLON '\073'
! 992: #define CHAR_LESS_THAN_SIGN '\074'
! 993: #define CHAR_EQUALS_SIGN '\075'
! 994: #define CHAR_GREATER_THAN_SIGN '\076'
! 995: #define CHAR_QUESTION_MARK '\077'
! 996: #define CHAR_COMMERCIAL_AT '\100'
! 997: #define CHAR_A '\101'
! 998: #define CHAR_B '\102'
! 999: #define CHAR_C '\103'
! 1000: #define CHAR_D '\104'
! 1001: #define CHAR_E '\105'
! 1002: #define CHAR_F '\106'
! 1003: #define CHAR_G '\107'
! 1004: #define CHAR_H '\110'
! 1005: #define CHAR_I '\111'
! 1006: #define CHAR_J '\112'
! 1007: #define CHAR_K '\113'
! 1008: #define CHAR_L '\114'
! 1009: #define CHAR_M '\115'
! 1010: #define CHAR_N '\116'
! 1011: #define CHAR_O '\117'
! 1012: #define CHAR_P '\120'
! 1013: #define CHAR_Q '\121'
! 1014: #define CHAR_R '\122'
! 1015: #define CHAR_S '\123'
! 1016: #define CHAR_T '\124'
! 1017: #define CHAR_U '\125'
! 1018: #define CHAR_V '\126'
! 1019: #define CHAR_W '\127'
! 1020: #define CHAR_X '\130'
! 1021: #define CHAR_Y '\131'
! 1022: #define CHAR_Z '\132'
! 1023: #define CHAR_LEFT_SQUARE_BRACKET '\133'
! 1024: #define CHAR_BACKSLASH '\134'
! 1025: #define CHAR_RIGHT_SQUARE_BRACKET '\135'
! 1026: #define CHAR_CIRCUMFLEX_ACCENT '\136'
! 1027: #define CHAR_UNDERSCORE '\137'
! 1028: #define CHAR_GRAVE_ACCENT '\140'
! 1029: #define CHAR_a '\141'
! 1030: #define CHAR_b '\142'
! 1031: #define CHAR_c '\143'
! 1032: #define CHAR_d '\144'
! 1033: #define CHAR_e '\145'
! 1034: #define CHAR_f '\146'
! 1035: #define CHAR_g '\147'
! 1036: #define CHAR_h '\150'
! 1037: #define CHAR_i '\151'
! 1038: #define CHAR_j '\152'
! 1039: #define CHAR_k '\153'
! 1040: #define CHAR_l '\154'
! 1041: #define CHAR_m '\155'
! 1042: #define CHAR_n '\156'
! 1043: #define CHAR_o '\157'
! 1044: #define CHAR_p '\160'
! 1045: #define CHAR_q '\161'
! 1046: #define CHAR_r '\162'
! 1047: #define CHAR_s '\163'
! 1048: #define CHAR_t '\164'
! 1049: #define CHAR_u '\165'
! 1050: #define CHAR_v '\166'
! 1051: #define CHAR_w '\167'
! 1052: #define CHAR_x '\170'
! 1053: #define CHAR_y '\171'
! 1054: #define CHAR_z '\172'
! 1055: #define CHAR_LEFT_CURLY_BRACKET '\173'
! 1056: #define CHAR_VERTICAL_LINE '\174'
! 1057: #define CHAR_RIGHT_CURLY_BRACKET '\175'
! 1058: #define CHAR_TILDE '\176'
! 1059:
! 1060: #define STR_HT "\011"
! 1061: #define STR_VT "\013"
! 1062: #define STR_FF "\014"
! 1063: #define STR_CR "\015"
! 1064: #define STR_NL "\012"
! 1065: #define STR_BS "\010"
! 1066: #define STR_BEL "\007"
! 1067: #define STR_ESC "\033"
! 1068: #define STR_DEL "\177"
! 1069:
! 1070: #define STR_SPACE "\040"
! 1071: #define STR_EXCLAMATION_MARK "\041"
! 1072: #define STR_QUOTATION_MARK "\042"
! 1073: #define STR_NUMBER_SIGN "\043"
! 1074: #define STR_DOLLAR_SIGN "\044"
! 1075: #define STR_PERCENT_SIGN "\045"
! 1076: #define STR_AMPERSAND "\046"
! 1077: #define STR_APOSTROPHE "\047"
! 1078: #define STR_LEFT_PARENTHESIS "\050"
! 1079: #define STR_RIGHT_PARENTHESIS "\051"
! 1080: #define STR_ASTERISK "\052"
! 1081: #define STR_PLUS "\053"
! 1082: #define STR_COMMA "\054"
! 1083: #define STR_MINUS "\055"
! 1084: #define STR_DOT "\056"
! 1085: #define STR_SLASH "\057"
! 1086: #define STR_0 "\060"
! 1087: #define STR_1 "\061"
! 1088: #define STR_2 "\062"
! 1089: #define STR_3 "\063"
! 1090: #define STR_4 "\064"
! 1091: #define STR_5 "\065"
! 1092: #define STR_6 "\066"
! 1093: #define STR_7 "\067"
! 1094: #define STR_8 "\070"
! 1095: #define STR_9 "\071"
! 1096: #define STR_COLON "\072"
! 1097: #define STR_SEMICOLON "\073"
! 1098: #define STR_LESS_THAN_SIGN "\074"
! 1099: #define STR_EQUALS_SIGN "\075"
! 1100: #define STR_GREATER_THAN_SIGN "\076"
! 1101: #define STR_QUESTION_MARK "\077"
! 1102: #define STR_COMMERCIAL_AT "\100"
! 1103: #define STR_A "\101"
! 1104: #define STR_B "\102"
! 1105: #define STR_C "\103"
! 1106: #define STR_D "\104"
! 1107: #define STR_E "\105"
! 1108: #define STR_F "\106"
! 1109: #define STR_G "\107"
! 1110: #define STR_H "\110"
! 1111: #define STR_I "\111"
! 1112: #define STR_J "\112"
! 1113: #define STR_K "\113"
! 1114: #define STR_L "\114"
! 1115: #define STR_M "\115"
! 1116: #define STR_N "\116"
! 1117: #define STR_O "\117"
! 1118: #define STR_P "\120"
! 1119: #define STR_Q "\121"
! 1120: #define STR_R "\122"
! 1121: #define STR_S "\123"
! 1122: #define STR_T "\124"
! 1123: #define STR_U "\125"
! 1124: #define STR_V "\126"
! 1125: #define STR_W "\127"
! 1126: #define STR_X "\130"
! 1127: #define STR_Y "\131"
! 1128: #define STR_Z "\132"
! 1129: #define STR_LEFT_SQUARE_BRACKET "\133"
! 1130: #define STR_BACKSLASH "\134"
! 1131: #define STR_RIGHT_SQUARE_BRACKET "\135"
! 1132: #define STR_CIRCUMFLEX_ACCENT "\136"
! 1133: #define STR_UNDERSCORE "\137"
! 1134: #define STR_GRAVE_ACCENT "\140"
! 1135: #define STR_a "\141"
! 1136: #define STR_b "\142"
! 1137: #define STR_c "\143"
! 1138: #define STR_d "\144"
! 1139: #define STR_e "\145"
! 1140: #define STR_f "\146"
! 1141: #define STR_g "\147"
! 1142: #define STR_h "\150"
! 1143: #define STR_i "\151"
! 1144: #define STR_j "\152"
! 1145: #define STR_k "\153"
! 1146: #define STR_l "\154"
! 1147: #define STR_m "\155"
! 1148: #define STR_n "\156"
! 1149: #define STR_o "\157"
! 1150: #define STR_p "\160"
! 1151: #define STR_q "\161"
! 1152: #define STR_r "\162"
! 1153: #define STR_s "\163"
! 1154: #define STR_t "\164"
! 1155: #define STR_u "\165"
! 1156: #define STR_v "\166"
! 1157: #define STR_w "\167"
! 1158: #define STR_x "\170"
! 1159: #define STR_y "\171"
! 1160: #define STR_z "\172"
! 1161: #define STR_LEFT_CURLY_BRACKET "\173"
! 1162: #define STR_VERTICAL_LINE "\174"
! 1163: #define STR_RIGHT_CURLY_BRACKET "\175"
! 1164: #define STR_TILDE "\176"
! 1165:
! 1166: #define STRING_ACCEPT0 STR_A STR_C STR_C STR_E STR_P STR_T "\0"
! 1167: #define STRING_COMMIT0 STR_C STR_O STR_M STR_M STR_I STR_T "\0"
! 1168: #define STRING_F0 STR_F "\0"
! 1169: #define STRING_FAIL0 STR_F STR_A STR_I STR_L "\0"
! 1170: #define STRING_MARK0 STR_M STR_A STR_R STR_K "\0"
! 1171: #define STRING_PRUNE0 STR_P STR_R STR_U STR_N STR_E "\0"
! 1172: #define STRING_SKIP0 STR_S STR_K STR_I STR_P "\0"
! 1173: #define STRING_THEN STR_T STR_H STR_E STR_N
! 1174:
! 1175: #define STRING_alpha0 STR_a STR_l STR_p STR_h STR_a "\0"
! 1176: #define STRING_lower0 STR_l STR_o STR_w STR_e STR_r "\0"
! 1177: #define STRING_upper0 STR_u STR_p STR_p STR_e STR_r "\0"
! 1178: #define STRING_alnum0 STR_a STR_l STR_n STR_u STR_m "\0"
! 1179: #define STRING_ascii0 STR_a STR_s STR_c STR_i STR_i "\0"
! 1180: #define STRING_blank0 STR_b STR_l STR_a STR_n STR_k "\0"
! 1181: #define STRING_cntrl0 STR_c STR_n STR_t STR_r STR_l "\0"
! 1182: #define STRING_digit0 STR_d STR_i STR_g STR_i STR_t "\0"
! 1183: #define STRING_graph0 STR_g STR_r STR_a STR_p STR_h "\0"
! 1184: #define STRING_print0 STR_p STR_r STR_i STR_n STR_t "\0"
! 1185: #define STRING_punct0 STR_p STR_u STR_n STR_c STR_t "\0"
! 1186: #define STRING_space0 STR_s STR_p STR_a STR_c STR_e "\0"
! 1187: #define STRING_word0 STR_w STR_o STR_r STR_d "\0"
! 1188: #define STRING_xdigit STR_x STR_d STR_i STR_g STR_i STR_t
! 1189:
! 1190: #define STRING_DEFINE STR_D STR_E STR_F STR_I STR_N STR_E
! 1191:
! 1192: #define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS
! 1193: #define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS
! 1194: #define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
! 1195: #define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS
! 1196: #define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
! 1197: #define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
! 1198: #define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS
! 1199: #define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS
! 1200: #define STRING_UCP_RIGHTPAR STR_U STR_C STR_P STR_RIGHT_PARENTHESIS
! 1201: #define STRING_NO_START_OPT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS
! 1202:
! 1203: #endif /* SUPPORT_UTF8 */
! 1204:
! 1205: /* Escape items that are just an encoding of a particular data value. */
! 1206:
! 1207: #ifndef ESC_e
! 1208: #define ESC_e CHAR_ESC
! 1209: #endif /* ESC_e */
! 1210:
! 1211: #ifndef ESC_f
! 1212: #define ESC_f CHAR_FF
! 1213: #endif /* ESC_f */
! 1214:
! 1215: #ifndef ESC_n
! 1216: #define ESC_n CHAR_NL
! 1217: #endif /* ESC_n */
! 1218:
! 1219: #ifndef ESC_r
! 1220: #define ESC_r CHAR_CR
! 1221: #endif /* ESC_r */
! 1222:
! 1223: /* We can't officially use ESC_t because it is a POSIX reserved identifier
! 1224: (presumably because of all the others like size_t). */
! 1225:
! 1226: #ifndef ESC_tee
! 1227: #define ESC_tee CHAR_HT
! 1228: #endif /* ESC_tee */
! 1229:
! 1230: /* Codes for different types of Unicode property */
! 1231:
! 1232: #define PT_ANY 0 /* Any property - matches all chars */
! 1233: #define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */
! 1234: #define PT_GC 2 /* Specified general characteristic (e.g. L) */
! 1235: #define PT_PC 3 /* Specified particular characteristic (e.g. Lu) */
! 1236: #define PT_SC 4 /* Script (e.g. Han) */
! 1237: #define PT_ALNUM 5 /* Alphanumeric - the union of L and N */
! 1238: #define PT_SPACE 6 /* Perl space - Z plus 9,10,12,13 (HT, NL, FF, CR) */
! 1239: #define PT_PXSPACE 7 /* POSIX space - Z plus 9,10,11,12,13 (as above, plus VT = 11) */
! 1240: #define PT_WORD 8 /* Word - L plus N plus underscore */
! 1241:
! 1242: /* Flag bits and data types for the extended class (OP_XCLASS) for classes that
! 1243: contain UTF-8 characters with values greater than 255. */
! 1244:
! 1245: #define XCL_NOT 0x01 /* Flag: this is a negative class */
! 1246: #define XCL_MAP 0x02 /* Flag: a 32-byte map is present */
! 1247:
! 1248: #define XCL_END 0 /* Marks end of individual items */
! 1249: #define XCL_SINGLE 1 /* Single item (one multibyte char) follows */
! 1250: #define XCL_RANGE 2 /* A range (two multibyte chars) follows */
! 1251: #define XCL_PROP 3 /* Unicode property (2-byte property code follows) */
! 1252: #define XCL_NOTPROP 4 /* Unicode inverted property (2-byte property code follows) */
! 1253:
! 1254: /* These are escaped items that aren't just an encoding of a particular data
! 1255: value such as \n. They must have non-zero values, as check_escape() returns
! 1256: their negation. Also, they must appear in the same order as in the opcode
! 1257: definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it
! 1258: corresponds to "." in DOTALL mode rather than an escape sequence. It is also
! 1259: used for [^] in JavaScript compatibility mode. In non-DOTALL mode, "." behaves
! 1260: like \N.
! 1261:
! 1262: The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.
! 1263: when PCRE_UCP is set, when replacement of \d etc by \p sequences is required.
! 1264: They must be contiguous, and remain in order so that the replacements can be
! 1265: looked up from a table.
! 1266:
! 1267: The final escape must be ESC_REF as subsequent values are used for
! 1268: backreferences (\1, \2, \3, etc). There are two tests in the code for an escape
! 1269: greater than ESC_b and less than ESC_Z to detect the types that may be
! 1270: repeated. These are the types that consume characters. If any new escapes are
! 1271: put in between that don't consume a character, that code will have to change.
! 1272: */
! 1273:
! 1274: enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s,
! 1275: ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H, /* ESC_dum (13) is the dummy that lines up with OP_ALLANY */
! 1276: ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, /* ESC_z (24) is the last value that must match an opcode (OP_EOD) */
! 1277: ESC_E, ESC_Q, ESC_g, ESC_k,
! 1278: ESC_DU, ESC_du, ESC_SU, ESC_su, ESC_WU, ESC_wu, /* Used when PCRE_UCP is set; keep contiguous and in this order */
! 1279: ESC_REF }; /* Must stay last: subsequent values are used for back references */
! 1280:
! 1281: /* Opcode table: Starting from 1 (i.e. after OP_END), the values up to
! 1282: OP_EOD must correspond in order to the list of escapes immediately above.
! 1283:
! 1284: *** NOTE NOTE NOTE *** Whenever this list is updated, the two macro definitions
! 1285: that follow must also be updated to match. There are also tables called
! 1286: "coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */
! 1287:
! 1288: enum {
! 1289: OP_END, /* 0 End of pattern */
! 1290:
! 1291: /* Values corresponding to backslashed metacharacters */
! 1292:
! 1293: OP_SOD, /* 1 Start of data: \A */
! 1294: OP_SOM, /* 2 Start of match (subject + offset): \G */
! 1295: OP_SET_SOM, /* 3 Set start of match (\K) */
! 1296: OP_NOT_WORD_BOUNDARY, /* 4 \B */
! 1297: OP_WORD_BOUNDARY, /* 5 \b */
! 1298: OP_NOT_DIGIT, /* 6 \D */
! 1299: OP_DIGIT, /* 7 \d */
! 1300: OP_NOT_WHITESPACE, /* 8 \S */
! 1301: OP_WHITESPACE, /* 9 \s */
! 1302: OP_NOT_WORDCHAR, /* 10 \W */
! 1303: OP_WORDCHAR, /* 11 \w */
! 1304: OP_ANY, /* 12 Match any character except newline */
! 1305: OP_ALLANY, /* 13 Match any character */
! 1306: OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */
! 1307: OP_NOTPROP, /* 15 \P (not Unicode property) */
! 1308: OP_PROP, /* 16 \p (Unicode property) */
! 1309: OP_ANYNL, /* 17 \R (any newline sequence) */
! 1310: OP_NOT_HSPACE, /* 18 \H (not horizontal whitespace) */
! 1311: OP_HSPACE, /* 19 \h (horizontal whitespace) */
! 1312: OP_NOT_VSPACE, /* 20 \V (not vertical whitespace) */
! 1313: OP_VSPACE, /* 21 \v (vertical whitespace) */
! 1314: OP_EXTUNI, /* 22 \X (extended Unicode sequence) */
! 1315: OP_EODN, /* 23 End of data or \n at end of data: \Z. */
! 1316: OP_EOD, /* 24 End of data: \z */
! 1317:
! 1318: OP_OPT, /* 25 Set runtime options */
! 1319: OP_CIRC, /* 26 Start of line - varies with multiline switch */
! 1320: OP_DOLL, /* 27 End of line - varies with multiline switch */
! 1321: OP_CHAR, /* 28 Match one character, casefully */
! 1322: OP_CHARNC, /* 29 Match one character, caselessly */
! 1323: OP_NOT, /* 30 Match one character, not the following one */
! 1324:
! 1325: OP_STAR, /* 31 The maximizing and minimizing versions of */
! 1326: OP_MINSTAR, /* 32 these six opcodes must come in pairs, with */
! 1327: OP_PLUS, /* 33 the minimizing one second. */
! 1328: OP_MINPLUS, /* 34 This first set applies to single characters.*/
! 1329: OP_QUERY, /* 35 */
! 1330: OP_MINQUERY, /* 36 */
! 1331:
! 1332: OP_UPTO, /* 37 From 0 to n matches */
! 1333: OP_MINUPTO, /* 38 */
! 1334: OP_EXACT, /* 39 Exactly n matches */
! 1335:
! 1336: OP_POSSTAR, /* 40 Possessified star */
! 1337: OP_POSPLUS, /* 41 Possessified plus */
! 1338: OP_POSQUERY, /* 42 Possessified query */
! 1339: OP_POSUPTO, /* 43 Possessified upto */
! 1340:
! 1341: OP_NOTSTAR, /* 44 The maximizing and minimizing versions of */
! 1342: OP_NOTMINSTAR, /* 45 these six opcodes must come in pairs, with */
! 1343: OP_NOTPLUS, /* 46 the minimizing one second. They must be in */
! 1344: OP_NOTMINPLUS, /* 47 exactly the same order as those above. */
! 1345: OP_NOTQUERY, /* 48 This set applies to "not" single characters. */
! 1346: OP_NOTMINQUERY, /* 49 */
! 1347:
! 1348: OP_NOTUPTO, /* 50 From 0 to n matches */
! 1349: OP_NOTMINUPTO, /* 51 */
! 1350: OP_NOTEXACT, /* 52 Exactly n matches */
! 1351:
! 1352: OP_NOTPOSSTAR, /* 53 Possessified versions */
! 1353: OP_NOTPOSPLUS, /* 54 */
! 1354: OP_NOTPOSQUERY, /* 55 */
! 1355: OP_NOTPOSUPTO, /* 56 */
! 1356:
! 1357: OP_TYPESTAR, /* 57 The maximizing and minimizing versions of */
! 1358: OP_TYPEMINSTAR, /* 58 these six opcodes must come in pairs, with */
! 1359: OP_TYPEPLUS, /* 59 the minimizing one second. These codes must */
! 1360: OP_TYPEMINPLUS, /* 60 be in exactly the same order as those above. */
! 1361: OP_TYPEQUERY, /* 61 This set applies to character types such as \d */
! 1362: OP_TYPEMINQUERY, /* 62 */
! 1363:
! 1364: OP_TYPEUPTO, /* 63 From 0 to n matches */
! 1365: OP_TYPEMINUPTO, /* 64 */
! 1366: OP_TYPEEXACT, /* 65 Exactly n matches */
! 1367:
! 1368: OP_TYPEPOSSTAR, /* 66 Possessified versions */
! 1369: OP_TYPEPOSPLUS, /* 67 */
! 1370: OP_TYPEPOSQUERY, /* 68 */
! 1371: OP_TYPEPOSUPTO, /* 69 */
! 1372:
! 1373: OP_CRSTAR, /* 70 The maximizing and minimizing versions of */
! 1374: OP_CRMINSTAR, /* 71 all these opcodes must come in pairs, with */
! 1375: OP_CRPLUS, /* 72 the minimizing one second. These codes must */
! 1376: OP_CRMINPLUS, /* 73 be in exactly the same order as those above. */
! 1377: OP_CRQUERY, /* 74 These are for character classes and back refs */
! 1378: OP_CRMINQUERY, /* 75 */
! 1379: OP_CRRANGE, /* 76 These are different to the three sets above. */
! 1380: OP_CRMINRANGE, /* 77 */
! 1381:
! 1382: OP_CLASS, /* 78 Match a character class, chars < 256 only */
! 1383: OP_NCLASS, /* 79 Same, but the bitmap was created from a negative
! 1384: class - the difference is relevant only when a UTF-8
! 1385: character > 255 is encountered. */
! 1386:
! 1387: OP_XCLASS, /* 80 Extended class for handling UTF-8 chars within the
! 1388: class. This does both positive and negative. */
! 1389:
! 1390: OP_REF, /* 81 Match a back reference */
! 1391: OP_RECURSE, /* 82 Match a numbered subpattern (possibly recursive) */
! 1392: OP_CALLOUT, /* 83 Call out to external function if provided */
! 1393:
! 1394: OP_ALT, /* 84 Start of alternation */
! 1395: OP_KET, /* 85 End of group that doesn't have an unbounded repeat */
! 1396: OP_KETRMAX, /* 86 These two must remain together and in this */
! 1397: OP_KETRMIN, /* 87 order. They are for groups that repeat for ever. */
! 1398:
! 1399: /* The assertions must come before BRA, CBRA, ONCE, and COND.*/
! 1400:
! 1401: OP_ASSERT, /* 88 Positive lookahead */
! 1402: OP_ASSERT_NOT, /* 89 Negative lookahead */
! 1403: OP_ASSERTBACK, /* 90 Positive lookbehind */
! 1404: OP_ASSERTBACK_NOT, /* 91 Negative lookbehind */
! 1405: OP_REVERSE, /* 92 Move pointer back - used in lookbehind assertions */
! 1406:
! 1407: /* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE first,
! 1408: as there's a test for >= ONCE for a subpattern that isn't an assertion. */
! 1409:
! 1410: OP_ONCE, /* 93 Atomic group */
! 1411: OP_BRA, /* 94 Start of non-capturing bracket */
! 1412: OP_CBRA, /* 95 Start of capturing bracket */
! 1413: OP_COND, /* 96 Conditional group */
! 1414:
! 1415: /* These three must follow the previous three, in the same order. There's a
! 1416: check for >= SBRA to distinguish the two sets. */
! 1417:
! 1418: OP_SBRA, /* 97 Start of non-capturing bracket, check empty */
! 1419: OP_SCBRA, /* 98 Start of capturing bracket, check empty */
! 1420: OP_SCOND, /* 99 Conditional group, check empty */
! 1421:
! 1422: /* The next two pairs must (respectively) be kept together. */
! 1423:
! 1424: OP_CREF, /* 100 Used to hold a capture number as condition */
! 1425: OP_NCREF, /* 101 Same, but generated by a name reference */
! 1426: OP_RREF, /* 102 Used to hold a recursion number as condition */
! 1427: OP_NRREF, /* 103 Same, but generated by a name reference */
! 1428: OP_DEF, /* 104 The DEFINE condition */
! 1429:
! 1430: OP_BRAZERO, /* 105 These two must remain together and in this */
! 1431: OP_BRAMINZERO, /* 106 order. */
! 1432:
! 1433: /* These are backtracking control verbs */
! 1434:
! 1435: OP_MARK, /* 107 always has an argument */
! 1436: OP_PRUNE, /* 108 */
! 1437: OP_PRUNE_ARG, /* 109 same, but with argument */
! 1438: OP_SKIP, /* 110 */
! 1439: OP_SKIP_ARG, /* 111 same, but with argument */
! 1440: OP_THEN, /* 112 */
! 1441: OP_THEN_ARG, /* 113 same, but with argument */
! 1442: OP_COMMIT, /* 114 */
! 1443:
! 1444: /* These are forced failure and success verbs */
! 1445:
! 1446: OP_FAIL, /* 115 */
! 1447: OP_ACCEPT, /* 116 */
! 1448: OP_CLOSE, /* 117 Used before OP_ACCEPT to close open captures */
! 1449:
! 1450: /* This is used to skip a subpattern with a {0} quantifier */
! 1451:
! 1452: OP_SKIPZERO, /* 118 */
! 1453:
! 1454: /* This is not an opcode, but is used to check that tables indexed by opcode
! 1455: are the correct length, in order to catch updating errors - there have been
! 1456: some in the past. */
! 1457:
! 1458: OP_TABLE_LENGTH
! 1459: };
! 1460:
! 1461: /* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro
! 1462: definitions that follow must also be updated to match. There are also tables
! 1463: called "coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */
! 1464:
! 1465:
! 1466: /* This macro defines textual names for all the opcodes. These are used only
! 1467: for debugging. The macro is referenced only in pcre_printint.c. */
! 1468:
! 1469: #define OP_NAME_LIST \
! 1470: "End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d", \
! 1471: "\\S", "\\s", "\\W", "\\w", "Any", "AllAny", "Anybyte", \
! 1472: "notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v", \
! 1473: "extuni", "\\Z", "\\z", \
! 1474: "Opt", "^", "$", "char", "charnc", "not", \
! 1475: "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
! 1476: "*+","++", "?+", "{", \
! 1477: "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
! 1478: "*+","++", "?+", "{", \
! 1479: "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
! 1480: "*+","++", "?+", "{", \
! 1481: "*", "*?", "+", "+?", "?", "??", "{", "{", \
! 1482: "class", "nclass", "xclass", "Ref", "Recurse", "Callout", \
! 1483: "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \
! 1484: "AssertB", "AssertB not", "Reverse", \
! 1485: "Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \
! 1486: "Cond ref", "Cond nref", "Cond rec", "Cond nrec", "Cond def", \
! 1487: "Brazero", "Braminzero", \
! 1488: "*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \
! 1489: "*THEN", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \
! 1490: "Close", "Skip zero" /* one name per opcode, OP_END through OP_SKIPZERO, in enum order */
! 1491:
! 1492:
! 1493: /* This macro defines the length of fixed length operations in the compiled
! 1494: regex. The lengths are used when searching for specific things, and also in the
! 1495: debugging printing of a compiled regex. We use a macro so that it can be
! 1496: defined close to the definitions of the opcodes themselves.
! 1497:
! 1498: As things have been extended, some of these are no longer fixed lengths, but are
! 1499: minima instead. For example, the length of a single-character repeat may vary
! 1500: in UTF-8 mode. The code that uses this table must know about such things. */
! 1501:
! 1502: #define OP_LENGTHS \
! 1503: 1, /* End */ \
! 1504: 1, 1, 1, 1, 1, /* \A, \G, \K, \B, \b */ \
! 1505: 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ \
! 1506: 1, 1, 1, /* Any, AllAny, Anybyte */ \
! 1507: 3, 3, /* \P, \p */ \
! 1508: 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ \
! 1509: 1, /* \X */ \
! 1510: 1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \
! 1511: 2, /* Char - the minimum length */ \
! 1512: 2, /* Charnc - the minimum length */ \
! 1513: 2, /* not */ \
! 1514: /* Positive single-char repeats ** These are */ \
! 1515: 2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \
! 1516: 4, 4, 4, /* upto, minupto, exact ** UTF-8 mode */ \
! 1517: 2, 2, 2, 4, /* *+, ++, ?+, upto+ */ \
! 1518: /* Negative single-char repeats - only for chars < 256 */ \
! 1519: 2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \
! 1520: 4, 4, 4, /* NOT upto, minupto, exact */ \
! 1521: 2, 2, 2, 4, /* NOT *+, ++, ?+, upto+ */ \
! 1522: /* Positive type repeats */ \
! 1523: 2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \
! 1524: 4, 4, 4, /* Type upto, minupto, exact */ \
! 1525: 2, 2, 2, 4, /* Type *+, ++, ?+, upto+ */ \
! 1526: /* Character class & ref repeats */ \
! 1527: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \
! 1528: 5, 5, /* CRRANGE, CRMINRANGE */ \
! 1529: 33, /* CLASS */ \
! 1530: 33, /* NCLASS */ \
! 1531: 0, /* XCLASS - variable length */ \
! 1532: 3, /* REF */ \
! 1533: 1+LINK_SIZE, /* RECURSE */ \
! 1534: 2+2*LINK_SIZE, /* CALLOUT */ \
! 1535: 1+LINK_SIZE, /* Alt */ \
! 1536: 1+LINK_SIZE, /* Ket */ \
! 1537: 1+LINK_SIZE, /* KetRmax */ \
! 1538: 1+LINK_SIZE, /* KetRmin */ \
! 1539: 1+LINK_SIZE, /* Assert */ \
! 1540: 1+LINK_SIZE, /* Assert not */ \
! 1541: 1+LINK_SIZE, /* Assert behind */ \
! 1542: 1+LINK_SIZE, /* Assert behind not */ \
! 1543: 1+LINK_SIZE, /* Reverse */ \
! 1544: 1+LINK_SIZE, /* ONCE */ \
! 1545: 1+LINK_SIZE, /* BRA */ \
! 1546: 3+LINK_SIZE, /* CBRA */ \
! 1547: 1+LINK_SIZE, /* COND */ \
! 1548: 1+LINK_SIZE, /* SBRA */ \
! 1549: 3+LINK_SIZE, /* SCBRA */ \
! 1550: 1+LINK_SIZE, /* SCOND */ \
! 1551: 3, 3, /* CREF, NCREF */ \
! 1552: 3, 3, /* RREF, NRREF */ \
! 1553: 1, /* DEF */ \
! 1554: 1, 1, /* BRAZERO, BRAMINZERO */ \
! 1555: 3, 1, 3, /* MARK, PRUNE, PRUNE_ARG */ \
! 1556: 1, 3, /* SKIP, SKIP_ARG */ \
! 1557: 1+LINK_SIZE, 3+LINK_SIZE, /* THEN, THEN_ARG */ \
! 1558: 1, 1, 1, 3, 1 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
! 1559:
! 1560:
! 1561: /* A magic value for OP_RREF and OP_NRREF to indicate the "any recursion"
! 1562: condition. */
! 1563:
! 1564: #define RREF_ANY 0xffff /* NOTE(review): presumably the largest value of the 2-byte RREF/NRREF operand (OP_LENGTHS gives both length 3) - confirm */
! 1565:
! 1566: /* Compile time error code numbers. They are given names so that they can more
! 1567: easily be tracked. When a new number is added, the table called eint in
! 1568: pcreposix.c must be updated. */
! 1569:
! 1570: enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
! 1571: ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19,
! 1572: ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29,
! 1573: ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
! 1574: ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
! 1575: ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
! 1576: ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68,
! 1577: ERRCOUNT }; /* ERRCOUNT is not an error code: it equals the number of codes above, so it must stay last */
! 1578:
! 1579: /* The real format of the start of the pcre block; the index of names and the
! 1580: code vector run on as long as necessary after the end. We store an explicit
! 1581: offset to the name table so that if a regex is compiled on one host, saved, and
! 1582: then run on another where the size of pointers is different, all might still
! 1583: be well. For the case of compiled-on-4 and run-on-8, we include an extra
! 1584: pointer that is always NULL. For future-proofing, a few dummy fields were
! 1585: originally included - even though you can never get this planning right - but
! 1586: there is only one left now.
! 1587:
! 1588: NOTE NOTE NOTE:
! 1589: Because people can now save and re-use compiled patterns, any additions to this
! 1590: structure should be made at the end, and something earlier (e.g. a new
! 1591: flag in the options or one of the dummy fields) should indicate that the new
! 1592: fields are present. Currently PCRE always sets the dummy fields to zero.
! 1593: NOTE NOTE NOTE
! 1594: */
! 1595:
! 1596: typedef struct real_pcre {
! 1597: pcre_uint32 magic_number; /* NOTE(review): presumably identifies a compiled pattern block - confirm */
! 1598: pcre_uint32 size; /* Total that was malloced */
! 1599: pcre_uint32 options; /* Public options */
! 1600: pcre_uint16 flags; /* Private flags */
! 1601: pcre_uint16 dummy1; /* For future use; dummy fields are currently always set to zero */
! 1602: pcre_uint16 top_bracket; /* NOTE(review): presumably the highest capturing bracket number - confirm */
! 1603: pcre_uint16 top_backref; /* NOTE(review): presumably the highest back reference number - confirm */
! 1604: pcre_uint16 first_byte; /* NOTE(review): meaning not visible in this part of the header - check the compile code */
! 1605: pcre_uint16 req_byte; /* NOTE(review): meaning not visible in this part of the header - check the compile code */
! 1606: pcre_uint16 name_table_offset; /* Offset to name table that follows; stored explicitly so a saved pattern still works where pointer sizes differ */
! 1607: pcre_uint16 name_entry_size; /* Size of any name items */
! 1608: pcre_uint16 name_count; /* Number of name items */
! 1609: pcre_uint16 ref_count; /* Reference count */
! 1610:
! 1611: const unsigned char *tables; /* Pointer to tables or NULL for std */
! 1612: const unsigned char *nullpad; /* NULL padding: always NULL; the extra pointer for the compiled-on-4, run-on-8 case */
! 1613: } real_pcre;
! 1614:
! 1615: /* The format of the block used to store data from pcre_study(). The same
! 1616: remark (see NOTE above) about extending this structure applies: add new fields
! 1617: only at the end. */
! 1618: typedef struct pcre_study_data {
! 1619: pcre_uint32 size; /* Total that was malloced */
! 1620: pcre_uint32 flags; /* Private flags */
! 1621: uschar start_bits[32]; /* Starting char bits (32 elements; presumably one bit per byte value - confirm) */
! 1622: pcre_uint32 minlength; /* Minimum subject length */
! 1623: } pcre_study_data;
! 1624:
! 1624: /* Structure for building a chain of open capturing subpatterns during
! 1625: compiling, so that instructions to close them can be compiled when (*ACCEPT) is
! 1626: encountered. This is also used to identify subpatterns that contain recursive
! 1627: back references to themselves, so that they can be made atomic. */
! 1628:
! 1630: typedef struct open_capitem {
! 1631: struct open_capitem *next; /* Chain link to the next item */
! 1632: pcre_uint16 number; /* Capture number */
! 1633: pcre_uint16 flag; /* Set TRUE if recursive back ref (a boolean held in a 16-bit field) */
! 1634: } open_capitem;
! 1635:
! 1636: /* Structure for passing "static" information around between the functions
! 1637: doing the compiling, so that they are thread-safe. */
! 1638:
! 1639: typedef struct compile_data {
! 1640: const uschar *lcc; /* Points to lower casing table */
! 1641: const uschar *fcc; /* Points to case-flipping table */
! 1642: const uschar *cbits; /* Points to character type table */
! 1643: const uschar *ctypes; /* Points to table of type maps */
! 1644: const uschar *start_workspace;/* The start of working space */
! 1645: const uschar *start_code; /* The start of the compiled code */
! 1646: const uschar *start_pattern; /* The start of the pattern */
! 1647: const uschar *end_pattern; /* The end of the pattern */
! 1648: open_capitem *open_caps; /* Chain of open capture items (linked through open_capitem.next) */
! 1649: uschar *hwm; /* High watermark of workspace */
! 1650: uschar *name_table; /* The name/number table */
! 1651: int names_found; /* Number of entries so far */
! 1652: int name_entry_size; /* Size of each entry */
! 1653: int bracount; /* Count of capturing parens as we compile */
! 1654: int final_bracount; /* Saved value after first pass */
! 1655: int top_backref; /* Maximum back reference */
! 1656: unsigned int backref_map; /* Bitmap of low back refs */
! 1657: int external_options; /* External (initial) options */
! 1658: int external_flags; /* External flag bits to be set */
! 1659: int req_varyopt; /* "After variable item" flag for reqbyte */
! 1660: BOOL had_accept; /* (*ACCEPT) encountered */
! 1661: BOOL check_lookbehind; /* Lookbehinds need later checking */
! 1662: int nltype; /* Newline type */
! 1663: int nllen; /* Newline string length */
! 1664: uschar nl[4]; /* Newline string when fixed length (presumably nllen bytes are used - confirm) */
! 1665: } compile_data;
! 1666:
! 1667: /* Structure for maintaining a chain of pointers to the currently incomplete
! 1668: branches, for testing for left recursion. */
! 1669:
! 1670: typedef struct branch_chain {
! 1671: struct branch_chain *outer; /* Link to the next entry out in the chain (presumably NULL at the outermost - confirm) */
! 1672: uschar *current_branch; /* The currently incomplete branch for this entry */
! 1673: } branch_chain;
! 1674:
! 1675: /* Structure for items in a linked list that represents an explicit recursive
! 1676: call within the pattern. The list is reached through match_data.recursive. */
! 1677:
! 1678: typedef struct recursion_info {
! 1679: struct recursion_info *prevrec; /* Previous recursion record (or NULL) */
! 1680: int group_num; /* Number of group that was called */
! 1681: const uschar *after_call; /* "Return value": points after the call in the expr */
! 1682: int *offset_save; /* Pointer to start of saved offsets */
! 1683: int saved_max; /* Number of saved offsets */
! 1684: int save_offset_top; /* Current value of offset_top */
! 1685: } recursion_info;
! 1686:
! 1687: /* Structure for building a chain of data for holding the values of the subject
! 1688: pointer at the start of each subpattern, so as to detect when an empty string
! 1689: has been matched by a subpattern - to break infinite loops. */
! 1690:
! 1691: typedef struct eptrblock {
! 1692: struct eptrblock *epb_prev; /* Previous block in the chain */
! 1693: USPTR epb_saved_eptr; /* Subject pointer value saved at the start of a subpattern */
! 1694: } eptrblock;
! 1695:
! 1696:
! 1696: /* Structure for passing "static" information around between the functions
! 1697: doing traditional NFA matching, so that they are thread-safe. */
! 1698:
! 1700: typedef struct match_data {
! 1701: unsigned long int match_call_count; /* Running count of match calls (presumably checked against match_limit - confirm) */
! 1702: unsigned long int match_limit; /* Limit on match calls - NOTE(review): confirm exact semantics */
! 1703: unsigned long int match_limit_recursion; /* Limit on recursion - NOTE(review): confirm exact semantics */
! 1704: int *offset_vector; /* Offset vector */
! 1705: int offset_end; /* One past the end */
! 1706: int offset_max; /* The maximum usable for return data */
! 1707: int nltype; /* Newline type */
! 1708: int nllen; /* Newline string length */
! 1709: int name_count; /* Number of names in name table */
! 1710: int name_entry_size; /* Size of entry in names table */
! 1711: uschar *name_table; /* Table of names */
! 1712: uschar nl[4]; /* Newline string when fixed */
! 1713: const uschar *lcc; /* Points to lower casing table */
! 1714: const uschar *ctypes; /* Points to table of type maps */
! 1715: BOOL offset_overflow; /* Set if too many extractions */
! 1716: BOOL notbol; /* NOTBOL flag */
! 1717: BOOL noteol; /* NOTEOL flag */
! 1718: BOOL utf8; /* UTF8 flag */
! 1719: BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */
! 1720: BOOL use_ucp; /* PCRE_UCP flag */
! 1721: BOOL endonly; /* Dollar not before final \n */
! 1722: BOOL notempty; /* Empty string match not wanted */
! 1723: BOOL notempty_atstart; /* Empty string match at start not wanted */
! 1724: BOOL hitend; /* Hit the end of the subject at some point */
! 1725: BOOL bsr_anycrlf; /* \R is just any CRLF, not full Unicode */
! 1726: const uschar *start_code; /* For use when recursing */
! 1727: USPTR start_subject; /* Start of the subject string */
! 1728: USPTR end_subject; /* End of the subject string */
! 1729: USPTR start_match_ptr; /* Start of matched string */
! 1730: USPTR end_match_ptr; /* Subject position at end match */
! 1731: USPTR start_used_ptr; /* Earliest consulted character */
! 1732: int partial; /* PARTIAL options */
! 1733: int end_offset_top; /* Highwater mark at end of match */
! 1734: int capture_last; /* Most recent capture number */
! 1735: int start_offset; /* The start offset value */
! 1736: eptrblock *eptrchain; /* Chain of eptrblocks for tail recursions */
! 1737: int eptrn; /* Next free eptrblock */
! 1738: recursion_info *recursive; /* Linked list of recursion data (chained through prevrec) */
! 1739: void *callout_data; /* To pass back to callouts */
! 1740: const uschar *mark; /* Mark pointer to pass back */
! 1741: } match_data;
! 1742:
! 1743: /* A similar structure (compare match_data) is used for the same purpose by the
! 1744: DFA matching functions. */
! 1745:
! 1746: typedef struct dfa_match_data {
! 1747: const uschar *start_code; /* Start of the compiled pattern */
! 1748: const uschar *start_subject; /* Start of the subject string */
! 1749: const uschar *end_subject; /* End of subject string */
! 1750: const uschar *start_used_ptr; /* Earliest consulted character */
! 1751: const uschar *tables; /* Character tables */
! 1752: int start_offset; /* The start offset value */
! 1753: int moptions; /* Match options */
! 1754: int poptions; /* Pattern options */
! 1755: int nltype; /* Newline type */
! 1756: int nllen; /* Newline string length */
! 1757: uschar nl[4]; /* Newline string when fixed */
! 1758: void *callout_data; /* To pass back to callouts */
! 1759: } dfa_match_data;
! 1760:
! 1761: /* Bit definitions for entries in the pcre_ctypes table. Each value is a distinct single bit; 0x20 and 0x40 are not assigned here. */
! 1762:
! 1763: #define ctype_space 0x01
! 1764: #define ctype_letter 0x02
! 1765: #define ctype_digit 0x04
! 1766: #define ctype_xdigit 0x08
! 1767: #define ctype_word 0x10 /* alphanumeric or '_' */
! 1768: #define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */
! 1769:
! 1770: /* Offsets for the bitmap tables in pcre_cbits. Each table contains a set
! 1771: of bits for a class map. Some classes are built by combining these tables.
! 1772: Consecutive offsets are 32 apart, and the ten tables end at cbit_length. */
! 1773: #define cbit_space 0 /* [:space:] or \s */
! 1774: #define cbit_xdigit 32 /* [:xdigit:] */
! 1775: #define cbit_digit 64 /* [:digit:] or \d */
! 1776: #define cbit_upper 96 /* [:upper:] */
! 1777: #define cbit_lower 128 /* [:lower:] */
! 1778: #define cbit_word 160 /* [:word:] or \w */
! 1779: #define cbit_graph 192 /* [:graph:] */
! 1780: #define cbit_print 224 /* [:print:] */
! 1781: #define cbit_punct 256 /* [:punct:] */
! 1782: #define cbit_cntrl 288 /* [:cntrl:] */
! 1783: #define cbit_length 320 /* Length of the cbits table */
! 1784:
! 1785: /* Offsets of the various tables from the base tables pointer, and
! 1786: total length. */
! 1787:
! 1788: #define lcc_offset 0
! 1789: #define fcc_offset 256
! 1790: #define cbits_offset 512
! 1791: #define ctypes_offset (cbits_offset + cbit_length) /* 512 + 320 = 832 */
! 1792: #define tables_length (ctypes_offset + 256) /* 832 + 256 = 1088 */
! 1793:
! 1794: /* Layout of the UCP type table that translates property names into types and
! 1795: codes. Each entry used to point directly to a name, but to reduce the number of
! 1796: relocations in shared libraries, it now has an offset into a single string
! 1797: instead. */
! 1798:
! 1799: typedef struct {
! 1800: pcre_uint16 name_offset; /* Offset of the property name within the single string (presumably _pcre_utt_names - confirm) */
! 1801: pcre_uint16 type; /* The type that the name translates to */
! 1802: pcre_uint16 value; /* The code that the name translates to */
! 1803: } ucp_type_table;
! 1804:
! 1805:
! 1806: /* Internal shared data tables. These are tables that are used by more than one
! 1807: of the exported public functions. They have to be "external" in the C sense,
! 1808: but are not part of the PCRE public API. The data for these tables is in the
! 1809: pcre_tables.c module. */
! 1810:
! 1811: extern const int _pcre_utf8_table1[];
! 1812: extern const int _pcre_utf8_table2[];
! 1813: extern const int _pcre_utf8_table3[];
! 1814: extern const uschar _pcre_utf8_table4[];
! 1815:
! 1816: extern const int _pcre_utf8_table1_size; /* presumably the element count of _pcre_utf8_table1 - confirm */
! 1817:
! 1818: extern const char _pcre_utt_names[]; /* presumably the single string indexed by ucp_type_table.name_offset - confirm */
! 1819: extern const ucp_type_table _pcre_utt[];
! 1820: extern const int _pcre_utt_size; /* presumably the element count of _pcre_utt - confirm */
! 1821:
! 1822: extern const uschar _pcre_default_tables[];
! 1823:
! 1824: extern const uschar _pcre_OP_lengths[];
! 1825:
! 1826:
! 1827: /* Internal shared functions. These are functions that are used by more than
! 1828: one of the exported public functions. They have to be "external" in the C
! 1829: sense, but are not part of the PCRE public API. The prototypes below omit parameter names. */
! 1830:
! 1831: extern const uschar *_pcre_find_bracket(const uschar *, BOOL, int);
! 1832: extern BOOL _pcre_is_newline(USPTR, int, USPTR, int *, BOOL);
! 1833: extern int _pcre_ord2utf8(int, uschar *);
! 1834: extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *,
! 1835: const pcre_study_data *, pcre_study_data *);
! 1836: extern int _pcre_valid_utf8(USPTR, int);
! 1837: extern BOOL _pcre_was_newline(USPTR, int, USPTR, int *, BOOL);
! 1838: extern BOOL _pcre_xclass(int, const uschar *);
! 1839:
! 1840:
! 1841: /* Unicode character database (UCD) */
! 1842:
! 1843: typedef struct {
! 1844: uschar script; /* Read by UCD_SCRIPT() */
! 1845: uschar chartype; /* Read by UCD_CHARTYPE(); UCD_CATEGORY() uses it to index _pcre_ucp_gentype */
! 1846: pcre_int32 other_case; /* Value that UCD_OTHERCASE() adds to the character */
! 1847: } ucd_record;
! 1848:
! 1849: extern const ucd_record _pcre_ucd_records[]; /* Records selected by GET_UCD() */
! 1850: extern const uschar _pcre_ucd_stage1[]; /* Indexed by ch / UCD_BLOCK_SIZE in GET_UCD() */
! 1851: extern const pcre_uint16 _pcre_ucd_stage2[]; /* Yields the index into _pcre_ucd_records in GET_UCD() */
! 1852: extern const int _pcre_ucp_gentype[]; /* Indexed by chartype in UCD_CATEGORY() */
! 1853:
! 1854:
/* UCD access macros.

GET_UCD(ch) yields a pointer to the ucd_record for character ch by means of a
two-stage lookup: _pcre_ucd_stage1 is indexed by ch / UCD_BLOCK_SIZE, the
result is scaled by UCD_BLOCK_SIZE and combined with ch % UCD_BLOCK_SIZE to
index _pcre_ucd_stage2, and that value in turn indexes _pcre_ucd_records.

Every use of the argument is parenthesized, so that an expression such as
"c + 1" or "c & 0xff" can be passed without operator-precedence surprises.
The argument is still evaluated more than once in every macro, so it must not
have side effects (do not pass "*p++", for example). */

#define UCD_BLOCK_SIZE 128
#define GET_UCD(ch) (_pcre_ucd_records + \
        _pcre_ucd_stage2[_pcre_ucd_stage1[(ch) / UCD_BLOCK_SIZE] * \
        UCD_BLOCK_SIZE + (ch) % UCD_BLOCK_SIZE])

#define UCD_CHARTYPE(ch)  GET_UCD(ch)->chartype
#define UCD_SCRIPT(ch)    GET_UCD(ch)->script
#define UCD_CATEGORY(ch)  _pcre_ucp_gentype[UCD_CHARTYPE(ch)]
#define UCD_OTHERCASE(ch) ((ch) + GET_UCD(ch)->other_case)
! 1866:
! 1867: #endif
! 1868:
! 1869: /* End of pcre_internal.h */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>