Annotation of embedaddon/pcre/pcre_internal.h, revision 1.1.1.1
1.1 misho 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5:
6: /* PCRE is a library of functions to support regular expressions whose syntax
7: and semantics are as close as possible to those of the Perl 5 language.
8:
9: Written by Philip Hazel
10: Copyright (c) 1997-2011 University of Cambridge
11:
12: -----------------------------------------------------------------------------
13: Redistribution and use in source and binary forms, with or without
14: modification, are permitted provided that the following conditions are met:
15:
16: * Redistributions of source code must retain the above copyright notice,
17: this list of conditions and the following disclaimer.
18:
19: * Redistributions in binary form must reproduce the above copyright
20: notice, this list of conditions and the following disclaimer in the
21: documentation and/or other materials provided with the distribution.
22:
23: * Neither the name of the University of Cambridge nor the names of its
24: contributors may be used to endorse or promote products derived from
25: this software without specific prior written permission.
26:
27: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37: POSSIBILITY OF SUCH DAMAGE.
38: -----------------------------------------------------------------------------
39: */
40:
41: /* This header contains definitions that are shared between the different
42: modules, but which are not relevant to the exported API. This includes some
43: functions whose names all begin with "_pcre_". */
44:
45: #ifndef PCRE_INTERNAL_H
46: #define PCRE_INTERNAL_H
47:
48: /* Define PCRE_DEBUG to get debugging output on stdout. */
49:
50: #if 0
51: #define PCRE_DEBUG
52: #endif
53:
54: /* We do not support both EBCDIC and UTF-8 at the same time. The "configure"
55: script prevents both being selected, but not everybody uses "configure". */
56:
57: #if defined EBCDIC && defined SUPPORT_UTF8
58: #error The use of both EBCDIC and SUPPORT_UTF8 is not supported.
59: #endif
60:
61: /* If SUPPORT_UCP is defined, SUPPORT_UTF8 must also be defined. The
62: "configure" script ensures this, but not everybody uses "configure". */
63:
64: #if defined SUPPORT_UCP && !defined SUPPORT_UTF8
65: #define SUPPORT_UTF8 1
66: #endif
67:
68: /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
69: inline, and there are *still* stupid compilers about that don't like indented
70: pre-processor statements, or at least there were when I first wrote this. After
71: all, it had only been about 10 years then...
72:
73: It turns out that the Mac Debugging.h header also defines the macro DPRINTF, so
74: be absolutely sure we get our version. */
75:
/* Debug tracing hook. The argument is a complete, parenthesized printf()
argument list, e.g. DPRINTF(("n=%d\n", n)). Any earlier definition (such as the
one in the Mac Debugging.h header) is discarded first. Without PCRE_DEBUG the
macro expands to nothing, so its arguments are never evaluated. */

#undef DPRINTF
#if !defined(PCRE_DEBUG)
#define DPRINTF(args)         /* Nothing */
#else
#define DPRINTF(args) printf args
#endif
82:
83:
84: /* Standard C headers plus the external interface definition. The only time
85: setjmp and stdarg are used is when NO_RECURSE is set. */
86:
87: #include <ctype.h>
88: #include <limits.h>
89: #include <stddef.h>
90: #include <stdio.h>
91: #include <stdlib.h>
92: #include <string.h>
93:
94: /* When compiling a DLL for Windows, the exported symbols have to be declared
95: using some MS magic. I found some useful information on this web page:
96: http://msdn2.microsoft.com/en-us/library/y4h7bcy6(VS.80).aspx. According to the
97: information there, using __declspec(dllexport) without "extern" we have a
98: definition; with "extern" we have a declaration. The settings here override the
99: setting in pcre.h (which is included below); it defines only PCRE_EXP_DECL,
100: which is all that is needed for applications (they just import the symbols). We
101: use:
102:
103: PCRE_EXP_DECL for declarations
104: PCRE_EXP_DEFN for definitions of exported functions
105: PCRE_EXP_DATA_DEFN for definitions of exported variables
106:
107: The reason for the two DEFN macros is that in non-Windows environments, one
108: does not want to have "extern" before variable definitions because it leads to
109: compiler warnings. So we distinguish between functions and variables. In
110: Windows, the two should always be the same.
111:
112: The reason for wrapping this in #ifndef PCRE_EXP_DECL is so that pcretest,
113: which is an application, but needs to import this file in order to "peek" at
114: internals, can #include pcre.h first to get an application's-eye view.
115:
116: In principle, people compiling for non-Windows, non-Unix-like (i.e. uncommon,
117: special-purpose environments) might want to stick other stuff in front of
118: exported symbols. That's why, in the non-Windows case, we set PCRE_EXP_DEFN and
119: PCRE_EXP_DATA_DEFN only if they are not already set. */
120:
/* Export decoration macros. Nothing here is touched if PCRE_EXP_DECL was
already defined (for example by including pcre.h first).

  Windows DLL build:    all three macros carry __declspec(dllexport)
  Windows static build: plain "extern" declaration, empty definition macros
  Everything else:      "extern" (or extern "C" under C++); the two DEFN
                        macros are set only if not already provided. */

#ifndef PCRE_EXP_DECL
#  if defined(_WIN32) && !defined(PCRE_STATIC)
#    define PCRE_EXP_DECL       extern __declspec(dllexport)
#    define PCRE_EXP_DEFN       __declspec(dllexport)
#    define PCRE_EXP_DATA_DEFN  __declspec(dllexport)
#  elif defined(_WIN32)
#    define PCRE_EXP_DECL       extern
#    define PCRE_EXP_DEFN
#    define PCRE_EXP_DATA_DEFN
#  else
#    if defined(__cplusplus)
#      define PCRE_EXP_DECL     extern "C"
#    else
#      define PCRE_EXP_DECL     extern
#    endif
#    if !defined(PCRE_EXP_DEFN)
#      define PCRE_EXP_DEFN     PCRE_EXP_DECL
#    endif
#    if !defined(PCRE_EXP_DATA_DEFN)
#      define PCRE_EXP_DATA_DEFN
#    endif
#  endif
#endif
146:
147: /* When compiling with the MSVC compiler, it is sometimes necessary to include
148: a "calling convention" before exported function names. (This is secondhand
149: information; I know nothing about MSVC myself). For example, something like
150:
151: void __cdecl function(....)
152:
153: might be needed. In order to make this easy, all the exported functions have
154: PCRE_CALL_CONVENTION just before their names. It is rarely needed; if not
155: set, we ensure here that it has no effect. */
156:
/* Default to no calling-convention keyword unless the build supplied one. */
#if !defined(PCRE_CALL_CONVENTION)
#  define PCRE_CALL_CONVENTION
#endif
160:
161: /* We need to have types that specify unsigned 16-bit and 32-bit integers. We
162: cannot determine these outside the compilation (e.g. by running a program as
163: part of "configure") because PCRE is often cross-compiled for use on other
164: systems. Instead we make use of the maximum sizes that are available at
165: preprocessor time in standard C environments. */
166:
/* 16-bit integer types: pick whichever of short/int has an unsigned maximum
of exactly 0xffff, judged from <limits.h> at preprocessing time. */
#if USHRT_MAX == 0xffff
  typedef short pcre_int16;
  typedef unsigned short pcre_uint16;
#elif UINT_MAX == 0xffff
  typedef int pcre_int16;
  typedef unsigned int pcre_uint16;
#else
#error Cannot determine a type for 16-bit unsigned integers
#endif
176:
/* 32-bit integer types: pick whichever of int/long has an unsigned maximum
of exactly 0xffffffff, judged from <limits.h> at preprocessing time. */
#if UINT_MAX == 0xffffffff
  typedef int pcre_int32;
  typedef unsigned int pcre_uint32;
#elif ULONG_MAX == 0xffffffff
  typedef long pcre_int32;
  typedef unsigned long pcre_uint32;
#else
#error Cannot determine a type for 32-bit unsigned integers
#endif
186:
187: /* When checking for integer overflow in pcre_compile(), we need to handle
188: large integers. If a 64-bit integer type is available, we can use that.
189: Otherwise we have to cast to double, which of course requires floating point
190: arithmetic. Handle this by defining a macro for the appropriate type. If
191: stdint.h is available, include it; it may define INT64_MAX. Systems that do not
192: have stdint.h (e.g. Solaris) may have inttypes.h. The macro int64_t may be set
193: by "configure". */
194:
/* Pull in whichever fixed-width integer header the configuration reports. */
#if HAVE_STDINT_H
#  include <stdint.h>
#elif HAVE_INTTYPES_H
#  include <inttypes.h>
#endif

/* INT64_OR_DOUBLE is int64_t when a 64-bit type is known to exist (INT64_MAX
is defined, or int64_t itself was supplied as a macro); otherwise double. */
#if !defined(INT64_MAX) && !defined(int64_t)
#  define INT64_OR_DOUBLE double
#else
#  define INT64_OR_DOUBLE int64_t
#endif
206:
207: /* All character handling must be done as unsigned characters. Otherwise there
208: are problems with top-bit-set characters and functions such as isspace().
209: However, we leave the interface to the outside world as char *, because that
210: should make things easier for callers. We define a short type for unsigned char
211: to save lots of typing. I tried "uchar", but it causes problems on Digital
212: Unix, where it is defined in sys/types, so use "uschar" instead. */
213:
214: typedef unsigned char uschar;
215:
216: /* This is an unsigned int value that no character can ever have. UTF-8
217: characters only go up to 0x7fffffff (though Unicode doesn't go beyond
218: 0x0010ffff). */
219:
220: #define NOTACHAR 0xffffffff
221:
222: /* PCRE is able to support several different kinds of newline (CR, LF, CRLF,
223: "any" and "anycrlf" at present). The following macros are used to package up
224: testing for newlines. NLBLOCK, PSSTART, and PSEND are defined in the various
225: modules to indicate in which datablock the parameters exist, and what the
226: start/end of string field names are. */
227:
228: #define NLTYPE_FIXED 0 /* Newline is a fixed length string */
229: #define NLTYPE_ANY 1 /* Newline is any Unicode line ending */
230: #define NLTYPE_ANYCRLF 2 /* Newline is CR, LF, or CRLF */
231:
/* Test whether a newline starts at position p. Relies on the including module
having defined NLBLOCK and PSEND, and on a variable named utf8 being in scope.

For a fixed newline (NLTYPE_FIXED) the bytes at p are compared directly with
NLBLOCK->nl[], after checking that nllen bytes remain before PSEND. For the
other newline types the decision is delegated to _pcre_is_newline(), which is
handed the address of NLBLOCK->nllen (presumably so that it can record the
length of the newline it found - confirm against its definition). */

#define IS_NEWLINE(p) \
  ((NLBLOCK->nltype == NLTYPE_FIXED)? \
    ((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \
     (p)[0] == NLBLOCK->nl[0] && \
     (NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1])) \
  : \
    ((p) < NLBLOCK->PSEND && \
     _pcre_is_newline((p), NLBLOCK->nltype, NLBLOCK->PSEND, \
       &(NLBLOCK->nllen), utf8)))
245:
/* Test whether a newline ends immediately before position p. Relies on the
including module having defined NLBLOCK and PSSTART, and on a variable named
utf8 being in scope.

For a fixed newline (NLTYPE_FIXED) the nllen bytes preceding p are compared
directly with NLBLOCK->nl[], after checking that p is at least nllen bytes past
PSSTART. For the other newline types the decision is delegated to
_pcre_was_newline(), which is handed the address of NLBLOCK->nllen. */

#define WAS_NEWLINE(p) \
  ((NLBLOCK->nltype == NLTYPE_FIXED)? \
    ((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \
     (p)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \
     (NLBLOCK->nllen == 1 || (p)[-NLBLOCK->nllen+1] == NLBLOCK->nl[1])) \
  : \
    ((p) > NLBLOCK->PSSTART && \
     _pcre_was_newline((p), NLBLOCK->nltype, NLBLOCK->PSSTART, \
       &(NLBLOCK->nllen), utf8)))
259:
260: /* When PCRE is compiled as a C++ library, the subject pointer can be replaced
261: with a custom type. This makes it possible, for example, to allow pcre_exec()
262: to process subject strings that are discontinuous by using a smart pointer
263: class. It must always be possible to inspect all of the subject string in
264: pcre_exec() because of the way it backtracks. Two macros are required in the
265: normal case, for sign-unspecified and unsigned char pointers. The former is
266: used for the external interface and appears in pcre.h, which is why its name
267: must begin with PCRE_. */
268:
/* Subject pointer types: plain byte pointers by default, or the caller's
CUSTOM_SUBJECT_PTR type for both when that macro is defined. */
#ifndef CUSTOM_SUBJECT_PTR
#  define PCRE_SPTR const char *
#  define USPTR const unsigned char *
#else
#  define PCRE_SPTR CUSTOM_SUBJECT_PTR
#  define USPTR CUSTOM_SUBJECT_PTR
#endif
276:
277:
278:
279: /* Include the public PCRE header and the definitions of UCP character property
280: values. */
281:
282: #include "pcre.h"
283: #include "ucp.h"
284:
285: /* When compiling for use with the Virtual Pascal compiler, these functions
286: need to have their names changed. PCRE must be compiled with the -DVPCOMPAT
287: option on the command line. */
288:
289: #ifdef VPCOMPAT
290: #define strlen(s) _strlen(s)
291: #define strncmp(s1,s2,m) _strncmp(s1,s2,m)
292: #define memcmp(s,c,n) _memcmp(s,c,n)
293: #define memcpy(d,s,n) _memcpy(d,s,n)
294: #define memmove(d,s,n) _memmove(d,s,n)
295: #define memset(s,c,n) _memset(s,c,n)
296: #else /* VPCOMPAT */
297:
298: /* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
299: define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY
300: is set. Otherwise, include an emulating function for those systems that have
301: neither (there are some non-Unix environments where this is the case). */
302:
303: #ifndef HAVE_MEMMOVE
304: #undef memmove /* some systems may have a macro */
305: #ifdef HAVE_BCOPY
306: #define memmove(a, b, c) bcopy(b, a, c)
307: #else /* HAVE_BCOPY */
/* Emulation of memmove() for systems that have neither memmove() nor bcopy().
Copies n bytes from s to d, coping with overlapping regions by choosing the
copy direction from the relative position of the two pointers.

Arguments:
  d         destination
  s         source
  n         number of bytes to copy

Returns:    d
*/

static void *
pcre_memmove(void *d, const void *s, size_t n)
{
unsigned char *out = (unsigned char *)d;
const unsigned char *in = (const unsigned char *)s;
size_t k;

if (out > in)
  {
  /* Destination is above the source: copy from the top down so that source
  bytes are read before they can be overwritten. */
  k = n;
  while (k > 0)
    {
    k--;
    out[k] = in[k];
    }
  }
else
  {
  /* Destination is at or below the source: a plain ascending copy is safe. */
  for (k = 0; k < n; k++) out[k] = in[k];
  }

return d;
}
327: #define memmove(a, b, c) pcre_memmove(a, b, c)
328: #endif /* not HAVE_BCOPY */
329: #endif /* not HAVE_MEMMOVE */
330: #endif /* not VPCOMPAT */
331:
332:
333: /* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored
334: in big-endian order) by default. These are used, for example, to link from the
335: start of a subpattern to its alternatives and its end. The use of 2 bytes per
336: offset limits the size of the compiled regex to around 64K, which is big enough
337: for almost everybody. However, I received a request for an even bigger limit.
338: For this reason, and also to make the code easier to maintain, the storing and
339: loading of offsets from the byte string is now handled by the macros that are
340: defined here.
341:
342: The macros are controlled by the value of LINK_SIZE. This defaults to 2 in
343: the config.h file, but can be overridden by using -D on the command line. This
344: is automated on Unix systems via the "configure" command. */
345:
346: #if LINK_SIZE == 2
347:
348: #define PUT(a,n,d) \
349: (a[n] = (d) >> 8), \
350: (a[(n)+1] = (d) & 255)
351:
352: #define GET(a,n) \
353: (((a)[n] << 8) | (a)[(n)+1])
354:
355: #define MAX_PATTERN_SIZE (1 << 16)
356:
357:
358: #elif LINK_SIZE == 3
359:
360: #define PUT(a,n,d) \
361: (a[n] = (d) >> 16), \
362: (a[(n)+1] = (d) >> 8), \
363: (a[(n)+2] = (d) & 255)
364:
365: #define GET(a,n) \
366: (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])
367:
368: #define MAX_PATTERN_SIZE (1 << 24)
369:
370:
371: #elif LINK_SIZE == 4
372:
373: #define PUT(a,n,d) \
374: (a[n] = (d) >> 24), \
375: (a[(n)+1] = (d) >> 16), \
376: (a[(n)+2] = (d) >> 8), \
377: (a[(n)+3] = (d) & 255)
378:
379: #define GET(a,n) \
380: (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])
381:
382: #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */
383:
384:
385: #else
386: #error LINK_SIZE must be either 2, 3, or 4
387: #endif
388:
389:
390: /* Convenience macro defined in terms of the others */
391:
392: #define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE
393:
394:
/* PCRE uses some other 2-byte quantities that do not change when the size of
offsets changes. These are used for repeat counts and for other things such as
capturing parenthesis numbers in back references.

PUT2(a,n,d) stores d at a[n] and a[n+1], most significant byte first; GET2(a,n)
reads it back; PUT2INC additionally advances a by 2.

PUT2 is a single parenthesized comma expression rather than two statements
separated by a semicolon. That keeps it one statement when it is the body of an
unbraced if/else or loop, and makes PUT2INC one expression covering both byte
stores and the pointer increment. */

#define PUT2(a,n,d)   \
  (((a)[n] = (d) >> 8), \
   ((a)[(n)+1] = (d) & 255))

#define GET2(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

#define PUT2INC(a,n,d)  (PUT2(a,n,d), (a) += 2)
407:
408:
409: /* When UTF-8 encoding is being used, a character is no longer just a single
410: byte. The macros for character handling generate simple sequences when used in
411: byte-mode, and more complicated ones for UTF-8 characters. GETCHARLENTEST is
412: not used when UTF-8 is not supported, so it is not defined, and BACKCHAR should
413: never be called in byte mode. To make sure they can never even appear when
414: UTF-8 support is omitted, we don't even define them. */
415:
416: #ifndef SUPPORT_UTF8
417: #define GETCHAR(c, eptr) c = *eptr;
418: #define GETCHARTEST(c, eptr) c = *eptr;
419: #define GETCHARINC(c, eptr) c = *eptr++;
420: #define GETCHARINCTEST(c, eptr) c = *eptr++;
421: #define GETCHARLEN(c, eptr, len) c = *eptr;
422: /* #define GETCHARLENTEST(c, eptr, len) */
423: /* #define BACKCHAR(eptr) */
424:
425: #else /* SUPPORT_UTF8 */
426:
427: /* These macros were originally written in the form of loops that used data
428: from the tables whose names start with _pcre_utf8_table. They were rewritten by
429: a user so as not to use loops, because in some environments this gives a
430: significant performance advantage, and it seems never to do any harm. */
431:
/* Base macro to decode a multi-byte UTF-8 character without moving the
pointer. On entry c holds the first byte and eptr points at that byte; on exit
c holds the assembled value. The number of continuation bytes (1 to 5) is taken
from the high bits of c, and each continuation byte contributes its low six
bits. The macros in this file that use it do so only when c >= 0xc0. No
validity checks are made on the continuation bytes. */

#define GETUTF8(c, eptr) \
    { \
    if (!(c & 0x20)) \
      { \
      c = (eptr[1] & 0x3f) | ((c & 0x1f) << 6); \
      } \
    else if (!(c & 0x10)) \
      { \
      c = (eptr[2] & 0x3f) | ((eptr[1] & 0x3f) << 6) | ((c & 0x0f) << 12); \
      } \
    else if (!(c & 0x08)) \
      { \
      c = (eptr[3] & 0x3f) | ((eptr[2] & 0x3f) << 6) | \
          ((eptr[1] & 0x3f) << 12) | ((c & 0x07) << 18); \
      } \
    else if (!(c & 0x04)) \
      { \
      c = (eptr[4] & 0x3f) | ((eptr[3] & 0x3f) << 6) | \
          ((eptr[2] & 0x3f) << 12) | ((eptr[1] & 0x3f) << 18) | \
          ((c & 0x03) << 24); \
      } \
    else \
      { \
      c = (eptr[5] & 0x3f) | ((eptr[4] & 0x3f) << 6) | \
          ((eptr[3] & 0x3f) << 12) | ((eptr[2] & 0x3f) << 18) | \
          ((eptr[1] & 0x3f) << 24) | ((c & 0x01) << 30); \
      } \
    }
453:
454: /* Get the next UTF-8 character, not advancing the pointer. This is called when
455: we know we are in UTF-8 mode. */
456:
457: #define GETCHAR(c, eptr) \
458: c = *eptr; \
459: if (c >= 0xc0) GETUTF8(c, eptr);
460:
461: /* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
462: pointer. */
463:
464: #define GETCHARTEST(c, eptr) \
465: c = *eptr; \
466: if (utf8 && c >= 0xc0) GETUTF8(c, eptr);
467:
/* Base macro to decode a multi-byte UTF-8 character, advancing the pointer.
On entry c holds the first byte and eptr already points just past it; on exit c
holds the assembled value and eptr has been moved past every continuation byte
(1 to 5 of them, as indicated by the high bits of c). No validity checks are
made on the continuation bytes. */

#define GETUTF8INC(c, eptr) \
    { \
    if (!(c & 0x20)) \
      { \
      c = (eptr[0] & 0x3f) | ((c & 0x1f) << 6); \
      eptr += 1; \
      } \
    else if (!(c & 0x10)) \
      { \
      c = (eptr[1] & 0x3f) | ((eptr[0] & 0x3f) << 6) | ((c & 0x0f) << 12); \
      eptr += 2; \
      } \
    else if (!(c & 0x08)) \
      { \
      c = (eptr[2] & 0x3f) | ((eptr[1] & 0x3f) << 6) | \
          ((eptr[0] & 0x3f) << 12) | ((c & 0x07) << 18); \
      eptr += 3; \
      } \
    else if (!(c & 0x04)) \
      { \
      c = (eptr[3] & 0x3f) | ((eptr[2] & 0x3f) << 6) | \
          ((eptr[1] & 0x3f) << 12) | ((eptr[0] & 0x3f) << 18) | \
          ((c & 0x03) << 24); \
      eptr += 4; \
      } \
    else \
      { \
      c = (eptr[4] & 0x3f) | ((eptr[3] & 0x3f) << 6) | \
          ((eptr[2] & 0x3f) << 12) | ((eptr[1] & 0x3f) << 18) | \
          ((eptr[0] & 0x3f) << 24) | ((c & 0x01) << 30); \
      eptr += 5; \
      } \
    }
501:
502: /* Get the next UTF-8 character, advancing the pointer. This is called when we
503: know we are in UTF-8 mode. */
504:
505: #define GETCHARINC(c, eptr) \
506: c = *eptr++; \
507: if (c >= 0xc0) GETUTF8INC(c, eptr);
508:
509: /* Get the next character, testing for UTF-8 mode, and advancing the pointer.
510: This is called when we don't know if we are in UTF-8 mode. */
511:
512: #define GETCHARINCTEST(c, eptr) \
513: c = *eptr++; \
514: if (utf8 && c >= 0xc0) GETUTF8INC(c, eptr);
515:
/* Base macro to decode a multi-byte UTF-8 character without moving the
pointer, while adding the number of continuation bytes to len. On entry c
holds the first byte and eptr points at that byte; on exit c holds the
assembled value and len has grown by 1 to 5. No validity checks are made on the
continuation bytes. */

#define GETUTF8LEN(c, eptr, len) \
    { \
    if (!(c & 0x20)) \
      { \
      c = (eptr[1] & 0x3f) | ((c & 0x1f) << 6); \
      len += 1; \
      } \
    else if (!(c & 0x10)) \
      { \
      c = (eptr[2] & 0x3f) | ((eptr[1] & 0x3f) << 6) | ((c & 0x0f) << 12); \
      len += 2; \
      } \
    else if (!(c & 0x08)) \
      { \
      c = (eptr[3] & 0x3f) | ((eptr[2] & 0x3f) << 6) | \
          ((eptr[1] & 0x3f) << 12) | ((c & 0x07) << 18); \
      len += 3; \
      } \
    else if (!(c & 0x04)) \
      { \
      c = (eptr[4] & 0x3f) | ((eptr[3] & 0x3f) << 6) | \
          ((eptr[2] & 0x3f) << 12) | ((eptr[1] & 0x3f) << 18) | \
          ((c & 0x03) << 24); \
      len += 4; \
      } \
    else \
      { \
      c = (eptr[5] & 0x3f) | ((eptr[4] & 0x3f) << 6) | \
          ((eptr[3] & 0x3f) << 12) | ((eptr[2] & 0x3f) << 18) | \
          ((eptr[1] & 0x3f) << 24) | ((c & 0x01) << 30); \
      len += 5; \
      } \
    }
552:
553: /* Get the next UTF-8 character, not advancing the pointer, incrementing length
554: if there are extra bytes. This is called when we know we are in UTF-8 mode. */
555:
556: #define GETCHARLEN(c, eptr, len) \
557: c = *eptr; \
558: if (c >= 0xc0) GETUTF8LEN(c, eptr, len);
559:
560: /* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
561: pointer, incrementing length if there are extra bytes. This is called when we
562: do not know if we are in UTF-8 mode. */
563:
564: #define GETCHARLENTEST(c, eptr, len) \
565: c = *eptr; \
566: if (utf8 && c >= 0xc0) GETUTF8LEN(c, eptr, len);
567:
568: /* If the pointer is not at the start of a character, move it back until
569: it is. This is called only in UTF-8 mode - we don't put a test within the macro
570: because almost all calls are already within a block of UTF-8 only code. */
571:
572: #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--
573:
574: #endif /* SUPPORT_UTF8 */
575:
576:
/* Fallback for environments whose headers do not supply offsetof(): compute
the byte offset of a member by taking its address within an imaginary object
located at address zero. Any proper Standard C system already defines it, in
which case this definition is skipped. */

#if !defined(offsetof)
#define offsetof(type, member) ((size_t)&(((type *)0)->member))
#endif
583:
584:
585: /* Private flags containing information about the compiled regex. They used to
586: live at the top end of the options word, but that got almost full, so now they
587: are in a 16-bit flags word. From release 8.00, PCRE_NOPARTIAL is unused, as
588: the restrictions on partial matching have been lifted. It remains for backwards
589: compatibility. */
590:
591: #define PCRE_NOPARTIAL 0x0001 /* can't use partial with this regex */
592: #define PCRE_FIRSTSET 0x0002 /* first_byte is set */
593: #define PCRE_REQCHSET 0x0004 /* req_byte is set */
594: #define PCRE_STARTLINE 0x0008 /* start after \n for multiline */
595: #define PCRE_JCHANGED 0x0010 /* j option used in regex */
596: #define PCRE_HASCRORLF 0x0020 /* explicit \r or \n in pattern */
597: #define PCRE_HASTHEN 0x0040 /* pattern contains (*THEN) */
598:
599: /* Flags for the "extra" block produced by pcre_study(). */
600:
601: #define PCRE_STUDY_MAPPED 0x0001 /* a map of starting chars exists */
602: #define PCRE_STUDY_MINLEN 0x0002 /* a minimum length field exists */
603:
604: /* Masks for identifying the public options that are permitted at compile
605: time, run time, or study time, respectively. */
606:
607: #define PCRE_NEWLINE_BITS (PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_ANY| \
608: PCRE_NEWLINE_ANYCRLF)
609:
610: #define PUBLIC_COMPILE_OPTIONS \
611: (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
612: PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
613: PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
614: PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
615: PCRE_JAVASCRIPT_COMPAT|PCRE_UCP|PCRE_NO_START_OPTIMIZE)
616:
617: #define PUBLIC_EXEC_OPTIONS \
618: (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \
619: PCRE_NO_UTF8_CHECK|PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_NEWLINE_BITS| \
620: PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE|PCRE_NO_START_OPTIMIZE)
621:
622: #define PUBLIC_DFA_EXEC_OPTIONS \
623: (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \
624: PCRE_NO_UTF8_CHECK|PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_DFA_SHORTEST| \
625: PCRE_DFA_RESTART|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
626: PCRE_NO_START_OPTIMIZE)
627:
628: #define PUBLIC_STUDY_OPTIONS \
629: PCRE_STUDY_JIT_COMPILE
630:
631: /* Magic number to provide a small check against being handed junk. Also used
632: to detect whether a pattern was compiled on a host of different endianness. */
633:
634: #define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */
635:
636: /* Negative values for the firstchar and reqchar variables */
637:
638: #define REQ_UNSET (-2)
639: #define REQ_NONE (-1)
640:
641: /* The maximum remaining length of subject we are prepared to search for a
642: req_byte match. */
643:
644: #define REQ_BYTE_MAX 1000
645:
646: /* Flags added to firstbyte or reqbyte; a "non-literal" item is either a
647: variable-length repeat, or anything other than literal characters. */
648:
649: #define REQ_CASELESS 0x0100 /* indicates caselessness */
650: #define REQ_VARY 0x0200 /* reqbyte followed non-literal item */
651:
652: /* Miscellaneous definitions. The #ifndef is to pacify compiler warnings in
653: environments where these macros are defined elsewhere. Unfortunately, there
654: is no way to do the same for the typedef. */
655:
/* Boolean type and its two values. */

typedef int BOOL;

/* FALSE and TRUE are guarded independently: an environment that already
defines only one of them then still gets the other, and neither is ever
redefined. */

#ifndef FALSE
#define FALSE   0
#endif

#ifndef TRUE
#define TRUE    1
#endif
662:
663: /* If PCRE is to support UTF-8 on EBCDIC platforms, we cannot use normal
664: character constants like '*' because the compiler would emit their EBCDIC code,
665: which is different from their ASCII/UTF-8 code. Instead we define macros for
666: the characters so that they always use the ASCII/UTF-8 code when UTF-8 support
667: is enabled. When UTF-8 support is not enabled, the definitions use character
668: literals. Both character and string versions of each character are needed, and
669: there are some longer strings as well.
670:
671: This means that, on EBCDIC platforms, the PCRE library can handle either
672: EBCDIC, or UTF-8, but not both. To support both in the same compiled library
673: would need different lookups depending on whether PCRE_UTF8 was set or not.
674: This would make it impossible to use characters in switch/case statements,
675: which would reduce performance. For a theoretical use (which nobody has asked
676: for) in a minority area (EBCDIC platforms), this is not sensible. Any
677: application that did need both could compile two versions of the library, using
678: macros to give the functions distinct names. */
679:
680: #ifndef SUPPORT_UTF8
681:
682: /* UTF-8 support is not enabled; use the platform-dependent character literals
683: so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */
684:
685: #define CHAR_HT '\t'
686: #define CHAR_VT '\v'
687: #define CHAR_FF '\f'
688: #define CHAR_CR '\r'
689: #define CHAR_NL '\n'
690: #define CHAR_BS '\b'
691: #define CHAR_BEL '\a'
692: #ifdef EBCDIC
693: #define CHAR_ESC '\047'
694: #define CHAR_DEL '\007'
695: #else
696: #define CHAR_ESC '\033'
697: #define CHAR_DEL '\177'
698: #endif
699:
700: #define CHAR_SPACE ' '
701: #define CHAR_EXCLAMATION_MARK '!'
702: #define CHAR_QUOTATION_MARK '"'
703: #define CHAR_NUMBER_SIGN '#'
704: #define CHAR_DOLLAR_SIGN '$'
705: #define CHAR_PERCENT_SIGN '%'
706: #define CHAR_AMPERSAND '&'
707: #define CHAR_APOSTROPHE '\''
708: #define CHAR_LEFT_PARENTHESIS '('
709: #define CHAR_RIGHT_PARENTHESIS ')'
710: #define CHAR_ASTERISK '*'
711: #define CHAR_PLUS '+'
712: #define CHAR_COMMA ','
713: #define CHAR_MINUS '-'
714: #define CHAR_DOT '.'
715: #define CHAR_SLASH '/'
716: #define CHAR_0 '0'
717: #define CHAR_1 '1'
718: #define CHAR_2 '2'
719: #define CHAR_3 '3'
720: #define CHAR_4 '4'
721: #define CHAR_5 '5'
722: #define CHAR_6 '6'
723: #define CHAR_7 '7'
724: #define CHAR_8 '8'
725: #define CHAR_9 '9'
726: #define CHAR_COLON ':'
727: #define CHAR_SEMICOLON ';'
728: #define CHAR_LESS_THAN_SIGN '<'
729: #define CHAR_EQUALS_SIGN '='
730: #define CHAR_GREATER_THAN_SIGN '>'
731: #define CHAR_QUESTION_MARK '?'
732: #define CHAR_COMMERCIAL_AT '@'
733: #define CHAR_A 'A'
734: #define CHAR_B 'B'
735: #define CHAR_C 'C'
736: #define CHAR_D 'D'
737: #define CHAR_E 'E'
738: #define CHAR_F 'F'
739: #define CHAR_G 'G'
740: #define CHAR_H 'H'
741: #define CHAR_I 'I'
742: #define CHAR_J 'J'
743: #define CHAR_K 'K'
744: #define CHAR_L 'L'
745: #define CHAR_M 'M'
746: #define CHAR_N 'N'
747: #define CHAR_O 'O'
748: #define CHAR_P 'P'
749: #define CHAR_Q 'Q'
750: #define CHAR_R 'R'
751: #define CHAR_S 'S'
752: #define CHAR_T 'T'
753: #define CHAR_U 'U'
754: #define CHAR_V 'V'
755: #define CHAR_W 'W'
756: #define CHAR_X 'X'
757: #define CHAR_Y 'Y'
758: #define CHAR_Z 'Z'
759: #define CHAR_LEFT_SQUARE_BRACKET '['
760: #define CHAR_BACKSLASH '\\'
761: #define CHAR_RIGHT_SQUARE_BRACKET ']'
762: #define CHAR_CIRCUMFLEX_ACCENT '^'
763: #define CHAR_UNDERSCORE '_'
764: #define CHAR_GRAVE_ACCENT '`'
765: #define CHAR_a 'a'
766: #define CHAR_b 'b'
767: #define CHAR_c 'c'
768: #define CHAR_d 'd'
769: #define CHAR_e 'e'
770: #define CHAR_f 'f'
771: #define CHAR_g 'g'
772: #define CHAR_h 'h'
773: #define CHAR_i 'i'
774: #define CHAR_j 'j'
775: #define CHAR_k 'k'
776: #define CHAR_l 'l'
777: #define CHAR_m 'm'
778: #define CHAR_n 'n'
779: #define CHAR_o 'o'
780: #define CHAR_p 'p'
781: #define CHAR_q 'q'
782: #define CHAR_r 'r'
783: #define CHAR_s 's'
784: #define CHAR_t 't'
785: #define CHAR_u 'u'
786: #define CHAR_v 'v'
787: #define CHAR_w 'w'
788: #define CHAR_x 'x'
789: #define CHAR_y 'y'
790: #define CHAR_z 'z'
791: #define CHAR_LEFT_CURLY_BRACKET '{'
792: #define CHAR_VERTICAL_LINE '|'
793: #define CHAR_RIGHT_CURLY_BRACKET '}'
794: #define CHAR_TILDE '~'
795:
796: #define STR_HT "\t"
797: #define STR_VT "\v"
798: #define STR_FF "\f"
799: #define STR_CR "\r"
800: #define STR_NL "\n"
801: #define STR_BS "\b"
802: #define STR_BEL "\a"
803: #ifdef EBCDIC
804: #define STR_ESC "\047"
805: #define STR_DEL "\007"
806: #else
807: #define STR_ESC "\033"
808: #define STR_DEL "\177"
809: #endif
810:
811: #define STR_SPACE " "
812: #define STR_EXCLAMATION_MARK "!"
813: #define STR_QUOTATION_MARK "\""
814: #define STR_NUMBER_SIGN "#"
815: #define STR_DOLLAR_SIGN "$"
816: #define STR_PERCENT_SIGN "%"
817: #define STR_AMPERSAND "&"
818: #define STR_APOSTROPHE "'"
819: #define STR_LEFT_PARENTHESIS "("
820: #define STR_RIGHT_PARENTHESIS ")"
821: #define STR_ASTERISK "*"
822: #define STR_PLUS "+"
823: #define STR_COMMA ","
824: #define STR_MINUS "-"
825: #define STR_DOT "."
826: #define STR_SLASH "/"
827: #define STR_0 "0"
828: #define STR_1 "1"
829: #define STR_2 "2"
830: #define STR_3 "3"
831: #define STR_4 "4"
832: #define STR_5 "5"
833: #define STR_6 "6"
834: #define STR_7 "7"
835: #define STR_8 "8"
836: #define STR_9 "9"
837: #define STR_COLON ":"
838: #define STR_SEMICOLON ";"
839: #define STR_LESS_THAN_SIGN "<"
840: #define STR_EQUALS_SIGN "="
841: #define STR_GREATER_THAN_SIGN ">"
842: #define STR_QUESTION_MARK "?"
843: #define STR_COMMERCIAL_AT "@"
844: #define STR_A "A"
845: #define STR_B "B"
846: #define STR_C "C"
847: #define STR_D "D"
848: #define STR_E "E"
849: #define STR_F "F"
850: #define STR_G "G"
851: #define STR_H "H"
852: #define STR_I "I"
853: #define STR_J "J"
854: #define STR_K "K"
855: #define STR_L "L"
856: #define STR_M "M"
857: #define STR_N "N"
858: #define STR_O "O"
859: #define STR_P "P"
860: #define STR_Q "Q"
861: #define STR_R "R"
862: #define STR_S "S"
863: #define STR_T "T"
864: #define STR_U "U"
865: #define STR_V "V"
866: #define STR_W "W"
867: #define STR_X "X"
868: #define STR_Y "Y"
869: #define STR_Z "Z"
870: #define STR_LEFT_SQUARE_BRACKET "["
871: #define STR_BACKSLASH "\\"
872: #define STR_RIGHT_SQUARE_BRACKET "]"
873: #define STR_CIRCUMFLEX_ACCENT "^"
874: #define STR_UNDERSCORE "_"
875: #define STR_GRAVE_ACCENT "`"
876: #define STR_a "a"
877: #define STR_b "b"
878: #define STR_c "c"
879: #define STR_d "d"
880: #define STR_e "e"
881: #define STR_f "f"
882: #define STR_g "g"
883: #define STR_h "h"
884: #define STR_i "i"
885: #define STR_j "j"
886: #define STR_k "k"
887: #define STR_l "l"
888: #define STR_m "m"
889: #define STR_n "n"
890: #define STR_o "o"
891: #define STR_p "p"
892: #define STR_q "q"
893: #define STR_r "r"
894: #define STR_s "s"
895: #define STR_t "t"
896: #define STR_u "u"
897: #define STR_v "v"
898: #define STR_w "w"
899: #define STR_x "x"
900: #define STR_y "y"
901: #define STR_z "z"
902: #define STR_LEFT_CURLY_BRACKET "{"
903: #define STR_VERTICAL_LINE "|"
904: #define STR_RIGHT_CURLY_BRACKET "}"
905: #define STR_TILDE "~"
906:
907: #define STRING_ACCEPT0 "ACCEPT\0"
908: #define STRING_COMMIT0 "COMMIT\0"
909: #define STRING_F0 "F\0"
910: #define STRING_FAIL0 "FAIL\0"
911: #define STRING_MARK0 "MARK\0"
912: #define STRING_PRUNE0 "PRUNE\0"
913: #define STRING_SKIP0 "SKIP\0"
914: #define STRING_THEN "THEN"
915:
916: #define STRING_alpha0 "alpha\0"
917: #define STRING_lower0 "lower\0"
918: #define STRING_upper0 "upper\0"
919: #define STRING_alnum0 "alnum\0"
920: #define STRING_ascii0 "ascii\0"
921: #define STRING_blank0 "blank\0"
922: #define STRING_cntrl0 "cntrl\0"
923: #define STRING_digit0 "digit\0"
924: #define STRING_graph0 "graph\0"
925: #define STRING_print0 "print\0"
926: #define STRING_punct0 "punct\0"
927: #define STRING_space0 "space\0"
928: #define STRING_word0 "word\0"
929: #define STRING_xdigit "xdigit"
930:
931: #define STRING_DEFINE "DEFINE"
932:
933: #define STRING_CR_RIGHTPAR "CR)"
934: #define STRING_LF_RIGHTPAR "LF)"
935: #define STRING_CRLF_RIGHTPAR "CRLF)"
936: #define STRING_ANY_RIGHTPAR "ANY)"
937: #define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)"
938: #define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)"
939: #define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)"
940: #define STRING_UTF8_RIGHTPAR "UTF8)"
941: #define STRING_UCP_RIGHTPAR "UCP)"
942: #define STRING_NO_START_OPT_RIGHTPAR "NO_START_OPT)"
943:
944: #else /* SUPPORT_UTF8 */
945:
946: /* UTF-8 support is enabled; always use UTF-8 (=ASCII) character codes. This
947: works in both modes on non-EBCDIC platforms, and on EBCDIC platforms in UTF-8 mode
948: only. */
949:
950: #define CHAR_HT '\011'
951: #define CHAR_VT '\013'
952: #define CHAR_FF '\014'
953: #define CHAR_CR '\015'
954: #define CHAR_NL '\012'
955: #define CHAR_BS '\010'
956: #define CHAR_BEL '\007'
957: #define CHAR_ESC '\033'
958: #define CHAR_DEL '\177'
959:
960: #define CHAR_SPACE '\040'
961: #define CHAR_EXCLAMATION_MARK '\041'
962: #define CHAR_QUOTATION_MARK '\042'
963: #define CHAR_NUMBER_SIGN '\043'
964: #define CHAR_DOLLAR_SIGN '\044'
965: #define CHAR_PERCENT_SIGN '\045'
966: #define CHAR_AMPERSAND '\046'
967: #define CHAR_APOSTROPHE '\047'
968: #define CHAR_LEFT_PARENTHESIS '\050'
969: #define CHAR_RIGHT_PARENTHESIS '\051'
970: #define CHAR_ASTERISK '\052'
971: #define CHAR_PLUS '\053'
972: #define CHAR_COMMA '\054'
973: #define CHAR_MINUS '\055'
974: #define CHAR_DOT '\056'
975: #define CHAR_SLASH '\057'
976: #define CHAR_0 '\060'
977: #define CHAR_1 '\061'
978: #define CHAR_2 '\062'
979: #define CHAR_3 '\063'
980: #define CHAR_4 '\064'
981: #define CHAR_5 '\065'
982: #define CHAR_6 '\066'
983: #define CHAR_7 '\067'
984: #define CHAR_8 '\070'
985: #define CHAR_9 '\071'
986: #define CHAR_COLON '\072'
987: #define CHAR_SEMICOLON '\073'
988: #define CHAR_LESS_THAN_SIGN '\074'
989: #define CHAR_EQUALS_SIGN '\075'
990: #define CHAR_GREATER_THAN_SIGN '\076'
991: #define CHAR_QUESTION_MARK '\077'
992: #define CHAR_COMMERCIAL_AT '\100'
993: #define CHAR_A '\101'
994: #define CHAR_B '\102'
995: #define CHAR_C '\103'
996: #define CHAR_D '\104'
997: #define CHAR_E '\105'
998: #define CHAR_F '\106'
999: #define CHAR_G '\107'
1000: #define CHAR_H '\110'
1001: #define CHAR_I '\111'
1002: #define CHAR_J '\112'
1003: #define CHAR_K '\113'
1004: #define CHAR_L '\114'
1005: #define CHAR_M '\115'
1006: #define CHAR_N '\116'
1007: #define CHAR_O '\117'
1008: #define CHAR_P '\120'
1009: #define CHAR_Q '\121'
1010: #define CHAR_R '\122'
1011: #define CHAR_S '\123'
1012: #define CHAR_T '\124'
1013: #define CHAR_U '\125'
1014: #define CHAR_V '\126'
1015: #define CHAR_W '\127'
1016: #define CHAR_X '\130'
1017: #define CHAR_Y '\131'
1018: #define CHAR_Z '\132'
1019: #define CHAR_LEFT_SQUARE_BRACKET '\133'
1020: #define CHAR_BACKSLASH '\134'
1021: #define CHAR_RIGHT_SQUARE_BRACKET '\135'
1022: #define CHAR_CIRCUMFLEX_ACCENT '\136'
1023: #define CHAR_UNDERSCORE '\137'
1024: #define CHAR_GRAVE_ACCENT '\140'
1025: #define CHAR_a '\141'
1026: #define CHAR_b '\142'
1027: #define CHAR_c '\143'
1028: #define CHAR_d '\144'
1029: #define CHAR_e '\145'
1030: #define CHAR_f '\146'
1031: #define CHAR_g '\147'
1032: #define CHAR_h '\150'
1033: #define CHAR_i '\151'
1034: #define CHAR_j '\152'
1035: #define CHAR_k '\153'
1036: #define CHAR_l '\154'
1037: #define CHAR_m '\155'
1038: #define CHAR_n '\156'
1039: #define CHAR_o '\157'
1040: #define CHAR_p '\160'
1041: #define CHAR_q '\161'
1042: #define CHAR_r '\162'
1043: #define CHAR_s '\163'
1044: #define CHAR_t '\164'
1045: #define CHAR_u '\165'
1046: #define CHAR_v '\166'
1047: #define CHAR_w '\167'
1048: #define CHAR_x '\170'
1049: #define CHAR_y '\171'
1050: #define CHAR_z '\172'
1051: #define CHAR_LEFT_CURLY_BRACKET '\173'
1052: #define CHAR_VERTICAL_LINE '\174'
1053: #define CHAR_RIGHT_CURLY_BRACKET '\175'
1054: #define CHAR_TILDE '\176'
1055:
1056: #define STR_HT "\011"
1057: #define STR_VT "\013"
1058: #define STR_FF "\014"
1059: #define STR_CR "\015"
1060: #define STR_NL "\012"
1061: #define STR_BS "\010"
1062: #define STR_BEL "\007"
1063: #define STR_ESC "\033"
1064: #define STR_DEL "\177"
1065:
1066: #define STR_SPACE "\040"
1067: #define STR_EXCLAMATION_MARK "\041"
1068: #define STR_QUOTATION_MARK "\042"
1069: #define STR_NUMBER_SIGN "\043"
1070: #define STR_DOLLAR_SIGN "\044"
1071: #define STR_PERCENT_SIGN "\045"
1072: #define STR_AMPERSAND "\046"
1073: #define STR_APOSTROPHE "\047"
1074: #define STR_LEFT_PARENTHESIS "\050"
1075: #define STR_RIGHT_PARENTHESIS "\051"
1076: #define STR_ASTERISK "\052"
1077: #define STR_PLUS "\053"
1078: #define STR_COMMA "\054"
1079: #define STR_MINUS "\055"
1080: #define STR_DOT "\056"
1081: #define STR_SLASH "\057"
1082: #define STR_0 "\060"
1083: #define STR_1 "\061"
1084: #define STR_2 "\062"
1085: #define STR_3 "\063"
1086: #define STR_4 "\064"
1087: #define STR_5 "\065"
1088: #define STR_6 "\066"
1089: #define STR_7 "\067"
1090: #define STR_8 "\070"
1091: #define STR_9 "\071"
1092: #define STR_COLON "\072"
1093: #define STR_SEMICOLON "\073"
1094: #define STR_LESS_THAN_SIGN "\074"
1095: #define STR_EQUALS_SIGN "\075"
1096: #define STR_GREATER_THAN_SIGN "\076"
1097: #define STR_QUESTION_MARK "\077"
1098: #define STR_COMMERCIAL_AT "\100"
1099: #define STR_A "\101"
1100: #define STR_B "\102"
1101: #define STR_C "\103"
1102: #define STR_D "\104"
1103: #define STR_E "\105"
1104: #define STR_F "\106"
1105: #define STR_G "\107"
1106: #define STR_H "\110"
1107: #define STR_I "\111"
1108: #define STR_J "\112"
1109: #define STR_K "\113"
1110: #define STR_L "\114"
1111: #define STR_M "\115"
1112: #define STR_N "\116"
1113: #define STR_O "\117"
1114: #define STR_P "\120"
1115: #define STR_Q "\121"
1116: #define STR_R "\122"
1117: #define STR_S "\123"
1118: #define STR_T "\124"
1119: #define STR_U "\125"
1120: #define STR_V "\126"
1121: #define STR_W "\127"
1122: #define STR_X "\130"
1123: #define STR_Y "\131"
1124: #define STR_Z "\132"
1125: #define STR_LEFT_SQUARE_BRACKET "\133"
1126: #define STR_BACKSLASH "\134"
1127: #define STR_RIGHT_SQUARE_BRACKET "\135"
1128: #define STR_CIRCUMFLEX_ACCENT "\136"
1129: #define STR_UNDERSCORE "\137"
1130: #define STR_GRAVE_ACCENT "\140"
1131: #define STR_a "\141"
1132: #define STR_b "\142"
1133: #define STR_c "\143"
1134: #define STR_d "\144"
1135: #define STR_e "\145"
1136: #define STR_f "\146"
1137: #define STR_g "\147"
1138: #define STR_h "\150"
1139: #define STR_i "\151"
1140: #define STR_j "\152"
1141: #define STR_k "\153"
1142: #define STR_l "\154"
1143: #define STR_m "\155"
1144: #define STR_n "\156"
1145: #define STR_o "\157"
1146: #define STR_p "\160"
1147: #define STR_q "\161"
1148: #define STR_r "\162"
1149: #define STR_s "\163"
1150: #define STR_t "\164"
1151: #define STR_u "\165"
1152: #define STR_v "\166"
1153: #define STR_w "\167"
1154: #define STR_x "\170"
1155: #define STR_y "\171"
1156: #define STR_z "\172"
1157: #define STR_LEFT_CURLY_BRACKET "\173"
1158: #define STR_VERTICAL_LINE "\174"
1159: #define STR_RIGHT_CURLY_BRACKET "\175"
1160: #define STR_TILDE "\176"
1161:
1162: #define STRING_ACCEPT0 STR_A STR_C STR_C STR_E STR_P STR_T "\0"
1163: #define STRING_COMMIT0 STR_C STR_O STR_M STR_M STR_I STR_T "\0"
1164: #define STRING_F0 STR_F "\0"
1165: #define STRING_FAIL0 STR_F STR_A STR_I STR_L "\0"
1166: #define STRING_MARK0 STR_M STR_A STR_R STR_K "\0"
1167: #define STRING_PRUNE0 STR_P STR_R STR_U STR_N STR_E "\0"
1168: #define STRING_SKIP0 STR_S STR_K STR_I STR_P "\0"
1169: #define STRING_THEN STR_T STR_H STR_E STR_N
1170:
1171: #define STRING_alpha0 STR_a STR_l STR_p STR_h STR_a "\0"
1172: #define STRING_lower0 STR_l STR_o STR_w STR_e STR_r "\0"
1173: #define STRING_upper0 STR_u STR_p STR_p STR_e STR_r "\0"
1174: #define STRING_alnum0 STR_a STR_l STR_n STR_u STR_m "\0"
1175: #define STRING_ascii0 STR_a STR_s STR_c STR_i STR_i "\0"
1176: #define STRING_blank0 STR_b STR_l STR_a STR_n STR_k "\0"
1177: #define STRING_cntrl0 STR_c STR_n STR_t STR_r STR_l "\0"
1178: #define STRING_digit0 STR_d STR_i STR_g STR_i STR_t "\0"
1179: #define STRING_graph0 STR_g STR_r STR_a STR_p STR_h "\0"
1180: #define STRING_print0 STR_p STR_r STR_i STR_n STR_t "\0"
1181: #define STRING_punct0 STR_p STR_u STR_n STR_c STR_t "\0"
1182: #define STRING_space0 STR_s STR_p STR_a STR_c STR_e "\0"
1183: #define STRING_word0 STR_w STR_o STR_r STR_d "\0"
1184: #define STRING_xdigit STR_x STR_d STR_i STR_g STR_i STR_t
1185:
1186: #define STRING_DEFINE STR_D STR_E STR_F STR_I STR_N STR_E
1187:
1188: #define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS
1189: #define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS
1190: #define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
1191: #define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS
1192: #define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
1193: #define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
1194: #define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS
1195: #define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS
1196: #define STRING_UCP_RIGHTPAR STR_U STR_C STR_P STR_RIGHT_PARENTHESIS
1197: #define STRING_NO_START_OPT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS
1198:
1199: #endif /* SUPPORT_UTF8 */
1200:
1201: /* Escape items that are just an encoding of a particular data value. */
1202:
1203: #ifndef ESC_e
1204: #define ESC_e CHAR_ESC
1205: #endif
1206:
1207: #ifndef ESC_f
1208: #define ESC_f CHAR_FF
1209: #endif
1210:
1211: #ifndef ESC_n
1212: #define ESC_n CHAR_NL
1213: #endif
1214:
1215: #ifndef ESC_r
1216: #define ESC_r CHAR_CR
1217: #endif
1218:
1219: /* We can't officially use ESC_t because it is a POSIX reserved identifier
1220: (presumably because of all the others like size_t). */
1221:
1222: #ifndef ESC_tee
1223: #define ESC_tee CHAR_HT
1224: #endif
1225:
1226: /* Codes for different types of Unicode property */
1227:
1228: #define PT_ANY 0 /* Any property - matches all chars */
1229: #define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */
1230: #define PT_GC 2 /* Specified general characteristic (e.g. L) */
1231: #define PT_PC 3 /* Specified particular characteristic (e.g. Lu) */
1232: #define PT_SC 4 /* Script (e.g. Han) */
1233: #define PT_ALNUM 5 /* Alphanumeric - the union of L and N */
1234: #define PT_SPACE 6 /* Perl space - Z plus 9,10,12,13 (HT, NL, FF, CR) */
1235: #define PT_PXSPACE 7 /* POSIX space - Z plus 9,10,11,12,13 (Perl space plus VT) */
1236: #define PT_WORD 8 /* Word - L plus N plus underscore */
1237:
1238: /* Flag bits and data types for the extended class (OP_XCLASS) for classes that
1239: contain UTF-8 characters with values greater than 255. */
1240:
1241: #define XCL_NOT 0x01 /* Flag: this is a negative class */
1242: #define XCL_MAP 0x02 /* Flag: a 32-byte map is present */
1243:
1244: #define XCL_END 0 /* Marks end of individual items */
1245: #define XCL_SINGLE 1 /* Single item (one multibyte char) follows */
1246: #define XCL_RANGE 2 /* A range (two multibyte chars) follows */
1247: #define XCL_PROP 3 /* Unicode property (2-byte property code follows) */
1248: #define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */
1249:
1250: /* These are escaped items that aren't just an encoding of a particular data
1251: value such as \n. They must have non-zero values, as check_escape() returns
1252: their negation. Also, they must appear in the same order as in the opcode
1253: definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it
1254: corresponds to "." in DOTALL mode rather than an escape sequence. It is also
1255: used for [^] in JavaScript compatibility mode, and for \C in non-utf8 mode. In
1256: non-DOTALL mode, "." behaves like \N.
1257:
1258: The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.
1259: when PCRE_UCP is set, when replacement of \d etc by \p sequences is required.
1260: They must be contiguous, and remain in order so that the replacements can be
1261: looked up from a table.
1262:
1263: The final escape must be ESC_REF as subsequent values are used for
1264: backreferences (\1, \2, \3, etc). There are two tests in the code for an escape
1265: greater than ESC_b and less than ESC_Z to detect the types that may be
1266: repeated. These are the types that consume characters. If any new escapes are
1267: put in between that don't consume a character, that code will have to change.
1268: */
1269:
1270: enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, /* 1-9: same values as OP_SOD..OP_WHITESPACE */
1271: ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H, /* 10-18: ESC_dum (13) is the placeholder for OP_ALLANY */
1272: ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, /* 19-24: ESC_z (24) lines up with OP_EOD */
1273: ESC_E, ESC_Q, ESC_g, ESC_k, /* 25-28: past ESC_z, so not tied to the opcode order */
1274: ESC_DU, ESC_du, ESC_SU, ESC_su, ESC_WU, ESC_wu, /* 29-34: PCRE_UCP replacements; contiguous, looked up from a table */
1275: ESC_REF }; /* 35: must stay last; larger values are used for back references */
1276:
1277: /* Opcode table: Starting from 1 (i.e. after OP_END), the values up to
1278: OP_EOD must correspond in order to the list of escapes immediately above.
1279:
1280: *** NOTE NOTE NOTE *** Whenever this list is updated, the two macro definitions
1281: that follow must also be updated to match. There are also tables called
1282: "coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */
1283:
1284: enum {
1285: OP_END, /* 0 End of pattern */
1286:
1287: /* Values corresponding to backslashed metacharacters */
1288:
1289: OP_SOD, /* 1 Start of data: \A */
1290: OP_SOM, /* 2 Start of match (subject + offset): \G */
1291: OP_SET_SOM, /* 3 Set start of match (\K) */
1292: OP_NOT_WORD_BOUNDARY, /* 4 \B */
1293: OP_WORD_BOUNDARY, /* 5 \b */
1294: OP_NOT_DIGIT, /* 6 \D */
1295: OP_DIGIT, /* 7 \d */
1296: OP_NOT_WHITESPACE, /* 8 \S */
1297: OP_WHITESPACE, /* 9 \s */
1298: OP_NOT_WORDCHAR, /* 10 \W */
1299: OP_WORDCHAR, /* 11 \w */
1300:
1301: OP_ANY, /* 12 Match any character except newline */
1302: OP_ALLANY, /* 13 Match any character */
1303: OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */
1304: OP_NOTPROP, /* 15 \P (not Unicode property) */
1305: OP_PROP, /* 16 \p (Unicode property) */
1306: OP_ANYNL, /* 17 \R (any newline sequence) */
1307: OP_NOT_HSPACE, /* 18 \H (not horizontal whitespace) */
1308: OP_HSPACE, /* 19 \h (horizontal whitespace) */
1309: OP_NOT_VSPACE, /* 20 \V (not vertical whitespace) */
1310: OP_VSPACE, /* 21 \v (vertical whitespace) */
1311: OP_EXTUNI, /* 22 \X (extended Unicode sequence) */
1312: OP_EODN, /* 23 End of data or \n at end of data: \Z. */
1313: OP_EOD, /* 24 End of data: \z */
1314:
1315: OP_CIRC, /* 25 Start of line - not multiline */
1316: OP_CIRCM, /* 26 Start of line - multiline */
1317: OP_DOLL, /* 27 End of line - not multiline */
1318: OP_DOLLM, /* 28 End of line - multiline */
1319: OP_CHAR, /* 29 Match one character, casefully */
1320: OP_CHARI, /* 30 Match one character, caselessly */
1321: OP_NOT, /* 31 Match one character, not the given one, casefully */
1322: OP_NOTI, /* 32 Match one character, not the given one, caselessly */
1323:
1324: /* The following sets of 13 opcodes must always be kept in step because
1325: the offset from the first one is used to generate the others. */
1326:
1327: /**** Single characters, caseful, must precede the caseless ones ****/
1328:
1329: OP_STAR, /* 33 The maximizing and minimizing versions of */
1330: OP_MINSTAR, /* 34 these six opcodes must come in pairs, with */
1331: OP_PLUS, /* 35 the minimizing one second. */
1332: OP_MINPLUS, /* 36 */
1333: OP_QUERY, /* 37 */
1334: OP_MINQUERY, /* 38 */
1335:
1336: OP_UPTO, /* 39 From 0 to n matches of one character, caseful*/
1337: OP_MINUPTO, /* 40 */
1338: OP_EXACT, /* 41 Exactly n matches */
1339:
1340: OP_POSSTAR, /* 42 Possessified star, caseful */
1341: OP_POSPLUS, /* 43 Possessified plus, caseful */
1342: OP_POSQUERY, /* 44 Possessified query, caseful */
1343: OP_POSUPTO, /* 45 Possessified upto, caseful */
1344:
1345: /**** Single characters, caseless, must follow the caseful ones */
1346:
1347: OP_STARI, /* 46 */
1348: OP_MINSTARI, /* 47 */
1349: OP_PLUSI, /* 48 */
1350: OP_MINPLUSI, /* 49 */
1351: OP_QUERYI, /* 50 */
1352: OP_MINQUERYI, /* 51 */
1353:
1354: OP_UPTOI, /* 52 From 0 to n matches of one character, caseless */
1355: OP_MINUPTOI, /* 53 */
1356: OP_EXACTI, /* 54 */
1357:
1358: OP_POSSTARI, /* 55 Possessified star, caseless */
1359: OP_POSPLUSI, /* 56 Possessified plus, caseless */
1360: OP_POSQUERYI, /* 57 Possessified query, caseless */
1361: OP_POSUPTOI, /* 58 Possessified upto, caseless */
1362:
1363: /**** The negated ones must follow the non-negated ones, and match them ****/
1364: /**** Negated single character, caseful; must precede the caseless ones ****/
1365:
1366: OP_NOTSTAR, /* 59 The maximizing and minimizing versions of */
1367: OP_NOTMINSTAR, /* 60 these six opcodes must come in pairs, with */
1368: OP_NOTPLUS, /* 61 the minimizing one second. They must be in */
1369: OP_NOTMINPLUS, /* 62 exactly the same order as those above. */
1370: OP_NOTQUERY, /* 63 */
1371: OP_NOTMINQUERY, /* 64 */
1372:
1373: OP_NOTUPTO, /* 65 From 0 to n matches, caseful */
1374: OP_NOTMINUPTO, /* 66 */
1375: OP_NOTEXACT, /* 67 Exactly n matches */
1376:
1377: OP_NOTPOSSTAR, /* 68 Possessified versions, caseful */
1378: OP_NOTPOSPLUS, /* 69 */
1379: OP_NOTPOSQUERY, /* 70 */
1380: OP_NOTPOSUPTO, /* 71 */
1381:
1382: /**** Negated single character, caseless; must follow the caseful ones ****/
1383:
1384: OP_NOTSTARI, /* 72 */
1385: OP_NOTMINSTARI, /* 73 */
1386: OP_NOTPLUSI, /* 74 */
1387: OP_NOTMINPLUSI, /* 75 */
1388: OP_NOTQUERYI, /* 76 */
1389: OP_NOTMINQUERYI, /* 77 */
1390:
1391: OP_NOTUPTOI, /* 78 From 0 to n matches, caseless */
1392: OP_NOTMINUPTOI, /* 79 */
1393: OP_NOTEXACTI, /* 80 Exactly n matches */
1394:
1395: OP_NOTPOSSTARI, /* 81 Possessified versions, caseless */
1396: OP_NOTPOSPLUSI, /* 82 */
1397: OP_NOTPOSQUERYI, /* 83 */
1398: OP_NOTPOSUPTOI, /* 84 */
1399:
1400: /**** Character types ****/
1401:
1402: OP_TYPESTAR, /* 85 The maximizing and minimizing versions of */
1403: OP_TYPEMINSTAR, /* 86 these six opcodes must come in pairs, with */
1404: OP_TYPEPLUS, /* 87 the minimizing one second. These codes must */
1405: OP_TYPEMINPLUS, /* 88 be in exactly the same order as those above. */
1406: OP_TYPEQUERY, /* 89 */
1407: OP_TYPEMINQUERY, /* 90 */
1408:
1409: OP_TYPEUPTO, /* 91 From 0 to n matches */
1410: OP_TYPEMINUPTO, /* 92 */
1411: OP_TYPEEXACT, /* 93 Exactly n matches */
1412:
1413: OP_TYPEPOSSTAR, /* 94 Possessified versions */
1414: OP_TYPEPOSPLUS, /* 95 */
1415: OP_TYPEPOSQUERY, /* 96 */
1416: OP_TYPEPOSUPTO, /* 97 */
1417:
1418: /* These are used for character classes and back references; only the
1419: first six are the same as the sets above. */
1420:
1421: OP_CRSTAR, /* 98 The maximizing and minimizing versions of */
1422: OP_CRMINSTAR, /* 99 all these opcodes must come in pairs, with */
1423: OP_CRPLUS, /* 100 the minimizing one second. These codes must */
1424: OP_CRMINPLUS, /* 101 be in exactly the same order as those above. */
1425: OP_CRQUERY, /* 102 */
1426: OP_CRMINQUERY, /* 103 */
1427:
1428: OP_CRRANGE, /* 104 These are different to the three sets above. */
1429: OP_CRMINRANGE, /* 105 */
1430:
1431: /* End of quantifier opcodes */
1432:
1433: OP_CLASS, /* 106 Match a character class, chars < 256 only */
1434: OP_NCLASS, /* 107 Same, but the bitmap was created from a negative
1435: class - the difference is relevant only when a
1436: UTF-8 character > 255 is encountered. */
1437: OP_XCLASS, /* 108 Extended class for handling UTF-8 chars within the
1438: class. This does both positive and negative. */
1439: OP_REF, /* 109 Match a back reference, casefully */
1440: OP_REFI, /* 110 Match a back reference, caselessly */
1441: OP_RECURSE, /* 111 Match a numbered subpattern (possibly recursive) */
1442: OP_CALLOUT, /* 112 Call out to external function if provided */
1443:
1444: OP_ALT, /* 113 Start of alternation */
1445: OP_KET, /* 114 End of group that doesn't have an unbounded repeat */
1446: OP_KETRMAX, /* 115 These two must remain together and in this */
1447: OP_KETRMIN, /* 116 order. They are for groups that repeat for ever. */
1448: OP_KETRPOS, /* 117 Possessive unlimited repeat. */
1449:
1450: /* The assertions must come before BRA, CBRA, ONCE, and COND, and the four
1451: asserts must remain in order. */
1452:
1453: OP_REVERSE, /* 118 Move pointer back - used in lookbehind assertions */
1454: OP_ASSERT, /* 119 Positive lookahead */
1455: OP_ASSERT_NOT, /* 120 Negative lookahead */
1456: OP_ASSERTBACK, /* 121 Positive lookbehind */
1457: OP_ASSERTBACK_NOT, /* 122 Negative lookbehind */
1458:
1459: /* ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come immediately
1460: after the assertions, with ONCE first, as there's a test for >= ONCE for a
1461: subpattern that isn't an assertion. The POS versions must immediately follow
1462: the non-POS versions in each case. */
1463:
1464: OP_ONCE, /* 123 Atomic group, contains captures */
1465: OP_ONCE_NC, /* 124 Atomic group containing no captures */
1466: OP_BRA, /* 125 Start of non-capturing bracket */
1467: OP_BRAPOS, /* 126 Ditto, with unlimited, possessive repeat */
1468: OP_CBRA, /* 127 Start of capturing bracket */
1469: OP_CBRAPOS, /* 128 Ditto, with unlimited, possessive repeat */
1470: OP_COND, /* 129 Conditional group */
1471:
1472: /* These five must follow the previous five, in the same order. There's a
1473: check for >= SBRA to distinguish the two sets. */
1474:
1475: OP_SBRA, /* 130 Start of non-capturing bracket, check empty */
1476: OP_SBRAPOS, /* 131 Ditto, with unlimited, possessive repeat */
1477: OP_SCBRA, /* 132 Start of capturing bracket, check empty */
1478: OP_SCBRAPOS, /* 133 Ditto, with unlimited, possessive repeat */
1479: OP_SCOND, /* 134 Conditional group, check empty */
1480:
1481: /* The next two pairs must (respectively) be kept together. */
1482:
1483: OP_CREF, /* 135 Used to hold a capture number as condition */
1484: OP_NCREF, /* 136 Same, but generated by a name reference*/
1485: OP_RREF, /* 137 Used to hold a recursion number as condition */
1486: OP_NRREF, /* 138 Same, but generated by a name reference*/
1487: OP_DEF, /* 139 The DEFINE condition */
1488:
1489: OP_BRAZERO, /* 140 These two must remain together and in this */
1490: OP_BRAMINZERO, /* 141 order. */
1491: OP_BRAPOSZERO, /* 142 */
1492:
1493: /* These are backtracking control verbs */
1494:
1495: OP_MARK, /* 143 always has an argument */
1496: OP_PRUNE, /* 144 */
1497: OP_PRUNE_ARG, /* 145 same, but with argument */
1498: OP_SKIP, /* 146 */
1499: OP_SKIP_ARG, /* 147 same, but with argument */
1500: OP_THEN, /* 148 */
1501: OP_THEN_ARG, /* 149 same, but with argument */
1502: OP_COMMIT, /* 150 */
1503:
1504: /* These are forced failure and success verbs */
1505:
1506: OP_FAIL, /* 151 */
1507: OP_ACCEPT, /* 152 */
1508: OP_ASSERT_ACCEPT, /* 153 Used inside assertions */
1509: OP_CLOSE, /* 154 Used before OP_ACCEPT to close open captures */
1510:
1511: /* This is used to skip a subpattern with a {0} quantifier */
1512:
1513: OP_SKIPZERO, /* 155 */
1514:
1515: /* This is not an opcode, but is used to check that tables indexed by opcode
1516: are the correct length, in order to catch updating errors - there have been
1517: some in the past. */
1518:
1519: OP_TABLE_LENGTH
1520: };
1521:
1522: /* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro
1523: definitions that follow must also be updated to match. There are also tables
1524: called "coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */
1525:
1526:
1527: /* This macro defines textual names for all the opcodes. These are used only
1528: for debugging, and some of them are only partial names. The macro is referenced
1529: only in pcre_printint.c, which fills out the full names in many cases (and in
1530: some cases doesn't actually use these names at all). */
1531:
1532: #define OP_NAME_LIST \
1533: "End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d", /* 0-7 OP_END..OP_DIGIT */ \
1534: "\\S", "\\s", "\\W", "\\w", "Any", "AllAny", "Anybyte", /* 8-14 OP_NOT_WHITESPACE..OP_ANYBYTE */ \
1535: "notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v", /* 15-21 OP_NOTPROP..OP_VSPACE */ \
1536: "extuni", "\\Z", "\\z", /* 22-24 OP_EXTUNI..OP_EOD */ \
1537: "^", "^", "$", "$", "char", "chari", "not", "noti", /* 25-32 OP_CIRC..OP_NOTI */ \
1538: "*", "*?", "+", "+?", "?", "??", /* 33-38 OP_STAR..OP_MINQUERY */ \
1539: "{", "{", "{", /* 39-41 OP_UPTO..OP_EXACT */ \
1540: "*+","++", "?+", "{", /* 42-45 OP_POSSTAR..OP_POSUPTO */ \
1541: "*", "*?", "+", "+?", "?", "??", /* 46-51 OP_STARI..OP_MINQUERYI */ \
1542: "{", "{", "{", /* 52-54 OP_UPTOI..OP_EXACTI */ \
1543: "*+","++", "?+", "{", /* 55-58 OP_POSSTARI..OP_POSUPTOI */ \
1544: "*", "*?", "+", "+?", "?", "??", /* 59-64 OP_NOTSTAR..OP_NOTMINQUERY */ \
1545: "{", "{", "{", /* 65-67 OP_NOTUPTO..OP_NOTEXACT */ \
1546: "*+","++", "?+", "{", /* 68-71 OP_NOTPOSSTAR..OP_NOTPOSUPTO */ \
1547: "*", "*?", "+", "+?", "?", "??", /* 72-77 OP_NOTSTARI..OP_NOTMINQUERYI */ \
1548: "{", "{", "{", /* 78-80 OP_NOTUPTOI..OP_NOTEXACTI */ \
1549: "*+","++", "?+", "{", /* 81-84 OP_NOTPOSSTARI..OP_NOTPOSUPTOI */ \
1550: "*", "*?", "+", "+?", "?", "??", "{", "{", "{", /* 85-93 OP_TYPESTAR..OP_TYPEEXACT */ \
1551: "*+","++", "?+", "{", /* 94-97 OP_TYPEPOSSTAR..OP_TYPEPOSUPTO */ \
1552: "*", "*?", "+", "+?", "?", "??", "{", "{", /* 98-105 OP_CRSTAR..OP_CRMINRANGE */ \
1553: "class", "nclass", "xclass", "Ref", "Refi", /* 106-110 OP_CLASS..OP_REFI */ \
1554: "Recurse", "Callout", /* 111-112 OP_RECURSE, OP_CALLOUT */ \
1555: "Alt", "Ket", "KetRmax", "KetRmin", "KetRpos", /* 113-117 OP_ALT..OP_KETRPOS */ \
1556: "Reverse", "Assert", "Assert not", "AssertB", "AssertB not", /* 118-122 OP_REVERSE..OP_ASSERTBACK_NOT */ \
1557: "Once", "Once_NC", /* 123-124 OP_ONCE, OP_ONCE_NC */ \
1558: "Bra", "BraPos", "CBra", "CBraPos", /* 125-128 OP_BRA..OP_CBRAPOS */ \
1559: "Cond", /* 129 OP_COND */ \
1560: "SBra", "SBraPos", "SCBra", "SCBraPos", /* 130-133 OP_SBRA..OP_SCBRAPOS */ \
1561: "SCond", /* 134 OP_SCOND */ \
1562: "Cond ref", "Cond nref", "Cond rec", "Cond nrec", "Cond def", /* 135-139 OP_CREF..OP_DEF */ \
1563: "Brazero", "Braminzero", "Braposzero", /* 140-142 OP_BRAZERO..OP_BRAPOSZERO */ \
1564: "*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", /* 143-147 OP_MARK..OP_SKIP_ARG */ \
1565: "*THEN", "*THEN", "*COMMIT", "*FAIL", /* 148-151 OP_THEN..OP_FAIL */ \
1566: "*ACCEPT", "*ASSERT_ACCEPT", /* 152-153 OP_ACCEPT, OP_ASSERT_ACCEPT */ \
1567: "Close", "Skip zero" /* 154-155 OP_CLOSE, OP_SKIPZERO */
1568:
1569:
1570: /* This macro defines the length of fixed length operations in the compiled
1571: regex. The lengths are used when searching for specific things, and also in the
1572: debugging printing of a compiled regex. We use a macro so that it can be
1573: defined close to the definitions of the opcodes themselves.
1574:
1575: As things have been extended, some of these are no longer fixed lengths, but are
1576: minima instead. For example, the length of a single-character repeat may vary
1577: in UTF-8 mode. The code that uses this table must know about such things. */
1578:
1579: #define OP_LENGTHS \
1580: 1, /* End */ \
1581: 1, 1, 1, 1, 1, /* \A, \G, \K, \B, \b */ \
1582: 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ \
1583: 1, 1, 1, /* Any, AllAny, Anybyte */ \
1584: 3, 3, /* \P, \p */ \
1585: 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ \
1586: 1, /* \X */ \
1587: 1, 1, 1, 1, 1, 1, /* \Z, \z, ^, ^M, $, $M */ \
1588: 2, /* Char - the minimum length */ \
1589: 2, /* Chari - the minimum length */ \
1590: 2, /* not */ \
1591: 2, /* noti */ \
1592: /* Positive single-char repeats - these are minima in UTF-8 mode */ \
1593: 2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? */ \
1594: 4, 4, 4, /* upto, minupto, exact */ \
1595: 2, 2, 2, 4, /* *+, ++, ?+, upto+ */ \
1596: 2, 2, 2, 2, 2, 2, /* *I, *?I, +I, +?I, ?I, ??I */ \
1597: 4, 4, 4, /* upto I, minupto I, exact I */ \
1598: 2, 2, 2, 4, /* *+I, ++I, ?+I, upto+I */ \
1599: /* Negative single-char repeats - only for chars < 256 */ \
1600: 2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \
1601: 4, 4, 4, /* NOT upto, minupto, exact */ \
1602: 2, 2, 2, 4, /* Possessive NOT *, +, ?, upto */ \
1603: 2, 2, 2, 2, 2, 2, /* NOT *I, *?I, +I, +?I, ?I, ??I */ \
1604: 4, 4, 4, /* NOT upto I, minupto I, exact I */ \
1605: 2, 2, 2, 4, /* Possessive NOT *I, +I, ?I, upto I */ \
1606: /* Positive type repeats */ \
1607: 2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \
1608: 4, 4, 4, /* Type upto, minupto, exact */ \
1609: 2, 2, 2, 4, /* Possessive *+, ++, ?+, upto+ */ \
1610: /* Character class & ref repeats */ \
1611: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \
1612: 5, 5, /* CRRANGE, CRMINRANGE */ \
1613: 33, /* CLASS */ \
1614: 33, /* NCLASS */ \
1615: 0, /* XCLASS - variable length */ \
1616: 3, /* REF */ \
1617: 3, /* REFI */ \
1618: 1+LINK_SIZE, /* RECURSE */ \
1619: 2+2*LINK_SIZE, /* CALLOUT */ \
1620: 1+LINK_SIZE, /* Alt */ \
1621: 1+LINK_SIZE, /* Ket */ \
1622: 1+LINK_SIZE, /* KetRmax */ \
1623: 1+LINK_SIZE, /* KetRmin */ \
1624: 1+LINK_SIZE, /* KetRpos */ \
1625: 1+LINK_SIZE, /* Reverse */ \
1626: 1+LINK_SIZE, /* Assert */ \
1627: 1+LINK_SIZE, /* Assert not */ \
1628: 1+LINK_SIZE, /* Assert behind */ \
1629: 1+LINK_SIZE, /* Assert behind not */ \
1630: 1+LINK_SIZE, /* ONCE */ \
1631: 1+LINK_SIZE, /* ONCE_NC */ \
1632: 1+LINK_SIZE, /* BRA */ \
1633: 1+LINK_SIZE, /* BRAPOS */ \
1634: 3+LINK_SIZE, /* CBRA */ \
1635: 3+LINK_SIZE, /* CBRAPOS */ \
1636: 1+LINK_SIZE, /* COND */ \
1637: 1+LINK_SIZE, /* SBRA */ \
1638: 1+LINK_SIZE, /* SBRAPOS */ \
1639: 3+LINK_SIZE, /* SCBRA */ \
1640: 3+LINK_SIZE, /* SCBRAPOS */ \
1641: 1+LINK_SIZE, /* SCOND */ \
1642: 3, 3, /* CREF, NCREF */ \
1643: 3, 3, /* RREF, NRREF */ \
1644: 1, /* DEF */ \
1645: 1, 1, 1, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ \
1646: 3, 1, 3, /* MARK, PRUNE, PRUNE_ARG */ \
1647: 1, 3, /* SKIP, SKIP_ARG */ \
1648: 1, 3, /* THEN, THEN_ARG */ \
1649: 1, 1, 1, 1, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */ \
1650: 3, 1 /* CLOSE, SKIPZERO */
1651:
1652: /* A magic value for OP_RREF and OP_NRREF to indicate the "any recursion"
1653: condition. */
1654:
1655: #define RREF_ANY 0xffff
1656:
1657: /* Compile time error code numbers. They are given names so that they can more
1658: easily be tracked. When a new number is added, the table called eint in
1659: pcreposix.c must be updated. */
1660:
1661: enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
1662: ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19,
1663: ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29,
1664: ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
1665: ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
1666: ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
1667: ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69,
1668: ERR70, ERR71, ERR72, ERRCOUNT }; /* ERRCOUNT == 73: the number of codes ERR0..ERR72, not an error itself */
1669:
1670: /* The real format of the start of the pcre block; the index of names and the
1671: code vector run on as long as necessary after the end. We store an explicit
1672: offset to the name table so that if a regex is compiled on one host, saved, and
1673: then run on another where the size of pointers is different, all might still
1674: be well. For the case of compiled-on-4 and run-on-8, we include an extra
1675: pointer that is always NULL. For future-proofing, a few dummy fields were
1676: originally included - even though you can never get this planning right - but
1677: there is only one left now.
1678:
1679: NOTE NOTE NOTE:
1680: Because people can now save and re-use compiled patterns, any additions to this
1681: structure should be made at the end, and something earlier (e.g. a new
1682: flag in the options or one of the dummy fields) should indicate that the new
1683: fields are present. Currently PCRE always sets the dummy fields to zero.
1684: NOTE NOTE NOTE
1685: */
1686:
1687: typedef struct real_pcre {
1688: pcre_uint32 magic_number;
1689: pcre_uint32 size; /* Total that was malloced */
1690: pcre_uint32 options; /* Public options */
1691: pcre_uint16 flags; /* Private flags */
1692: pcre_uint16 dummy1; /* For future use */
1693: pcre_uint16 top_bracket;
1694: pcre_uint16 top_backref;
1695: pcre_uint16 first_byte;
1696: pcre_uint16 req_byte;
1697: pcre_uint16 name_table_offset; /* Offset to name table that follows */
1698: pcre_uint16 name_entry_size; /* Size of any name items */
1699: pcre_uint16 name_count; /* Number of name items */
1700: pcre_uint16 ref_count; /* Reference count */
1701:
1702: const unsigned char *tables; /* Pointer to tables or NULL for std */
1703: const unsigned char *nullpad; /* NULL padding */
1704: } real_pcre;
1705:
1706: /* The format of the block used to store data from pcre_study(). The same
1707: remark (see NOTE above) about extending this structure applies. */
1708:
1709: typedef struct pcre_study_data {
1710: pcre_uint32 size; /* Total that was malloced */
1711: pcre_uint32 flags; /* Private flags */
1712: uschar start_bits[32]; /* Starting char bits */
1713: pcre_uint32 minlength; /* Minimum subject length */
1714: } pcre_study_data;
1715:
1716: /* Structure for building a chain of open capturing subpatterns during
1717: compiling, so that instructions to close them can be compiled when (*ACCEPT) is
1718: encountered. This is also used to identify subpatterns that contain recursive
1719: back references to themselves, so that they can be made atomic. */
1720:
1721: typedef struct open_capitem {
1722: struct open_capitem *next; /* Chain link */
1723: pcre_uint16 number; /* Capture number */
1724: pcre_uint16 flag; /* Set TRUE if recursive back ref */
1725: } open_capitem;
1726:
1727: /* Structure for passing "static" information around between the functions
1728: doing the compiling, so that they are thread-safe. */
1729:
1730: typedef struct compile_data {
1731: const uschar *lcc; /* Points to lower casing table */
1732: const uschar *fcc; /* Points to case-flipping table */
1733: const uschar *cbits; /* Points to character type table */
1734: const uschar *ctypes; /* Points to table of type maps */
1735: const uschar *start_workspace;/* The start of working space */
1736: const uschar *start_code; /* The start of the compiled code */
1737: const uschar *start_pattern; /* The start of the pattern */
1738: const uschar *end_pattern; /* The end of the pattern */
1739: open_capitem *open_caps; /* Chain of open capture items */
1740: uschar *hwm; /* High watermark of workspace */
1741: uschar *name_table; /* The name/number table */
1742: int names_found; /* Number of entries so far */
1743: int name_entry_size; /* Size of each entry */
1744: int workspace_size; /* Size of workspace */
1745: int bracount; /* Count of capturing parens as we compile */
1746: int final_bracount; /* Saved value after first pass */
1747: int top_backref; /* Maximum back reference */
1748: unsigned int backref_map; /* Bitmap of low back refs */
1749: int assert_depth; /* Depth of nested assertions */
1750: int external_options; /* External (initial) options */
1751: int external_flags; /* External flag bits to be set */
1752: int req_varyopt; /* "After variable item" flag for reqbyte */
1753: BOOL had_accept; /* (*ACCEPT) encountered */
1754: BOOL check_lookbehind; /* Lookbehinds need later checking */
1755: int nltype; /* Newline type */
1756: int nllen; /* Newline string length */
1757: uschar nl[4]; /* Newline string when fixed length */
1758: } compile_data;
1759:
1760: /* Structure for maintaining a chain of pointers to the currently incomplete
1761: branches, for testing for left recursion while compiling. */
1762:
1763: typedef struct branch_chain {
1764: struct branch_chain *outer;
1765: uschar *current_branch;
1766: } branch_chain;
1767:
1768: /* Structure for items in a linked list that represents an explicit recursive
1769: call within the pattern; used by pcre_exec(). */
1770:
1771: typedef struct recursion_info {
1772: struct recursion_info *prevrec; /* Previous recursion record (or NULL) */
1773: int group_num; /* Number of group that was called */
1774: int *offset_save; /* Pointer to start of saved offsets */
1775: int saved_max; /* Number of saved offsets */
1776: USPTR subject_position; /* Position at start of recursion */
1777: } recursion_info;
1778:
1779: /* A similar structure for pcre_dfa_exec(). */
1780:
1781: typedef struct dfa_recursion_info {
1782: struct dfa_recursion_info *prevrec;
1783: int group_num;
1784: USPTR subject_position;
1785: } dfa_recursion_info;
1786:
1787: /* Structure for building a chain of data for holding the values of the subject
1788: pointer at the start of each subpattern, so as to detect when an empty string
1789: has been matched by a subpattern - to break infinite loops; used by
1790: pcre_exec(). */
1791:
1792: typedef struct eptrblock {
1793: struct eptrblock *epb_prev;
1794: USPTR epb_saved_eptr;
1795: } eptrblock;
1796:
1797:
1798: /* Structure for passing "static" information around between the functions
1799: doing traditional NFA matching, so that they are thread-safe. */
1800:
1801: typedef struct match_data {
1802: unsigned long int match_call_count; /* As it says */
1803: unsigned long int match_limit; /* As it says */
1804: unsigned long int match_limit_recursion; /* As it says */
1805: int *offset_vector; /* Offset vector */
1806: int offset_end; /* One past the end */
1807: int offset_max; /* The maximum usable for return data */
1808: int nltype; /* Newline type */
1809: int nllen; /* Newline string length */
1810: int name_count; /* Number of names in name table */
1811: int name_entry_size; /* Size of entry in names table */
1812: uschar *name_table; /* Table of names */
1813: uschar nl[4]; /* Newline string when fixed */
1814: const uschar *lcc; /* Points to lower casing table */
1815: const uschar *ctypes; /* Points to table of type maps */
1816: BOOL offset_overflow; /* Set if too many extractions */
1817: BOOL notbol; /* NOTBOL flag */
1818: BOOL noteol; /* NOTEOL flag */
1819: BOOL utf8; /* UTF8 flag */
1820: BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */
1821: BOOL use_ucp; /* PCRE_UCP flag */
1822: BOOL endonly; /* Dollar not before final \n */
1823: BOOL notempty; /* Empty string match not wanted */
1824: BOOL notempty_atstart; /* Empty string match at start not wanted */
1825: BOOL hitend; /* Hit the end of the subject at some point */
1826: BOOL bsr_anycrlf; /* \R is just any CRLF, not full Unicode */
1827: BOOL hasthen; /* Pattern contains (*THEN) */
1828: BOOL ignore_skip_arg; /* For re-run when SKIP name not found */
1829: const uschar *start_code; /* For use when recursing */
1830: USPTR start_subject; /* Start of the subject string */
1831: USPTR end_subject; /* End of the subject string */
1832: USPTR start_match_ptr; /* Start of matched string */
1833: USPTR end_match_ptr; /* Subject position at end match */
1834: USPTR start_used_ptr; /* Earliest consulted character */
1835: int partial; /* PARTIAL options */
1836: int end_offset_top; /* Highwater mark at end of match */
1837: int capture_last; /* Most recent capture number */
1838: int start_offset; /* The start offset value */
1839: int match_function_type; /* Set for certain special calls of MATCH() */
1840: eptrblock *eptrchain; /* Chain of eptrblocks for tail recursions */
1841: int eptrn; /* Next free eptrblock */
1842: recursion_info *recursive; /* Linked list of recursion data */
1843: void *callout_data; /* To pass back to callouts */
1844: const uschar *mark; /* Mark pointer to pass back on success */
1845: const uschar *nomatch_mark; /* Mark pointer to pass back on failure */
1846: const uschar *once_target; /* Where to back up to for atomic groups */
1847: } match_data;
1848:
1849: /* A similar structure is used for the same purpose by the DFA matching
1850: functions. */
1851:
1852: typedef struct dfa_match_data {
1853: const uschar *start_code; /* Start of the compiled pattern */
1854: const uschar *start_subject; /* Start of the subject string */
1855: const uschar *end_subject; /* End of subject string */
1856: const uschar *start_used_ptr; /* Earliest consulted character */
1857: const uschar *tables; /* Character tables */
1858: int start_offset; /* The start offset value */
1859: int moptions; /* Match options */
1860: int poptions; /* Pattern options */
1861: int nltype; /* Newline type */
1862: int nllen; /* Newline string length */
1863: uschar nl[4]; /* Newline string when fixed */
1864: void *callout_data; /* To pass back to callouts */
1865: dfa_recursion_info *recursive; /* Linked list of recursion data */
1866: } dfa_match_data;
1867:
/* Bit values used in the entries of the pcre_ctypes table. Each one is a
distinct single bit, so they can be combined in one byte. */

#define ctype_space   0x01
#define ctype_letter  0x02
#define ctype_digit   0x04
#define ctype_xdigit  0x08
#define ctype_word    0x10   /* alphanumeric or '_' */
#define ctype_meta    0x80   /* regexp meta char or zero (end pattern) */
1876:
/* Offsets of the individual bitmap tables within pcre_cbits. Every table
holds the set of bits for one class map, and the offsets are 32 apart. Some
classes are built by combining these tables. */

#define cbit_space     0      /* [:space:] or \s */
#define cbit_xdigit   32      /* [:xdigit:] */
#define cbit_digit    64      /* [:digit:] or \d */
#define cbit_upper    96      /* [:upper:] */
#define cbit_lower   128      /* [:lower:] */
#define cbit_word    160      /* [:word:] or \w */
#define cbit_graph   192      /* [:graph:] */
#define cbit_print   224      /* [:print:] */
#define cbit_punct   256      /* [:punct:] */
#define cbit_cntrl   288      /* [:cntrl:] */
#define cbit_length  320      /* Length of the whole cbits table */
1891:
/* Where each table sits relative to the base tables pointer, followed by the
total length. The ctypes table comes directly after the cbits table, whose
length is given by cbit_length. */

#define lcc_offset      0
#define fcc_offset    256
#define cbits_offset  512
#define ctypes_offset (cbits_offset + cbit_length)
#define tables_length (ctypes_offset + 256)
1900:
1901: /* Layout of the UCP type table that translates property names into types and
1902: codes. Each entry used to point directly to a name, but to reduce the number of
1903: relocations in shared libraries, it now has an offset into a single string
1904: instead. */
1905:
1906: typedef struct {
1907: pcre_uint16 name_offset;
1908: pcre_uint16 type;
1909: pcre_uint16 value;
1910: } ucp_type_table;
1911:
1912:
1913: /* Internal shared data tables. These are tables that are used by more than one
1914: of the exported public functions. They have to be "external" in the C sense,
1915: but are not part of the PCRE public API. The data for these tables is in the
1916: pcre_tables.c module. */
1917:
1918: extern const int _pcre_utf8_table1[];
1919: extern const int _pcre_utf8_table2[];
1920: extern const int _pcre_utf8_table3[];
1921: extern const uschar _pcre_utf8_table4[];
1922:
1923: #ifdef SUPPORT_JIT
1924: extern const uschar _pcre_utf8_char_sizes[];
1925: #endif
1926:
1927: extern const int _pcre_utf8_table1_size;
1928:
1929: extern const char _pcre_utt_names[];
1930: extern const ucp_type_table _pcre_utt[];
1931: extern const int _pcre_utt_size;
1932:
1933: extern const uschar _pcre_default_tables[];
1934:
1935: extern const uschar _pcre_OP_lengths[];
1936:
1937:
1938: /* Internal shared functions. These are functions that are used by more than
1939: one of the exported public functions. They have to be "external" in the C
1940: sense, but are not part of the PCRE public API. */
1941:
1942: extern const uschar *_pcre_find_bracket(const uschar *, BOOL, int);
1943: extern BOOL _pcre_is_newline(USPTR, int, USPTR, int *, BOOL);
1944: extern int _pcre_ord2utf8(int, uschar *);
1945: extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *,
1946: const pcre_study_data *, pcre_study_data *);
1947: extern int _pcre_valid_utf8(USPTR, int, int *);
1948: extern BOOL _pcre_was_newline(USPTR, int, USPTR, int *, BOOL);
1949: extern BOOL _pcre_xclass(int, const uschar *);
1950:
1951: #ifdef SUPPORT_JIT
1952: extern void _pcre_jit_compile(const real_pcre *, pcre_extra *);
1953: extern int _pcre_jit_exec(const real_pcre *, void *, PCRE_SPTR,
1954: int, int, int, int, int *, int);
1955: extern void _pcre_jit_free(void *);
1956: extern int _pcre_jit_get_size(void *);
1957: #endif
1958:
1959: /* Unicode character database (UCD) */
1960:
1961: typedef struct {
1962: uschar script;
1963: uschar chartype;
1964: pcre_int32 other_case;
1965: } ucd_record;
1966:
1967: extern const ucd_record _pcre_ucd_records[];
1968: extern const uschar _pcre_ucd_stage1[];
1969: extern const pcre_uint16 _pcre_ucd_stage2[];
1970: extern const int _pcre_ucp_gentype[];
1971: #ifdef SUPPORT_JIT
1972: extern const int _pcre_ucp_typerange[];
1973: #endif
1974:
/* UCD access macros.

GET_UCD(ch) yields a pointer to the ucd_record for character ch by a two-stage
lookup: _pcre_ucd_stage1 is indexed by ch / UCD_BLOCK_SIZE to select a block,
the entry within that block at ch % UCD_BLOCK_SIZE is read from
_pcre_ucd_stage2, and that value indexes _pcre_ucd_records.

The remaining macros pick one property out of the record. UCD_OTHERCASE adds
the record's other_case value to ch. Every use of the argument and every
expansion is parenthesized, so that an argument such as c & 0xff, or an
operator next to the macro call, cannot regroup the expression. Note that ch is
evaluated more than once, so it must not have side effects. */

#define UCD_BLOCK_SIZE 128
#define GET_UCD(ch) (_pcre_ucd_records + \
        _pcre_ucd_stage2[_pcre_ucd_stage1[(ch) / UCD_BLOCK_SIZE] * \
        UCD_BLOCK_SIZE + (ch) % UCD_BLOCK_SIZE])

#define UCD_CHARTYPE(ch)  (GET_UCD(ch)->chartype)
#define UCD_SCRIPT(ch)    (GET_UCD(ch)->script)
#define UCD_CATEGORY(ch)  (_pcre_ucp_gentype[UCD_CHARTYPE(ch)])
#define UCD_OTHERCASE(ch) ((ch) + GET_UCD(ch)->other_case)
1986:
1987: #endif
1988:
1989: /* End of pcre_internal.h */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>