Annotation of embedaddon/php/ext/pcre/pcrelib/pcre_internal.h, revision 1.1.1.1
1.1 misho 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5:
6: /* PCRE is a library of functions to support regular expressions whose syntax
7: and semantics are as close as possible to those of the Perl 5 language.
8:
9: Written by Philip Hazel
10: Copyright (c) 1997-2010 University of Cambridge
11:
12: -----------------------------------------------------------------------------
13: Redistribution and use in source and binary forms, with or without
14: modification, are permitted provided that the following conditions are met:
15:
16: * Redistributions of source code must retain the above copyright notice,
17: this list of conditions and the following disclaimer.
18:
19: * Redistributions in binary form must reproduce the above copyright
20: notice, this list of conditions and the following disclaimer in the
21: documentation and/or other materials provided with the distribution.
22:
23: * Neither the name of the University of Cambridge nor the names of its
24: contributors may be used to endorse or promote products derived from
25: this software without specific prior written permission.
26:
27: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37: POSSIBILITY OF SUCH DAMAGE.
38: -----------------------------------------------------------------------------
39: */
40:
41: /* This header contains definitions that are shared between the different
42: modules, but which are not relevant to the exported API. This includes some
43: functions whose names all begin with "_pcre_". */
44:
45: #ifndef PCRE_INTERNAL_H
46: #define PCRE_INTERNAL_H
47:
48: /* Define PCRE_DEBUG to get debugging output on stdout. */
49:
50: #if 0
51: #define PCRE_DEBUG
52: #endif
53:
54: /* We do not support both EBCDIC and UTF-8 at the same time. The "configure"
55: script prevents both being selected, but not everybody uses "configure". */
56:
57: #if defined EBCDIC && defined SUPPORT_UTF8
58: #error The use of both EBCDIC and SUPPORT_UTF8 is not supported.
59: #endif
60:
61: /* If SUPPORT_UCP is defined, SUPPORT_UTF8 must also be defined. The
62: "configure" script ensures this, but not everybody uses "configure". */
63:
64: #if defined SUPPORT_UCP && !defined SUPPORT_UTF8
65: #define SUPPORT_UTF8 1
66: #endif
67:
68: /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
69: inline, and there are *still* stupid compilers about that don't like indented
70: pre-processor statements, or at least there were when I first wrote this. After
71: all, it had only been about 10 years then...
72:
73: It turns out that the Mac Debugging.h header also defines the macro DPRINTF, so
74: be absolutely sure we get our version. */
75:
/* Debug tracing hook. Any DPRINTF already supplied by another header is
discarded so that this definition is the one in force. Calls pass a
parenthesized printf argument list, e.g. DPRINTF(("n=%d\n", n)); when
PCRE_DEBUG is not defined the whole call expands to nothing. */

#if defined(DPRINTF)
#undef DPRINTF
#endif

#if !defined(PCRE_DEBUG)
#define DPRINTF(p) /* Nothing */
#else
#define DPRINTF(p) printf p
#endif
82:
83:
84: /* Standard C headers plus the external interface definition. The only time
85: setjmp and stdarg are used is when NO_RECURSE is set. */
86:
87: #include <ctype.h>
88: #include <limits.h>
89: #include <stddef.h>
90: #include <stdio.h>
91: #include <stdlib.h>
92: #include <string.h>
93:
94: /* When compiling a DLL for Windows, the exported symbols have to be declared
95: using some MS magic. I found some useful information on this web page:
96: http://msdn2.microsoft.com/en-us/library/y4h7bcy6(VS.80).aspx. According to the
97: information there, using __declspec(dllexport) without "extern" we have a
98: definition; with "extern" we have a declaration. The settings here override the
99: setting in pcre.h (which is included below); it defines only PCRE_EXP_DECL,
100: which is all that is needed for applications (they just import the symbols). We
101: use:
102:
103: PCRE_EXP_DECL for declarations
104: PCRE_EXP_DEFN for definitions of exported functions
105: PCRE_EXP_DATA_DEFN for definitions of exported variables
106:
107: The reason for the two DEFN macros is that in non-Windows environments, one
108: does not want to have "extern" before variable definitions because it leads to
109: compiler warnings. So we distinguish between functions and variables. In
110: Windows, the two should always be the same.
111:
112: The reason for wrapping this in #ifndef PCRE_EXP_DECL is so that pcretest,
113: which is an application, but needs to import this file in order to "peek" at
114: internals, can #include pcre.h first to get an application's-eye view.
115:
116: In principle, people compiling for non-Windows, non-Unix-like (i.e. uncommon,
117: special-purpose environments) might want to stick other stuff in front of
118: exported symbols. That's why, in the non-Windows case, we set PCRE_EXP_DEFN and
119: PCRE_EXP_DATA_DEFN only if they are not already set. */
120:
/* Export decoration. Nothing here is touched when PCRE_EXP_DECL has already
been defined before this point. */

#if !defined(PCRE_EXP_DECL)
#  if defined(_WIN32) && !defined(PCRE_STATIC)
     /* Windows, PCRE_STATIC not defined: symbols are exported. */
#    define PCRE_EXP_DECL       extern __declspec(dllexport)
#    define PCRE_EXP_DEFN       __declspec(dllexport)
#    define PCRE_EXP_DATA_DEFN  __declspec(dllexport)
#  elif defined(_WIN32)
     /* Windows with PCRE_STATIC: no decoration on definitions. */
#    define PCRE_EXP_DECL       extern
#    define PCRE_EXP_DEFN
#    define PCRE_EXP_DATA_DEFN
#  else
     /* Everything else: the two DEFN macros are set only if not already
     defined, so that a special-purpose environment can supply its own. */
#    if !defined(__cplusplus)
#      define PCRE_EXP_DECL     extern
#    else
#      define PCRE_EXP_DECL     extern "C"
#    endif
#    if !defined(PCRE_EXP_DEFN)
#      define PCRE_EXP_DEFN     PCRE_EXP_DECL
#    endif
#    if !defined(PCRE_EXP_DATA_DEFN)
#      define PCRE_EXP_DATA_DEFN
#    endif
#  endif
#endif
146:
147: /* When compiling with the MSVC compiler, it is sometimes necessary to include
148: a "calling convention" before exported function names. (This is secondhand
149: information; I know nothing about MSVC myself). For example, something like
150:
151: void __cdecl function(....)
152:
153: might be needed. In order to make this easy, all the exported functions have
154: PCRE_CALL_CONVENTION just before their names. It is rarely needed; if not
155: set, we ensure here that it has no effect. */
156:
157: #ifndef PCRE_CALL_CONVENTION
158: #define PCRE_CALL_CONVENTION
159: #endif
160:
161: /* We need to have types that specify unsigned 16-bit and 32-bit integers. We
162: cannot determine these outside the compilation (e.g. by running a program as
163: part of "configure") because PCRE is often cross-compiled for use on other
164: systems. Instead we make use of the maximum sizes that are available at
165: preprocessor time in standard C environments. */
166:
/* Exact-width integer types, chosen at preprocessing time from the <limits.h>
maxima. The 32-bit comparisons use unsigned-suffixed constants: an unsuffixed
4294967295 does not fit in a 32-bit int or long, which makes its type depend on
the language version and provokes diagnostics from some compilers. */

#if USHRT_MAX == 65535
  typedef unsigned short pcre_uint16;
  typedef short pcre_int16;
#elif UINT_MAX == 65535
  typedef unsigned int pcre_uint16;
  typedef int pcre_int16;
#else
  #error Cannot determine a type for 16-bit unsigned integers
#endif

#if UINT_MAX == 4294967295U
  typedef unsigned int pcre_uint32;
  typedef int pcre_int32;
#elif ULONG_MAX == 4294967295UL
  typedef unsigned long int pcre_uint32;
  typedef long int pcre_int32;
#else
  #error Cannot determine a type for 32-bit unsigned integers
#endif
186:
187: /* When checking for integer overflow in pcre_compile(), we need to handle
188: large integers. If a 64-bit integer type is available, we can use that.
189: Otherwise we have to cast to double, which of course requires floating point
190: arithmetic. Handle this by defining a macro for the appropriate type. If
191: stdint.h is available, include it; it may define INT64_MAX. Systems that do not
192: have stdint.h (e.g. Solaris) may have inttypes.h. The macro int64_t may be set
193: by "configure". */
194:
/* Pull in whichever header may provide a 64-bit integer type. */

#if defined(PHP_WIN32)
#include "win32/php_stdint.h"
#else
#if HAVE_STDINT_H
#include <stdint.h>
#elif HAVE_INTTYPES_H
#include <inttypes.h>
#endif
#endif

/* INT64_OR_DOUBLE is int64_t when either INT64_MAX or int64_t is defined as a
macro at this point; otherwise it falls back to double. */

#if !defined(INT64_MAX) && !defined(int64_t)
#define INT64_OR_DOUBLE double
#else
#define INT64_OR_DOUBLE int64_t
#endif
208:
209: /* All character handling must be done as unsigned characters. Otherwise there
210: are problems with top-bit-set characters and functions such as isspace().
211: However, we leave the interface to the outside world as char *, because that
212: should make things easier for callers. We define a short type for unsigned char
213: to save lots of typing. I tried "uchar", but it causes problems on Digital
214: Unix, where it is defined in sys/types, so use "uschar" instead. */
215:
216: typedef unsigned char uschar;
217:
218: /* This is an unsigned int value that no character can ever have. UTF-8
219: characters only go up to 0x7fffffff (though Unicode doesn't go beyond
220: 0x0010ffff). */
221:
222: #define NOTACHAR 0xffffffff
223:
224: /* PCRE is able to support several different kinds of newline (CR, LF, CRLF,
225: "any" and "anycrlf" at present). The following macros are used to package up
226: testing for newlines. NLBLOCK, PSSTART, and PSEND are defined in the various
227: modules to indicate in which datablock the parameters exist, and what the
228: start/end of string field names are. */
229:
/* Kinds of newline convention held in NLBLOCK->nltype. */

#define NLTYPE_FIXED    0     /* Newline is a fixed length string */
#define NLTYPE_ANY      1     /* Newline is any Unicode line ending */
#define NLTYPE_ANYCRLF  2     /* Newline is CR, LF, or CRLF */

/* Is there a newline starting at p? For a fixed newline the one or two bytes
in NLBLOCK->nl are compared directly, provided nllen bytes remain before
PSEND. Otherwise the decision is delegated to _pcre_is_newline(), which is
also handed the address of NLBLOCK->nllen. The argument p is evaluated more
than once, and a variable called utf8 must be in scope. */

#define IS_NEWLINE(p) \
  ((NLBLOCK->nltype == NLTYPE_FIXED)? \
    ( \
    (p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \
    (p)[0] == NLBLOCK->nl[0] && \
    (NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1]) \
    ) \
  : \
    ( \
    (p) < NLBLOCK->PSEND && \
    _pcre_is_newline((p), NLBLOCK->nltype, NLBLOCK->PSEND, \
      &(NLBLOCK->nllen), utf8) \
    ) \
  )

/* Does a newline end immediately before p? This mirrors IS_NEWLINE, looking
backwards and bounded by PSSTART instead of PSEND. */

#define WAS_NEWLINE(p) \
  ((NLBLOCK->nltype == NLTYPE_FIXED)? \
    ( \
    (p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \
    (p)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \
    (NLBLOCK->nllen == 1 || (p)[-NLBLOCK->nllen+1] == NLBLOCK->nl[1]) \
    ) \
  : \
    ( \
    (p) > NLBLOCK->PSSTART && \
    _pcre_was_newline((p), NLBLOCK->nltype, NLBLOCK->PSSTART, \
      &(NLBLOCK->nllen), utf8) \
    ) \
  )
261:
262: /* When PCRE is compiled as a C++ library, the subject pointer can be replaced
263: with a custom type. This makes it possible, for example, to allow pcre_exec()
264: to process subject strings that are discontinuous by using a smart pointer
265: class. It must always be possible to inspect all of the subject string in
266: pcre_exec() because of the way it backtracks. Two macros are required in the
267: normal case, for sign-unspecified and unsigned char pointers. The former is
268: used for the external interface and appears in pcre.h, which is why its name
269: must begin with PCRE_. */
270:
/* Subject pointer types: PCRE_SPTR for the external interface and USPTR for
the unsigned-char view. Both become CUSTOM_SUBJECT_PTR when that is defined. */

#if !defined(CUSTOM_SUBJECT_PTR)
#define PCRE_SPTR const char *
#define USPTR const unsigned char *
#else
#define PCRE_SPTR CUSTOM_SUBJECT_PTR
#define USPTR CUSTOM_SUBJECT_PTR
#endif
278:
279:
280:
281: /* Include the public PCRE header and the definitions of UCP character property
282: values. */
283:
284: #include "pcre.h"
285: #include "ucp.h"
286:
287: /* When compiling for use with the Virtual Pascal compiler, these functions
288: need to have their names changed. PCRE must be compiled with the -DVPCOMPAT
289: option on the command line. */
290:
291: #ifdef VPCOMPAT
292: #define strlen(s) _strlen(s)
293: #define strncmp(s1,s2,m) _strncmp(s1,s2,m)
294: #define memcmp(s,c,n) _memcmp(s,c,n)
295: #define memcpy(d,s,n) _memcpy(d,s,n)
296: #define memmove(d,s,n) _memmove(d,s,n)
297: #define memset(s,c,n) _memset(s,c,n)
298: #else /* VPCOMPAT */
299:
300: /* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
301: define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY
302: is set. Otherwise, include an emulating function for those systems that have
303: neither (there are some non-Unix environments where this is the case). */
304:
305: #ifndef HAVE_MEMMOVE
306: #undef memmove /* some systems may have a macro */
307: #ifdef HAVE_BCOPY
308: #define memmove(a, b, c) bcopy(b, a, c)
309: #else /* HAVE_BCOPY */
/* Fallback memmove(): copies n bytes from s to d and returns d. When the
destination lies above the source the bytes are copied from the top down, so
that an overlapping source is not overwritten before it has been read;
otherwise they are copied from the bottom up. */

static void *
pcre_memmove(void *d, const void *s, size_t n)
{
unsigned char *out = (unsigned char *)d;
const unsigned char *in = (const unsigned char *)s;
size_t left = n;

if (out > in)
  {
  out += n;
  in += n;
  while (left-- > 0) *(--out) = *(--in);
  }
else
  {
  while (left-- > 0) *out++ = *in++;
  }

return d;
}
329: #define memmove(a, b, c) pcre_memmove(a, b, c)
330: #endif /* not HAVE_BCOPY */
331: #endif /* not HAVE_MEMMOVE */
332: #endif /* not VPCOMPAT */
333:
334:
335: /* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored
336: in big-endian order) by default. These are used, for example, to link from the
337: start of a subpattern to its alternatives and its end. The use of 2 bytes per
338: offset limits the size of the compiled regex to around 64K, which is big enough
339: for almost everybody. However, I received a request for an even bigger limit.
340: For this reason, and also to make the code easier to maintain, the storing and
341: loading of offsets from the byte string is now handled by the macros that are
342: defined here.
343:
344: The macros are controlled by the value of LINK_SIZE. This defaults to 2 in
345: the config.h file, but can be overridden by using -D on the command line. This
346: is automated on Unix systems via the "configure" command. */
347:
/* Store (PUT) and load (GET) a LINK_SIZE-byte offset, most significant byte
first, at index n of the byte array a. MAX_PATTERN_SIZE is the matching limit
on the size of the compiled code. Each PUT is a single parenthesized comma
expression with its array argument parenthesized, so it is safe as the body of
an unbraced "if" and with a pointer expression such as "code + 1". The
arguments are evaluated more than once. */

#if LINK_SIZE == 2

#define PUT(a,n,d)   \
  ((a)[n] = (d) >> 8, \
  (a)[(n)+1] = (d) & 255)

#define GET(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

#define MAX_PATTERN_SIZE (1 << 16)


#elif LINK_SIZE == 3

#define PUT(a,n,d)       \
  ((a)[n] = (d) >> 16,    \
  (a)[(n)+1] = (d) >> 8, \
  (a)[(n)+2] = (d) & 255)

#define GET(a,n) \
  (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])

#define MAX_PATTERN_SIZE (1 << 24)


#elif LINK_SIZE == 4

#define PUT(a,n,d)        \
  ((a)[n] = (d) >> 24,     \
  (a)[(n)+1] = (d) >> 16, \
  (a)[(n)+2] = (d) >> 8,  \
  (a)[(n)+3] = (d) & 255)

#define GET(a,n) \
  (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])

#define MAX_PATTERN_SIZE (1 << 30)   /* Keep it positive */


#else
#error LINK_SIZE must be either 2, 3, or 4
#endif


/* Convenience macro defined in terms of the others: store an offset, then
advance the pointer a past it. Here a must be a modifiable pointer. */

#define PUTINC(a,n,d)   (PUT(a,n,d), (a) += LINK_SIZE)
395:
396:
397: /* PCRE uses some other 2-byte quantities that do not change when the size of
398: offsets changes. These are used for repeat counts and for other things such as
399: capturing parenthesis numbers in back references. */
400:
/* Store (PUT2) and load (GET2) a 2-byte quantity, most significant byte
first, at index n of the byte array a. PUT2 is one parenthesized comma
expression rather than two statements, so both bytes stay under the control of
an enclosing unbraced "if" or loop. PUT2INC additionally advances the pointer
a by 2, so a must be a modifiable pointer there. The arguments are evaluated
more than once. */

#define PUT2(a,n,d)   \
  ((a)[n] = (d) >> 8, \
  (a)[(n)+1] = (d) & 255)

#define GET2(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

#define PUT2INC(a,n,d)  (PUT2(a,n,d), (a) += 2)
409:
410:
411: /* When UTF-8 encoding is being used, a character is no longer just a single
412: byte. The macros for character handling generate simple sequences when used in
413: byte-mode, and more complicated ones for UTF-8 characters. GETCHARLENTEST is
414: not used when UTF-8 is not supported, so it is not defined, and BACKCHAR should
415: never be called in byte mode. To make sure they can never even appear when
416: UTF-8 support is omitted, we don't even define them. */
417:
418: #ifndef SUPPORT_UTF8
/* Byte-mode versions: every character is exactly one byte, and GETCHARLEN
leaves len untouched. The arguments are parenthesized so that a pointer
expression such as "p + 1" or "*pp" is dereferenced (and, for the INC forms,
incremented) as a whole. The trailing semicolons are kept because they are part
of the existing expansion. */

#define GETCHAR(c, eptr) (c) = *(eptr);
#define GETCHARTEST(c, eptr) (c) = *(eptr);
#define GETCHARINC(c, eptr) (c) = *(eptr)++;
#define GETCHARINCTEST(c, eptr) (c) = *(eptr)++;
#define GETCHARLEN(c, eptr, len) (c) = *(eptr);
424: /* #define GETCHARLENTEST(c, eptr, len) */
425: /* #define BACKCHAR(eptr) */
426:
427: #else /* SUPPORT_UTF8 */
428:
429: /* These macros were originally written in the form of loops that used data
430: from the tables whose names start with _pcre_utf8_table. They were rewritten by
431: a user so as not to use loops, because in some environments this gives a
432: significant performance advantage, and it seems never to do any harm. */
433:
434: /* Base macro to pick up the remaining bytes of a UTF-8 character, not
435: advancing the pointer. */
436:
/* Base macro: c already holds the lead byte (>= 0xc0) found at eptr; fold in
the continuation bytes that follow it, leaving eptr where it is. Sequences of
two to six bytes are handled. Both arguments are parenthesized in the body so
that a pointer expression such as "p + 1" is indexed as a whole. c must be a
modifiable lvalue, and both arguments are evaluated more than once. */

#define GETUTF8(c, eptr) \
    { \
    if (((c) & 0x20) == 0) \
      (c) = (((c) & 0x1f) << 6) | ((eptr)[1] & 0x3f); \
    else if (((c) & 0x10) == 0) \
      (c) = (((c) & 0x0f) << 12) | (((eptr)[1] & 0x3f) << 6) | \
        ((eptr)[2] & 0x3f); \
    else if (((c) & 0x08) == 0) \
      (c) = (((c) & 0x07) << 18) | (((eptr)[1] & 0x3f) << 12) | \
        (((eptr)[2] & 0x3f) << 6) | ((eptr)[3] & 0x3f); \
    else if (((c) & 0x04) == 0) \
      (c) = (((c) & 0x03) << 24) | (((eptr)[1] & 0x3f) << 18) | \
        (((eptr)[2] & 0x3f) << 12) | (((eptr)[3] & 0x3f) << 6) | \
        ((eptr)[4] & 0x3f); \
    else \
      (c) = (((c) & 0x01) << 30) | (((eptr)[1] & 0x3f) << 24) | \
        (((eptr)[2] & 0x3f) << 18) | (((eptr)[3] & 0x3f) << 12) | \
        (((eptr)[4] & 0x3f) << 6) | ((eptr)[5] & 0x3f); \
    }

/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. The expansion is two statements ending in a
semicolon, so it must not be the sole body of an unbraced "if" or loop. */

#define GETCHAR(c, eptr) \
  (c) = *(eptr); \
  if ((c) >= 0xc0) GETUTF8(c, eptr);

/* As GETCHAR, but the multibyte decoding happens only when the variable utf8
(which must be in scope) is non-zero. */

#define GETCHARTEST(c, eptr) \
  (c) = *(eptr); \
  if (utf8 && (c) >= 0xc0) GETUTF8(c, eptr);
469:
470: /* Base macro to pick up the remaining bytes of a UTF-8 character, advancing
471: the pointer. */
472:
/* Base macro: c already holds a lead byte (>= 0xc0) and eptr points just past
it; fold in the continuation bytes and move eptr past them. The arguments are
parenthesized in the body so that an lvalue expression such as "*pp" is
dereferenced and advanced as a whole (a bare "*eptr++" would advance pp
instead). Both c and eptr must be modifiable lvalues. */

#define GETUTF8INC(c, eptr) \
    { \
    if (((c) & 0x20) == 0) \
      (c) = (((c) & 0x1f) << 6) | (*(eptr)++ & 0x3f); \
    else if (((c) & 0x10) == 0) \
      { \
      (c) = (((c) & 0x0f) << 12) | ((*(eptr) & 0x3f) << 6) | \
        ((eptr)[1] & 0x3f); \
      (eptr) += 2; \
      } \
    else if (((c) & 0x08) == 0) \
      { \
      (c) = (((c) & 0x07) << 18) | ((*(eptr) & 0x3f) << 12) | \
        (((eptr)[1] & 0x3f) << 6) | ((eptr)[2] & 0x3f); \
      (eptr) += 3; \
      } \
    else if (((c) & 0x04) == 0) \
      { \
      (c) = (((c) & 0x03) << 24) | ((*(eptr) & 0x3f) << 18) | \
        (((eptr)[1] & 0x3f) << 12) | (((eptr)[2] & 0x3f) << 6) | \
        ((eptr)[3] & 0x3f); \
      (eptr) += 4; \
      } \
    else \
      { \
      (c) = (((c) & 0x01) << 30) | ((*(eptr) & 0x3f) << 24) | \
        (((eptr)[1] & 0x3f) << 18) | (((eptr)[2] & 0x3f) << 12) | \
        (((eptr)[3] & 0x3f) << 6) | ((eptr)[4] & 0x3f); \
      (eptr) += 5; \
      } \
    }

/* Get the next UTF-8 character, advancing the pointer. This is called when we
know we are in UTF-8 mode. The expansion is two statements ending in a
semicolon, so it must not be the sole body of an unbraced "if" or loop. */

#define GETCHARINC(c, eptr) \
  (c) = *(eptr)++; \
  if ((c) >= 0xc0) GETUTF8INC(c, eptr);

/* As GETCHARINC, but the multibyte decoding happens only when the variable
utf8 (which must be in scope) is non-zero. */

#define GETCHARINCTEST(c, eptr) \
  (c) = *(eptr)++; \
  if (utf8 && (c) >= 0xc0) GETUTF8INC(c, eptr);
517:
518: /* Base macro to pick up the remaining bytes of a UTF-8 character, not
519: advancing the pointer, incrementing the length. */
520:
/* Base macro: c already holds the lead byte (>= 0xc0) found at eptr; fold in
the continuation bytes without moving eptr, and add the number of those extra
bytes to len. All three arguments are parenthesized in the body so that
expressions such as "p + 1" or "*plen" are used as a whole. c and len must be
modifiable lvalues. */

#define GETUTF8LEN(c, eptr, len) \
    { \
    if (((c) & 0x20) == 0) \
      { \
      (c) = (((c) & 0x1f) << 6) | ((eptr)[1] & 0x3f); \
      (len)++; \
      } \
    else if (((c) & 0x10) == 0) \
      { \
      (c) = (((c) & 0x0f) << 12) | (((eptr)[1] & 0x3f) << 6) | \
        ((eptr)[2] & 0x3f); \
      (len) += 2; \
      } \
    else if (((c) & 0x08) == 0) \
      { \
      (c) = (((c) & 0x07) << 18) | (((eptr)[1] & 0x3f) << 12) | \
        (((eptr)[2] & 0x3f) << 6) | ((eptr)[3] & 0x3f); \
      (len) += 3; \
      } \
    else if (((c) & 0x04) == 0) \
      { \
      (c) = (((c) & 0x03) << 24) | (((eptr)[1] & 0x3f) << 18) | \
        (((eptr)[2] & 0x3f) << 12) | (((eptr)[3] & 0x3f) << 6) | \
        ((eptr)[4] & 0x3f); \
      (len) += 4; \
      } \
    else \
      { \
      (c) = (((c) & 0x01) << 30) | (((eptr)[1] & 0x3f) << 24) | \
        (((eptr)[2] & 0x3f) << 18) | (((eptr)[3] & 0x3f) << 12) | \
        (((eptr)[4] & 0x3f) << 6) | ((eptr)[5] & 0x3f); \
      (len) += 5; \
      } \
    }

/* Get the next UTF-8 character, not advancing the pointer, incrementing length
if there are extra bytes. This is called when we know we are in UTF-8 mode. The
expansion is two statements ending in a semicolon, so it must not be the sole
body of an unbraced "if" or loop. */

#define GETCHARLEN(c, eptr, len) \
  (c) = *(eptr); \
  if ((c) >= 0xc0) GETUTF8LEN(c, eptr, len);

/* As GETCHARLEN, but the multibyte decoding happens only when the variable
utf8 (which must be in scope) is non-zero. */

#define GETCHARLENTEST(c, eptr, len) \
  (c) = *(eptr); \
  if (utf8 && (c) >= 0xc0) GETUTF8LEN(c, eptr, len);
569:
570: /* If the pointer is not at the start of a character, move it back until
571: it is. This is called only in UTF-8 mode - we don't put a test within the macro
572: because almost all calls are already within a block of UTF-8 only code. */
573:
/* Step eptr backwards while it points at a byte of the form 10xxxxxx (a UTF-8
continuation byte). The argument is parenthesized so that an lvalue expression
such as "*pp" is the thing decremented. There is no lower bound check; the
caller supplies the terminating semicolon. */

#define BACKCHAR(eptr) while((*(eptr) & 0xc0) == 0x80) (eptr)--
575:
576: #endif /* SUPPORT_UTF8 */
577:
578:
579: /* In case there is no definition of offsetof() provided - though any proper
580: Standard C system should have one. */
581:
/* Fallback used only when no offsetof macro is already defined: the address
of the field within a notional object placed at address zero, as a size_t. */

#if !defined(offsetof)
#define offsetof(p_type,field) ((size_t)&((p_type *)0)->field)
#endif
585:
586:
587: /* These are the public options that can change during matching. */
588:
589: #define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
590:
591: /* Private flags containing information about the compiled regex. They used to
592: live at the top end of the options word, but that got almost full, so now they
593: are in a 16-bit flags word. From release 8.00, PCRE_NOPARTIAL is unused, as
594: the restrictions on partial matching have been lifted. It remains for backwards
595: compatibility. */
596:
/* Bits of the private flags word. */

#define PCRE_NOPARTIAL     0x01  /* can't use partial with this regex */
#define PCRE_FIRSTSET      0x02  /* first_byte is set */
#define PCRE_REQCHSET      0x04  /* req_byte is set */
#define PCRE_STARTLINE     0x08  /* start after \n for multiline */
#define PCRE_JCHANGED      0x10  /* j option used in regex */
#define PCRE_HASCRORLF     0x20  /* explicit \r or \n in pattern */

/* Bits describing what the "extra" block produced by pcre_study() holds. */

#define PCRE_STUDY_MAPPED  0x01  /* a map of starting chars exists */
#define PCRE_STUDY_MINLEN  0x02  /* a minimum length field exists */
608:
609: /* Masks for identifying the public options that are permitted at compile
610: time, run time, or study time, respectively. */
611:
612: #define PCRE_NEWLINE_BITS (PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_ANY| \
613: PCRE_NEWLINE_ANYCRLF)
614:
615: #define PUBLIC_COMPILE_OPTIONS \
616: (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
617: PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
618: PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
619: PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
620: PCRE_JAVASCRIPT_COMPAT|PCRE_UCP|PCRE_NO_START_OPTIMIZE)
621:
622: #define PUBLIC_EXEC_OPTIONS \
623: (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \
624: PCRE_NO_UTF8_CHECK|PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_NEWLINE_BITS| \
625: PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE|PCRE_NO_START_OPTIMIZE)
626:
627: #define PUBLIC_DFA_EXEC_OPTIONS \
628: (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \
629: PCRE_NO_UTF8_CHECK|PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_DFA_SHORTEST| \
630: PCRE_DFA_RESTART|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
631: PCRE_NO_START_OPTIMIZE)
632:
633: #define PUBLIC_STUDY_OPTIONS 0 /* None defined */
634:
635: /* Magic number to provide a small check against being handed junk. Also used
636: to detect whether a pattern was compiled on a host of different endianness. */
637:
638: #define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */
639:
640: /* Negative values for the firstchar and reqchar variables */
641:
#define REQ_NONE   (-1)
#define REQ_UNSET  (-2)

/* The maximum remaining length of subject we are prepared to search for a
req_byte match. */

#define REQ_BYTE_MAX  1000

/* Flags added to firstbyte or reqbyte. A "non-literal" item is either a
variable-length repeat or anything other than literal characters. */

#define REQ_CASELESS  0x100   /* indicates caselessness */
#define REQ_VARY      0x200   /* reqbyte followed non-literal item */
655:
656: /* Miscellaneous definitions. The #ifndef is to pacify compiler warnings in
657: environments where these macros are defined elsewhere. Unfortunately, there
658: is no way to do the same for the typedef. */
659:
/* Boolean type and constants. FALSE and TRUE are guarded separately, so an
environment that already supplies only one of them still ends up with both. */

typedef int BOOL;

#ifndef FALSE
#define FALSE   0
#endif

#ifndef TRUE
#define TRUE    1
#endif
666:
667: /* If PCRE is to support UTF-8 on EBCDIC platforms, we cannot use normal
668: character constants like '*' because the compiler would emit their EBCDIC code,
669: which is different from their ASCII/UTF-8 code. Instead we define macros for
670: the characters so that they always use the ASCII/UTF-8 code when UTF-8 support
671: is enabled. When UTF-8 support is not enabled, the definitions use character
672: literals. Both character and string versions of each character are needed, and
673: there are some longer strings as well.
674:
675: This means that, on EBCDIC platforms, the PCRE library can handle either
676: EBCDIC, or UTF-8, but not both. To support both in the same compiled library
677: would need different lookups depending on whether PCRE_UTF8 was set or not.
678: This would make it impossible to use characters in switch/case statements,
679: which would reduce performance. For a theoretical use (which nobody has asked
680: for) in a minority area (EBCDIC platforms), this is not sensible. Any
681: application that did need both could compile two versions of the library, using
682: macros to give the functions distinct names. */
683:
684: #ifndef SUPPORT_UTF8
685:
686: /* UTF-8 support is not enabled; use the platform-dependent character literals
687: so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */
688:
689: #define CHAR_HT '\t'
690: #define CHAR_VT '\v'
691: #define CHAR_FF '\f'
692: #define CHAR_CR '\r'
693: #define CHAR_NL '\n'
694: #define CHAR_BS '\b'
695: #define CHAR_BEL '\a'
696: #ifdef EBCDIC
697: #define CHAR_ESC '\047'
698: #define CHAR_DEL '\007'
699: #else
700: #define CHAR_ESC '\033'
701: #define CHAR_DEL '\177'
702: #endif
703:
704: #define CHAR_SPACE ' '
705: #define CHAR_EXCLAMATION_MARK '!'
706: #define CHAR_QUOTATION_MARK '"'
707: #define CHAR_NUMBER_SIGN '#'
708: #define CHAR_DOLLAR_SIGN '$'
709: #define CHAR_PERCENT_SIGN '%'
710: #define CHAR_AMPERSAND '&'
711: #define CHAR_APOSTROPHE '\''
712: #define CHAR_LEFT_PARENTHESIS '('
713: #define CHAR_RIGHT_PARENTHESIS ')'
714: #define CHAR_ASTERISK '*'
715: #define CHAR_PLUS '+'
716: #define CHAR_COMMA ','
717: #define CHAR_MINUS '-'
718: #define CHAR_DOT '.'
719: #define CHAR_SLASH '/'
720: #define CHAR_0 '0'
721: #define CHAR_1 '1'
722: #define CHAR_2 '2'
723: #define CHAR_3 '3'
724: #define CHAR_4 '4'
725: #define CHAR_5 '5'
726: #define CHAR_6 '6'
727: #define CHAR_7 '7'
728: #define CHAR_8 '8'
729: #define CHAR_9 '9'
730: #define CHAR_COLON ':'
731: #define CHAR_SEMICOLON ';'
732: #define CHAR_LESS_THAN_SIGN '<'
733: #define CHAR_EQUALS_SIGN '='
734: #define CHAR_GREATER_THAN_SIGN '>'
735: #define CHAR_QUESTION_MARK '?'
736: #define CHAR_COMMERCIAL_AT '@'
737: #define CHAR_A 'A'
738: #define CHAR_B 'B'
739: #define CHAR_C 'C'
740: #define CHAR_D 'D'
741: #define CHAR_E 'E'
742: #define CHAR_F 'F'
743: #define CHAR_G 'G'
744: #define CHAR_H 'H'
745: #define CHAR_I 'I'
746: #define CHAR_J 'J'
747: #define CHAR_K 'K'
748: #define CHAR_L 'L'
749: #define CHAR_M 'M'
750: #define CHAR_N 'N'
751: #define CHAR_O 'O'
752: #define CHAR_P 'P'
753: #define CHAR_Q 'Q'
754: #define CHAR_R 'R'
755: #define CHAR_S 'S'
756: #define CHAR_T 'T'
757: #define CHAR_U 'U'
758: #define CHAR_V 'V'
759: #define CHAR_W 'W'
760: #define CHAR_X 'X'
761: #define CHAR_Y 'Y'
762: #define CHAR_Z 'Z'
763: #define CHAR_LEFT_SQUARE_BRACKET '['
764: #define CHAR_BACKSLASH '\\'
765: #define CHAR_RIGHT_SQUARE_BRACKET ']'
766: #define CHAR_CIRCUMFLEX_ACCENT '^'
767: #define CHAR_UNDERSCORE '_'
768: #define CHAR_GRAVE_ACCENT '`'
769: #define CHAR_a 'a'
770: #define CHAR_b 'b'
771: #define CHAR_c 'c'
772: #define CHAR_d 'd'
773: #define CHAR_e 'e'
774: #define CHAR_f 'f'
775: #define CHAR_g 'g'
776: #define CHAR_h 'h'
777: #define CHAR_i 'i'
778: #define CHAR_j 'j'
779: #define CHAR_k 'k'
780: #define CHAR_l 'l'
781: #define CHAR_m 'm'
782: #define CHAR_n 'n'
783: #define CHAR_o 'o'
784: #define CHAR_p 'p'
785: #define CHAR_q 'q'
786: #define CHAR_r 'r'
787: #define CHAR_s 's'
788: #define CHAR_t 't'
789: #define CHAR_u 'u'
790: #define CHAR_v 'v'
791: #define CHAR_w 'w'
792: #define CHAR_x 'x'
793: #define CHAR_y 'y'
794: #define CHAR_z 'z'
795: #define CHAR_LEFT_CURLY_BRACKET '{'
796: #define CHAR_VERTICAL_LINE '|'
797: #define CHAR_RIGHT_CURLY_BRACKET '}'
798: #define CHAR_TILDE '~'
799:
800: #define STR_HT "\t" /* One-character string forms of the control characters, written with C escapes so the compiler supplies the native code */
801: #define STR_VT "\v"
802: #define STR_FF "\f"
803: #define STR_CR "\r"
804: #define STR_NL "\n"
805: #define STR_BS "\b"
806: #define STR_BEL "\a"
807: #ifdef EBCDIC
808: #define STR_ESC "\047" /* octal 047 = 0x27, used for ESC when EBCDIC is defined */
809: #define STR_DEL "\007" /* octal 007 = 0x07, used for DEL when EBCDIC is defined */
810: #else
811: #define STR_ESC "\033" /* octal 033 = 0x1b */
812: #define STR_DEL "\177" /* octal 177 = 0x7f */
813: #endif
814:
815: #define STR_SPACE " " /* Printable characters as string literals, spelled literally (not as octal codes) in this branch */
816: #define STR_EXCLAMATION_MARK "!"
817: #define STR_QUOTATION_MARK "\""
818: #define STR_NUMBER_SIGN "#"
819: #define STR_DOLLAR_SIGN "$"
820: #define STR_PERCENT_SIGN "%"
821: #define STR_AMPERSAND "&"
822: #define STR_APOSTROPHE "'"
823: #define STR_LEFT_PARENTHESIS "("
824: #define STR_RIGHT_PARENTHESIS ")"
825: #define STR_ASTERISK "*"
826: #define STR_PLUS "+"
827: #define STR_COMMA ","
828: #define STR_MINUS "-"
829: #define STR_DOT "."
830: #define STR_SLASH "/"
831: #define STR_0 "0"
832: #define STR_1 "1"
833: #define STR_2 "2"
834: #define STR_3 "3"
835: #define STR_4 "4"
836: #define STR_5 "5"
837: #define STR_6 "6"
838: #define STR_7 "7"
839: #define STR_8 "8"
840: #define STR_9 "9"
841: #define STR_COLON ":"
842: #define STR_SEMICOLON ";"
843: #define STR_LESS_THAN_SIGN "<"
844: #define STR_EQUALS_SIGN "="
845: #define STR_GREATER_THAN_SIGN ">"
846: #define STR_QUESTION_MARK "?"
847: #define STR_COMMERCIAL_AT "@"
848: #define STR_A "A"
849: #define STR_B "B"
850: #define STR_C "C"
851: #define STR_D "D"
852: #define STR_E "E"
853: #define STR_F "F"
854: #define STR_G "G"
855: #define STR_H "H"
856: #define STR_I "I"
857: #define STR_J "J"
858: #define STR_K "K"
859: #define STR_L "L"
860: #define STR_M "M"
861: #define STR_N "N"
862: #define STR_O "O"
863: #define STR_P "P"
864: #define STR_Q "Q"
865: #define STR_R "R"
866: #define STR_S "S"
867: #define STR_T "T"
868: #define STR_U "U"
869: #define STR_V "V"
870: #define STR_W "W"
871: #define STR_X "X"
872: #define STR_Y "Y"
873: #define STR_Z "Z"
874: #define STR_LEFT_SQUARE_BRACKET "["
875: #define STR_BACKSLASH "\\"
876: #define STR_RIGHT_SQUARE_BRACKET "]"
877: #define STR_CIRCUMFLEX_ACCENT "^"
878: #define STR_UNDERSCORE "_"
879: #define STR_GRAVE_ACCENT "`"
880: #define STR_a "a"
881: #define STR_b "b"
882: #define STR_c "c"
883: #define STR_d "d"
884: #define STR_e "e"
885: #define STR_f "f"
886: #define STR_g "g"
887: #define STR_h "h"
888: #define STR_i "i"
889: #define STR_j "j"
890: #define STR_k "k"
891: #define STR_l "l"
892: #define STR_m "m"
893: #define STR_n "n"
894: #define STR_o "o"
895: #define STR_p "p"
896: #define STR_q "q"
897: #define STR_r "r"
898: #define STR_s "s"
899: #define STR_t "t"
900: #define STR_u "u"
901: #define STR_v "v"
902: #define STR_w "w"
903: #define STR_x "x"
904: #define STR_y "y"
905: #define STR_z "z"
906: #define STR_LEFT_CURLY_BRACKET "{"
907: #define STR_VERTICAL_LINE "|"
908: #define STR_RIGHT_CURLY_BRACKET "}"
909: #define STR_TILDE "~"
910:
911: #define STRING_ACCEPT0 "ACCEPT\0" /* Names ending in 0 carry an explicit NUL escape after the text, in addition to the literal's own terminator */
912: #define STRING_COMMIT0 "COMMIT\0"
913: #define STRING_F0 "F\0"
914: #define STRING_FAIL0 "FAIL\0"
915: #define STRING_MARK0 "MARK\0"
916: #define STRING_PRUNE0 "PRUNE\0"
917: #define STRING_SKIP0 "SKIP\0"
918: #define STRING_THEN "THEN" /* No explicit NUL: presumably the final piece when these are concatenated into one list - confirm at the use site */
919:
920: #define STRING_alpha0 "alpha\0" /* Same convention as above: explicit NUL escape on every name except the last */
921: #define STRING_lower0 "lower\0"
922: #define STRING_upper0 "upper\0"
923: #define STRING_alnum0 "alnum\0"
924: #define STRING_ascii0 "ascii\0"
925: #define STRING_blank0 "blank\0"
926: #define STRING_cntrl0 "cntrl\0"
927: #define STRING_digit0 "digit\0"
928: #define STRING_graph0 "graph\0"
929: #define STRING_print0 "print\0"
930: #define STRING_punct0 "punct\0"
931: #define STRING_space0 "space\0"
932: #define STRING_word0 "word\0"
933: #define STRING_xdigit "xdigit" /* No explicit NUL on the last name of this group */
934:
935: #define STRING_DEFINE "DEFINE"
936:
937: #define STRING_CR_RIGHTPAR "CR)" /* The _RIGHTPAR strings each include the closing parenthesis */
938: #define STRING_LF_RIGHTPAR "LF)"
939: #define STRING_CRLF_RIGHTPAR "CRLF)"
940: #define STRING_ANY_RIGHTPAR "ANY)"
941: #define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)"
942: #define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)"
943: #define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)"
944: #define STRING_UTF8_RIGHTPAR "UTF8)"
945: #define STRING_UCP_RIGHTPAR "UCP)"
946: #define STRING_NO_START_OPT_RIGHTPAR "NO_START_OPT)"
947:
948: #else /* SUPPORT_UTF8 */
949:
950: /* UTF-8 support is enabled; always use UTF-8 (=ASCII) character codes. This
951: works in both modes on non-EBCDIC platforms, and on EBCDIC platforms in UTF-8
952: mode only. */
953:
954: #define CHAR_HT '\011' /* Control characters given as explicit octal codes rather than C escapes such as '\t' */
955: #define CHAR_VT '\013'
956: #define CHAR_FF '\014'
957: #define CHAR_CR '\015'
958: #define CHAR_NL '\012'
959: #define CHAR_BS '\010'
960: #define CHAR_BEL '\007'
961: #define CHAR_ESC '\033'
962: #define CHAR_DEL '\177'
963:
964: #define CHAR_SPACE '\040' /* Printable characters: octal 040 (0x20) through 176 (0x7e), listed in code order with no gaps */
965: #define CHAR_EXCLAMATION_MARK '\041'
966: #define CHAR_QUOTATION_MARK '\042'
967: #define CHAR_NUMBER_SIGN '\043'
968: #define CHAR_DOLLAR_SIGN '\044'
969: #define CHAR_PERCENT_SIGN '\045'
970: #define CHAR_AMPERSAND '\046'
971: #define CHAR_APOSTROPHE '\047'
972: #define CHAR_LEFT_PARENTHESIS '\050'
973: #define CHAR_RIGHT_PARENTHESIS '\051'
974: #define CHAR_ASTERISK '\052'
975: #define CHAR_PLUS '\053'
976: #define CHAR_COMMA '\054'
977: #define CHAR_MINUS '\055'
978: #define CHAR_DOT '\056'
979: #define CHAR_SLASH '\057'
980: #define CHAR_0 '\060'
981: #define CHAR_1 '\061'
982: #define CHAR_2 '\062'
983: #define CHAR_3 '\063'
984: #define CHAR_4 '\064'
985: #define CHAR_5 '\065'
986: #define CHAR_6 '\066'
987: #define CHAR_7 '\067'
988: #define CHAR_8 '\070'
989: #define CHAR_9 '\071'
990: #define CHAR_COLON '\072'
991: #define CHAR_SEMICOLON '\073'
992: #define CHAR_LESS_THAN_SIGN '\074'
993: #define CHAR_EQUALS_SIGN '\075'
994: #define CHAR_GREATER_THAN_SIGN '\076'
995: #define CHAR_QUESTION_MARK '\077'
996: #define CHAR_COMMERCIAL_AT '\100'
997: #define CHAR_A '\101'
998: #define CHAR_B '\102'
999: #define CHAR_C '\103'
1000: #define CHAR_D '\104'
1001: #define CHAR_E '\105'
1002: #define CHAR_F '\106'
1003: #define CHAR_G '\107'
1004: #define CHAR_H '\110'
1005: #define CHAR_I '\111'
1006: #define CHAR_J '\112'
1007: #define CHAR_K '\113'
1008: #define CHAR_L '\114'
1009: #define CHAR_M '\115'
1010: #define CHAR_N '\116'
1011: #define CHAR_O '\117'
1012: #define CHAR_P '\120'
1013: #define CHAR_Q '\121'
1014: #define CHAR_R '\122'
1015: #define CHAR_S '\123'
1016: #define CHAR_T '\124'
1017: #define CHAR_U '\125'
1018: #define CHAR_V '\126'
1019: #define CHAR_W '\127'
1020: #define CHAR_X '\130'
1021: #define CHAR_Y '\131'
1022: #define CHAR_Z '\132'
1023: #define CHAR_LEFT_SQUARE_BRACKET '\133'
1024: #define CHAR_BACKSLASH '\134'
1025: #define CHAR_RIGHT_SQUARE_BRACKET '\135'
1026: #define CHAR_CIRCUMFLEX_ACCENT '\136'
1027: #define CHAR_UNDERSCORE '\137'
1028: #define CHAR_GRAVE_ACCENT '\140'
1029: #define CHAR_a '\141'
1030: #define CHAR_b '\142'
1031: #define CHAR_c '\143'
1032: #define CHAR_d '\144'
1033: #define CHAR_e '\145'
1034: #define CHAR_f '\146'
1035: #define CHAR_g '\147'
1036: #define CHAR_h '\150'
1037: #define CHAR_i '\151'
1038: #define CHAR_j '\152'
1039: #define CHAR_k '\153'
1040: #define CHAR_l '\154'
1041: #define CHAR_m '\155'
1042: #define CHAR_n '\156'
1043: #define CHAR_o '\157'
1044: #define CHAR_p '\160'
1045: #define CHAR_q '\161'
1046: #define CHAR_r '\162'
1047: #define CHAR_s '\163'
1048: #define CHAR_t '\164'
1049: #define CHAR_u '\165'
1050: #define CHAR_v '\166'
1051: #define CHAR_w '\167'
1052: #define CHAR_x '\170'
1053: #define CHAR_y '\171'
1054: #define CHAR_z '\172'
1055: #define CHAR_LEFT_CURLY_BRACKET '\173'
1056: #define CHAR_VERTICAL_LINE '\174'
1057: #define CHAR_RIGHT_CURLY_BRACKET '\175'
1058: #define CHAR_TILDE '\176'
1059:
1060: #define STR_HT "\011" /* One-character string forms; each uses the same octal code as the CHAR_ macro of the same name in this branch */
1061: #define STR_VT "\013"
1062: #define STR_FF "\014"
1063: #define STR_CR "\015"
1064: #define STR_NL "\012"
1065: #define STR_BS "\010"
1066: #define STR_BEL "\007"
1067: #define STR_ESC "\033"
1068: #define STR_DEL "\177"
1069:
1070: #define STR_SPACE "\040" /* Printable characters: octal 040 through 176, in code order */
1071: #define STR_EXCLAMATION_MARK "\041"
1072: #define STR_QUOTATION_MARK "\042"
1073: #define STR_NUMBER_SIGN "\043"
1074: #define STR_DOLLAR_SIGN "\044"
1075: #define STR_PERCENT_SIGN "\045"
1076: #define STR_AMPERSAND "\046"
1077: #define STR_APOSTROPHE "\047"
1078: #define STR_LEFT_PARENTHESIS "\050"
1079: #define STR_RIGHT_PARENTHESIS "\051"
1080: #define STR_ASTERISK "\052"
1081: #define STR_PLUS "\053"
1082: #define STR_COMMA "\054"
1083: #define STR_MINUS "\055"
1084: #define STR_DOT "\056"
1085: #define STR_SLASH "\057"
1086: #define STR_0 "\060"
1087: #define STR_1 "\061"
1088: #define STR_2 "\062"
1089: #define STR_3 "\063"
1090: #define STR_4 "\064"
1091: #define STR_5 "\065"
1092: #define STR_6 "\066"
1093: #define STR_7 "\067"
1094: #define STR_8 "\070"
1095: #define STR_9 "\071"
1096: #define STR_COLON "\072"
1097: #define STR_SEMICOLON "\073"
1098: #define STR_LESS_THAN_SIGN "\074"
1099: #define STR_EQUALS_SIGN "\075"
1100: #define STR_GREATER_THAN_SIGN "\076"
1101: #define STR_QUESTION_MARK "\077"
1102: #define STR_COMMERCIAL_AT "\100"
1103: #define STR_A "\101"
1104: #define STR_B "\102"
1105: #define STR_C "\103"
1106: #define STR_D "\104"
1107: #define STR_E "\105"
1108: #define STR_F "\106"
1109: #define STR_G "\107"
1110: #define STR_H "\110"
1111: #define STR_I "\111"
1112: #define STR_J "\112"
1113: #define STR_K "\113"
1114: #define STR_L "\114"
1115: #define STR_M "\115"
1116: #define STR_N "\116"
1117: #define STR_O "\117"
1118: #define STR_P "\120"
1119: #define STR_Q "\121"
1120: #define STR_R "\122"
1121: #define STR_S "\123"
1122: #define STR_T "\124"
1123: #define STR_U "\125"
1124: #define STR_V "\126"
1125: #define STR_W "\127"
1126: #define STR_X "\130"
1127: #define STR_Y "\131"
1128: #define STR_Z "\132"
1129: #define STR_LEFT_SQUARE_BRACKET "\133"
1130: #define STR_BACKSLASH "\134"
1131: #define STR_RIGHT_SQUARE_BRACKET "\135"
1132: #define STR_CIRCUMFLEX_ACCENT "\136"
1133: #define STR_UNDERSCORE "\137"
1134: #define STR_GRAVE_ACCENT "\140"
1135: #define STR_a "\141"
1136: #define STR_b "\142"
1137: #define STR_c "\143"
1138: #define STR_d "\144"
1139: #define STR_e "\145"
1140: #define STR_f "\146"
1141: #define STR_g "\147"
1142: #define STR_h "\150"
1143: #define STR_i "\151"
1144: #define STR_j "\152"
1145: #define STR_k "\153"
1146: #define STR_l "\154"
1147: #define STR_m "\155"
1148: #define STR_n "\156"
1149: #define STR_o "\157"
1150: #define STR_p "\160"
1151: #define STR_q "\161"
1152: #define STR_r "\162"
1153: #define STR_s "\163"
1154: #define STR_t "\164"
1155: #define STR_u "\165"
1156: #define STR_v "\166"
1157: #define STR_w "\167"
1158: #define STR_x "\170"
1159: #define STR_y "\171"
1160: #define STR_z "\172"
1161: #define STR_LEFT_CURLY_BRACKET "\173"
1162: #define STR_VERTICAL_LINE "\174"
1163: #define STR_RIGHT_CURLY_BRACKET "\175"
1164: #define STR_TILDE "\176"
1165:
1166: #define STRING_ACCEPT0 STR_A STR_C STR_C STR_E STR_P STR_T "\0" /* Built by adjacent string-literal concatenation of STR_ pieces; names ending in 0 append an explicit NUL escape */
1167: #define STRING_COMMIT0 STR_C STR_O STR_M STR_M STR_I STR_T "\0"
1168: #define STRING_F0 STR_F "\0"
1169: #define STRING_FAIL0 STR_F STR_A STR_I STR_L "\0"
1170: #define STRING_MARK0 STR_M STR_A STR_R STR_K "\0"
1171: #define STRING_PRUNE0 STR_P STR_R STR_U STR_N STR_E "\0"
1172: #define STRING_SKIP0 STR_S STR_K STR_I STR_P "\0"
1173: #define STRING_THEN STR_T STR_H STR_E STR_N /* No explicit NUL: presumably the final piece when these are concatenated into one list - confirm at the use site */
1174:
1175: #define STRING_alpha0 STR_a STR_l STR_p STR_h STR_a "\0" /* Same convention: explicit NUL escape on every name except the last */
1176: #define STRING_lower0 STR_l STR_o STR_w STR_e STR_r "\0"
1177: #define STRING_upper0 STR_u STR_p STR_p STR_e STR_r "\0"
1178: #define STRING_alnum0 STR_a STR_l STR_n STR_u STR_m "\0"
1179: #define STRING_ascii0 STR_a STR_s STR_c STR_i STR_i "\0"
1180: #define STRING_blank0 STR_b STR_l STR_a STR_n STR_k "\0"
1181: #define STRING_cntrl0 STR_c STR_n STR_t STR_r STR_l "\0"
1182: #define STRING_digit0 STR_d STR_i STR_g STR_i STR_t "\0"
1183: #define STRING_graph0 STR_g STR_r STR_a STR_p STR_h "\0"
1184: #define STRING_print0 STR_p STR_r STR_i STR_n STR_t "\0"
1185: #define STRING_punct0 STR_p STR_u STR_n STR_c STR_t "\0"
1186: #define STRING_space0 STR_s STR_p STR_a STR_c STR_e "\0"
1187: #define STRING_word0 STR_w STR_o STR_r STR_d "\0"
1188: #define STRING_xdigit STR_x STR_d STR_i STR_g STR_i STR_t /* No explicit NUL on the last name of this group */
1189:
1190: #define STRING_DEFINE STR_D STR_E STR_F STR_I STR_N STR_E
1191:
1192: #define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS /* The _RIGHTPAR strings each end with the closing parenthesis */
1193: #define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS
1194: #define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
1195: #define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS
1196: #define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
1197: #define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
1198: #define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS
1199: #define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS
1200: #define STRING_UCP_RIGHTPAR STR_U STR_C STR_P STR_RIGHT_PARENTHESIS
1201: #define STRING_NO_START_OPT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS
1202:
1203: #endif /* SUPPORT_UTF8 */
1204:
1205: /* Escape items that are just an encoding of a particular data value. */
1206:
1207: #ifndef ESC_e /* Each ESC_ value below is only a default: the #ifndef lets an earlier definition take precedence */
1208: #define ESC_e CHAR_ESC /* \e encodes the escape character */
1209: #endif
1210:
1211: #ifndef ESC_f
1212: #define ESC_f CHAR_FF /* \f encodes form feed */
1213: #endif
1214:
1215: #ifndef ESC_n
1216: #define ESC_n CHAR_NL /* \n encodes newline */
1217: #endif
1218:
1219: #ifndef ESC_r
1220: #define ESC_r CHAR_CR /* \r encodes carriage return */
1221: #endif
1222:
1223: /* We can't officially use ESC_t because it is a POSIX reserved identifier
1224: (presumably because of all the others like size_t). */
1225:
1226: #ifndef ESC_tee
1227: #define ESC_tee CHAR_HT /* \t encodes horizontal tab; named ESC_tee for the reason given above */
1228: #endif
1229:
1230: /* Codes for different types of Unicode property */
1231:
1232: #define PT_ANY 0 /* Any property - matches all chars */
1233: #define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */
1234: #define PT_GC 2 /* Specified general characteristic (e.g. L) */
1235: #define PT_PC 3 /* Specified particular characteristic (e.g. Lu) */
1236: #define PT_SC 4 /* Script (e.g. Han) */
1237: #define PT_ALNUM 5 /* Alphanumeric - the union of L and N */
1238: #define PT_SPACE 6 /* Perl space - Z plus 9,10,12,13 (HT, NL, FF, CR; VT is not included) */
1239: #define PT_PXSPACE 7 /* POSIX space - Z plus 9,10,11,12,13 (HT, NL, VT, FF, CR) */
1240: #define PT_WORD 8 /* Word - L plus N plus underscore */
1241:
1242: /* Flag bits and data types for the extended class (OP_XCLASS) for classes that
1243: contain UTF-8 characters with values greater than 255. */
1244:
1245: #define XCL_NOT 0x01 /* Flag bit: this is a negative class */
1246: #define XCL_MAP 0x02 /* Flag bit: a 32-byte (256-bit) map is present */
1247:
1248: #define XCL_END 0 /* Item type: marks end of individual items */
1249: #define XCL_SINGLE 1 /* Item type: single item (one multibyte char) follows */
1250: #define XCL_RANGE 2 /* Item type: a range (two multibyte chars) follows */
1251: #define XCL_PROP 3 /* Item type: Unicode property (2-byte property code follows) */
1252: #define XCL_NOTPROP 4 /* Item type: Unicode inverted property (ditto) */
1253:
1254: /* These are escaped items that aren't just an encoding of a particular data
1255: value such as \n. They must have non-zero values, as check_escape() returns
1256: their negation. Also, they must appear in the same order as in the opcode
1257: definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it
1258: corresponds to "." in DOTALL mode rather than an escape sequence. It is also
1259: used for [^] in JavaScript compatibility mode. In non-DOTALL mode, "." behaves
1260: like \N.
1261:
1262: The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.
1263: when PCRE_UCP is set, when replacement of \d etc by \p sequences is required.
1264: They must be contiguous, and remain in order so that the replacements can be
1265: looked up from a table.
1266:
1267: The final escape must be ESC_REF as subsequent values are used for
1268: backreferences (\1, \2, \3, etc). There are two tests in the code for an escape
1269: greater than ESC_b and less than ESC_Z to detect the types that may be
1270: repeated. These are the types that consume characters. If any new escapes are
1271: put in between that don't consume a character, that code will have to change.
1272: */
1273:
1274: enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, /* values 1..9 */
1275: ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H, /* values 10..18; ESC_dum (13) is the placeholder that lines up with OP_ALLANY */
1276: ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, /* values 19..24; ESC_z (24) lines up with OP_EOD */
1277: ESC_E, ESC_Q, ESC_g, ESC_k, /* values 25..28; past ESC_z, so outside the range that must mirror the opcodes */
1278: ESC_DU, ESC_du, ESC_SU, ESC_su, ESC_WU, ESC_wu, /* values 29..34; PCRE_UCP variants, kept contiguous and in order */
1279: ESC_REF }; /* value 35; must stay last because larger values denote back references */
1280:
1281: /* Opcode table: Starting from 1 (i.e. after OP_END), the values up to
1282: OP_EOD must correspond in order to the list of escapes immediately above.
1283:
1284: *** NOTE NOTE NOTE *** Whenever this list is updated, the two macro definitions
1285: that follow must also be updated to match. There are also tables called
1286: "coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */
1287:
1288: enum { /* No explicit initializers: values run sequentially from 0, and each comment starts with the resulting value */
1289: OP_END, /* 0 End of pattern */
1290:
1291: /* Values corresponding to backslashed metacharacters */
1292:
1293: OP_SOD, /* 1 Start of data: \A */
1294: OP_SOM, /* 2 Start of match (subject + offset): \G */
1295: OP_SET_SOM, /* 3 Set start of match (\K) */
1296: OP_NOT_WORD_BOUNDARY, /* 4 \B */
1297: OP_WORD_BOUNDARY, /* 5 \b */
1298: OP_NOT_DIGIT, /* 6 \D */
1299: OP_DIGIT, /* 7 \d */
1300: OP_NOT_WHITESPACE, /* 8 \S */
1301: OP_WHITESPACE, /* 9 \s */
1302: OP_NOT_WORDCHAR, /* 10 \W */
1303: OP_WORDCHAR, /* 11 \w */
1304: OP_ANY, /* 12 Match any character except newline */
1305: OP_ALLANY, /* 13 Match any character (ESC_dum is its placeholder in the escape list) */
1306: OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */
1307: OP_NOTPROP, /* 15 \P (not Unicode property) */
1308: OP_PROP, /* 16 \p (Unicode property) */
1309: OP_ANYNL, /* 17 \R (any newline sequence) */
1310: OP_NOT_HSPACE, /* 18 \H (not horizontal whitespace) */
1311: OP_HSPACE, /* 19 \h (horizontal whitespace) */
1312: OP_NOT_VSPACE, /* 20 \V (not vertical whitespace) */
1313: OP_VSPACE, /* 21 \v (vertical whitespace) */
1314: OP_EXTUNI, /* 22 \X (extended Unicode sequence) */
1315: OP_EODN, /* 23 End of data or \n at end of data: \Z. */
1316: OP_EOD, /* 24 End of data: \z */
1317:
1318: OP_OPT, /* 25 Set runtime options */
1319: OP_CIRC, /* 26 Start of line - varies with multiline switch */
1320: OP_DOLL, /* 27 End of line - varies with multiline switch */
1321: OP_CHAR, /* 28 Match one character, casefully */
1322: OP_CHARNC, /* 29 Match one character, caselessly */
1323: OP_NOT, /* 30 Match one character, not the following one */
1324:
1325: OP_STAR, /* 31 The maximizing and minimizing versions of */
1326: OP_MINSTAR, /* 32 these six opcodes must come in pairs, with */
1327: OP_PLUS, /* 33 the minimizing one second. */
1328: OP_MINPLUS, /* 34 This first set applies to single characters.*/
1329: OP_QUERY, /* 35 */
1330: OP_MINQUERY, /* 36 */
1331:
1332: OP_UPTO, /* 37 From 0 to n matches */
1333: OP_MINUPTO, /* 38 */
1334: OP_EXACT, /* 39 Exactly n matches */
1335:
1336: OP_POSSTAR, /* 40 Possessified star */
1337: OP_POSPLUS, /* 41 Possessified plus */
1338: OP_POSQUERY, /* 42 Possessified query */
1339: OP_POSUPTO, /* 43 Possessified upto */
1340:
1341: OP_NOTSTAR, /* 44 The maximizing and minimizing versions of */
1342: OP_NOTMINSTAR, /* 45 these six opcodes must come in pairs, with */
1343: OP_NOTPLUS, /* 46 the minimizing one second. They must be in */
1344: OP_NOTMINPLUS, /* 47 exactly the same order as those above. */
1345: OP_NOTQUERY, /* 48 This set applies to "not" single characters. */
1346: OP_NOTMINQUERY, /* 49 */
1347:
1348: OP_NOTUPTO, /* 50 From 0 to n matches */
1349: OP_NOTMINUPTO, /* 51 */
1350: OP_NOTEXACT, /* 52 Exactly n matches */
1351:
1352: OP_NOTPOSSTAR, /* 53 Possessified versions */
1353: OP_NOTPOSPLUS, /* 54 */
1354: OP_NOTPOSQUERY, /* 55 */
1355: OP_NOTPOSUPTO, /* 56 */
1356:
1357: OP_TYPESTAR, /* 57 The maximizing and minimizing versions of */
1358: OP_TYPEMINSTAR, /* 58 these six opcodes must come in pairs, with */
1359: OP_TYPEPLUS, /* 59 the minimizing one second. These codes must */
1360: OP_TYPEMINPLUS, /* 60 be in exactly the same order as those above. */
1361: OP_TYPEQUERY, /* 61 This set applies to character types such as \d */
1362: OP_TYPEMINQUERY, /* 62 */
1363:
1364: OP_TYPEUPTO, /* 63 From 0 to n matches */
1365: OP_TYPEMINUPTO, /* 64 */
1366: OP_TYPEEXACT, /* 65 Exactly n matches */
1367:
1368: OP_TYPEPOSSTAR, /* 66 Possessified versions */
1369: OP_TYPEPOSPLUS, /* 67 */
1370: OP_TYPEPOSQUERY, /* 68 */
1371: OP_TYPEPOSUPTO, /* 69 */
1372:
1373: OP_CRSTAR, /* 70 The maximizing and minimizing versions of */
1374: OP_CRMINSTAR, /* 71 all these opcodes must come in pairs, with */
1375: OP_CRPLUS, /* 72 the minimizing one second. These codes must */
1376: OP_CRMINPLUS, /* 73 be in exactly the same order as those above. */
1377: OP_CRQUERY, /* 74 These are for character classes and back refs */
1378: OP_CRMINQUERY, /* 75 */
1379: OP_CRRANGE, /* 76 These are different to the three sets above. */
1380: OP_CRMINRANGE, /* 77 */
1381:
1382: OP_CLASS, /* 78 Match a character class, chars < 256 only */
1383: OP_NCLASS, /* 79 Same, but the bitmap was created from a negative
1384: class - the difference is relevant only when a UTF-8
1385: character > 255 is encountered. */
1386:
1387: OP_XCLASS, /* 80 Extended class for handling UTF-8 chars within the
1388: class. This does both positive and negative. */
1389:
1390: OP_REF, /* 81 Match a back reference */
1391: OP_RECURSE, /* 82 Match a numbered subpattern (possibly recursive) */
1392: OP_CALLOUT, /* 83 Call out to external function if provided */
1393:
1394: OP_ALT, /* 84 Start of alternation */
1395: OP_KET, /* 85 End of group that doesn't have an unbounded repeat */
1396: OP_KETRMAX, /* 86 These two must remain together and in this */
1397: OP_KETRMIN, /* 87 order. They are for groups that repeat for ever. */
1398:
1399: /* The assertions must come before BRA, CBRA, ONCE, and COND.*/
1400:
1401: OP_ASSERT, /* 88 Positive lookahead */
1402: OP_ASSERT_NOT, /* 89 Negative lookahead */
1403: OP_ASSERTBACK, /* 90 Positive lookbehind */
1404: OP_ASSERTBACK_NOT, /* 91 Negative lookbehind */
1405: OP_REVERSE, /* 92 Move pointer back - used in lookbehind assertions */
1406:
1407: /* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE first,
1408: as there's a test for >= ONCE for a subpattern that isn't an assertion. */
1409:
1410: OP_ONCE, /* 93 Atomic group */
1411: OP_BRA, /* 94 Start of non-capturing bracket */
1412: OP_CBRA, /* 95 Start of capturing bracket */
1413: OP_COND, /* 96 Conditional group */
1414:
1415: /* These three must follow the previous three, in the same order. There's a
1416: check for >= SBRA to distinguish the two sets. */
1417:
1418: OP_SBRA, /* 97 Start of non-capturing bracket, check empty */
1419: OP_SCBRA, /* 98 Start of capturing bracket, check empty */
1420: OP_SCOND, /* 99 Conditional group, check empty */
1421:
1422: /* The next two pairs must (respectively) be kept together. */
1423:
1424: OP_CREF, /* 100 Used to hold a capture number as condition */
1425: OP_NCREF, /* 101 Same, but generated by a name reference */
1426: OP_RREF, /* 102 Used to hold a recursion number as condition */
1427: OP_NRREF, /* 103 Same, but generated by a name reference */
1428: OP_DEF, /* 104 The DEFINE condition */
1429:
1430: OP_BRAZERO, /* 105 These two must remain together and in this */
1431: OP_BRAMINZERO, /* 106 order. */
1432:
1433: /* These are backtracking control verbs */
1434:
1435: OP_MARK, /* 107 always has an argument */
1436: OP_PRUNE, /* 108 */
1437: OP_PRUNE_ARG, /* 109 same, but with argument */
1438: OP_SKIP, /* 110 */
1439: OP_SKIP_ARG, /* 111 same, but with argument */
1440: OP_THEN, /* 112 */
1441: OP_THEN_ARG, /* 113 same, but with argument */
1442: OP_COMMIT, /* 114 */
1443:
1444: /* These are forced failure and success verbs */
1445:
1446: OP_FAIL, /* 115 */
1447: OP_ACCEPT, /* 116 */
1448: OP_CLOSE, /* 117 Used before OP_ACCEPT to close open captures */
1449:
1450: /* This is used to skip a subpattern with a {0} quantifier */
1451:
1452: OP_SKIPZERO, /* 118 */
1453:
1454: /* This is not an opcode, but is used to check that tables indexed by opcode
1455: are the correct length, in order to catch updating errors - there have been
1456: some in the past. */
1457:
1458: OP_TABLE_LENGTH /* 119 One past OP_SKIPZERO, i.e. the number of opcodes */
1459: };
1460:
1461: /* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro
1462: definitions that follow must also be updated to match. There are also tables
1463: called "coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */
1464:
1465:
1466: /* This macro defines textual names for all the opcodes. These are used only
1467: for debugging. The macro is referenced only in pcre_printint.c. */
1468:
1469: #define OP_NAME_LIST \
1470: "End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d", /* opcodes 0-7 */ \
1471: "\\S", "\\s", "\\W", "\\w", "Any", "AllAny", "Anybyte", /* 8-14 */ \
1472: "notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v", /* 15-21 */ \
1473: "extuni", "\\Z", "\\z", /* 22-24 */ \
1474: "Opt", "^", "$", "char", "charnc", "not", /* 25-30 */ \
1475: "*", "*?", "+", "+?", "?", "??", "{", "{", "{", /* 31-39: OP_STAR .. OP_EXACT */ \
1476: "*+","++", "?+", "{", /* 40-43: OP_POSSTAR .. OP_POSUPTO */ \
1477: "*", "*?", "+", "+?", "?", "??", "{", "{", "{", /* 44-52: OP_NOTSTAR .. OP_NOTEXACT */ \
1478: "*+","++", "?+", "{", /* 53-56: OP_NOTPOSSTAR .. OP_NOTPOSUPTO */ \
1479: "*", "*?", "+", "+?", "?", "??", "{", "{", "{", /* 57-65: OP_TYPESTAR .. OP_TYPEEXACT */ \
1480: "*+","++", "?+", "{", /* 66-69: OP_TYPEPOSSTAR .. OP_TYPEPOSUPTO */ \
1481: "*", "*?", "+", "+?", "?", "??", "{", "{", /* 70-77: OP_CRSTAR .. OP_CRMINRANGE */ \
1482: "class", "nclass", "xclass", "Ref", "Recurse", "Callout", /* 78-83 */ \
1483: "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", /* 84-89 */ \
1484: "AssertB", "AssertB not", "Reverse", /* 90-92 */ \
1485: "Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", /* 93-99 */ \
1486: "Cond ref", "Cond nref", "Cond rec", "Cond nrec", "Cond def", /* 100-104 */ \
1487: "Brazero", "Braminzero", /* 105-106 */ \
1488: "*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", /* 107-111: the _ARG variants share the plain verb's name */ \
1489: "*THEN", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", /* 112-116 */ \
1490: "Close", "Skip zero" /* 117-118; 119 names in total, matching OP_TABLE_LENGTH */
1491:
1492:
1493: /* This macro defines the length of fixed length operations in the compiled
1494: regex. The lengths are used when searching for specific things, and also in the
1495: debugging printing of a compiled regex. We use a macro so that it can be
1496: defined close to the definitions of the opcodes themselves.
1497:
1498: As things have been extended, some of these are no longer fixed lengths, but are
1499: minima instead. For example, the length of a single-character repeat may vary
1500: in UTF-8 mode. The code that uses this table must know about such things. */
1501:
1502: #define OP_LENGTHS \
1503: 1, /* End */ \
1504: 1, 1, 1, 1, 1, /* \A, \G, \K, \B, \b */ \
1505: 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ \
1506: 1, 1, 1, /* Any, AllAny, Anybyte */ \
1507: 3, 3, /* \P, \p */ \
1508: 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ \
1509: 1, /* \X */ \
1510: 1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \
1511: 2, /* Char - the minimum length */ \
1512: 2, /* Charnc - the minimum length */ \
1513: 2, /* not */ \
1514: /* Positive single-char repeats ** These are */ \
1515: 2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \
1516: 4, 4, 4, /* upto, minupto, exact ** UTF-8 mode */ \
1517: 2, 2, 2, 4, /* *+, ++, ?+, upto+ */ \
1518: /* Negative single-char repeats - only for chars < 256 */ \
1519: 2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \
1520: 4, 4, 4, /* NOT upto, minupto, exact */ \
1521: 2, 2, 2, 4, /* NOT possessive *+, ++, ?+, upto+ */ \
1522: /* Positive type repeats */ \
1523: 2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \
1524: 4, 4, 4, /* Type upto, minupto, exact */ \
1525: 2, 2, 2, 4, /* Possessive *+, ++, ?+, upto+ */ \
1526: /* Character class & ref repeats */ \
1527: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \
1528: 5, 5, /* CRRANGE, CRMINRANGE */ \
1529: 33, /* CLASS - presumably 1 opcode byte + 32-byte bitmap; confirm */ \
1530: 33, /* NCLASS - same size as CLASS */ \
1531: 0, /* XCLASS - variable length, so no fixed size is listed */ \
1532: 3, /* REF */ \
1533: 1+LINK_SIZE, /* RECURSE */ \
1534: 2+2*LINK_SIZE, /* CALLOUT */ \
1535: 1+LINK_SIZE, /* Alt */ \
1536: 1+LINK_SIZE, /* Ket */ \
1537: 1+LINK_SIZE, /* KetRmax */ \
1538: 1+LINK_SIZE, /* KetRmin */ \
1539: 1+LINK_SIZE, /* Assert */ \
1540: 1+LINK_SIZE, /* Assert not */ \
1541: 1+LINK_SIZE, /* Assert behind */ \
1542: 1+LINK_SIZE, /* Assert behind not */ \
1543: 1+LINK_SIZE, /* Reverse */ \
1544: 1+LINK_SIZE, /* ONCE */ \
1545: 1+LINK_SIZE, /* BRA */ \
1546: 3+LINK_SIZE, /* CBRA */ \
1547: 1+LINK_SIZE, /* COND */ \
1548: 1+LINK_SIZE, /* SBRA */ \
1549: 3+LINK_SIZE, /* SCBRA */ \
1550: 1+LINK_SIZE, /* SCOND */ \
1551: 3, 3, /* CREF, NCREF */ \
1552: 3, 3, /* RREF, NRREF */ \
1553: 1, /* DEF */ \
1554: 1, 1, /* BRAZERO, BRAMINZERO */ \
1555: 3, 1, 3, /* MARK, PRUNE, PRUNE_ARG */ \
1556: 1, 3, /* SKIP, SKIP_ARG */ \
1557: 1+LINK_SIZE, 3+LINK_SIZE, /* THEN, THEN_ARG */ \
1558: 1, 1, 1, 3, 1 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO - 119 entries in total */
1559:
1560:
1561: /* A magic value for OP_RREF and OP_NRREF to indicate the "any recursion"
1562: condition. */
1563:
1564: #define RREF_ANY 0xffff /* All-ones 16-bit value; RREF and NRREF have length 3 in OP_LENGTHS, presumably opcode + 2-byte operand - confirm */
1565:
1566: /* Compile time error code numbers. They are given names so that they can more
1567: easily be tracked. When a new number is added, the table called eint in
1568: pcreposix.c must be updated. */
1569:
1570: enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, /* ERRn has the value n: no explicit initializers, no gaps */
1571: ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19,
1572: ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29,
1573: ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
1574: ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
1575: ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
1576: ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68,
1577: ERRCOUNT }; /* = 69, one past ERR68; add new codes immediately before it */
1578:
1579: /* The real format of the start of the pcre block; the index of names and the
1580: code vector run on as long as necessary after the end. We store an explicit
1581: offset to the name table so that if a regex is compiled on one host, saved, and
1582: then run on another where the size of pointers is different, all might still
1583: be well. For the case of compiled-on-4 and run-on-8, we include an extra
1584: pointer that is always NULL. For future-proofing, a few dummy fields were
1585: originally included - even though you can never get this planning right - but
1586: there is only one left now.
1587:
1588: NOTE NOTE NOTE:
1589: Because people can now save and re-use compiled patterns, any additions to this
1590: structure should be made at the end, and something earlier (e.g. a new
1591: flag in the options or one of the dummy fields) should indicate that the new
1592: fields are present. Currently PCRE always sets the dummy fields to zero.
1593: NOTE NOTE NOTE
1594: */
1595:
1596: typedef struct real_pcre { /* Saved compiled patterns embed this layout, so new fields go only at the end */
1597: pcre_uint32 magic_number;
1598: pcre_uint32 size; /* Total that was malloced */
1599: pcre_uint32 options; /* Public options */
1600: pcre_uint16 flags; /* Private flags */
1601: pcre_uint16 dummy1; /* For future use; currently always set to zero */
1602: pcre_uint16 top_bracket;
1603: pcre_uint16 top_backref;
1604: pcre_uint16 first_byte;
1605: pcre_uint16 req_byte;
1606: pcre_uint16 name_table_offset; /* Offset to name table that follows */
1607: pcre_uint16 name_entry_size; /* Size of any name items */
1608: pcre_uint16 name_count; /* Number of name items */
1609: pcre_uint16 ref_count; /* Reference count */
1610:
1611: const unsigned char *tables; /* Pointer to tables or NULL for std */
1612: const unsigned char *nullpad; /* Always NULL: the extra pointer for the compiled-on-4, run-on-8 case */
1613: } real_pcre;
1614:
1615: /* The format of the block used to store data from pcre_study(). The same
1616: remark (see NOTE above) about extending this structure applies. */
1617:
1618: typedef struct pcre_study_data { /* Result block of pcre_study(); extend only at the end, as for real_pcre */
1619: pcre_uint32 size; /* Total that was malloced */
1620: pcre_uint32 flags; /* Private flags */
1621: uschar start_bits[32]; /* Starting char bits; 32 bytes, presumably one bit per possible byte value - confirm */
1622: pcre_uint32 minlength; /* Minimum subject length */
1623: } pcre_study_data;
1624:
1625: /* Structure for building a chain of open capturing subpatterns during
1626: compiling, so that instructions to close them can be compiled when (*ACCEPT) is
1627: encountered. This is also used to identify subpatterns that contain recursive
1628: back references to themselves, so that they can be made atomic. */
1629:
1630: typedef struct open_capitem { /* One entry in the chain of capturing groups open during compiling */
1631: struct open_capitem *next; /* Chain link */
1632: pcre_uint16 number; /* Capture number */
1633: pcre_uint16 flag; /* Set TRUE if recursive back ref (held in a pcre_uint16, not a BOOL) */
1634: } open_capitem;
1635:
1636: /* Structure for passing "static" information around between the functions
1637: doing the compiling, so that they are thread-safe. */
1638:
1639: typedef struct compile_data { /* Compile-time state handed between the compiling functions in place of static variables */
1640: const uschar *lcc; /* Points to lower casing table */
1641: const uschar *fcc; /* Points to case-flipping table */
1642: const uschar *cbits; /* Points to character type table */
1643: const uschar *ctypes; /* Points to table of type maps */
1644: const uschar *start_workspace;/* The start of working space */
1645: const uschar *start_code; /* The start of the compiled code */
1646: const uschar *start_pattern; /* The start of the pattern */
1647: const uschar *end_pattern; /* The end of the pattern */
1648: open_capitem *open_caps; /* Chain of open capture items */
1649: uschar *hwm; /* High watermark of workspace */
1650: uschar *name_table; /* The name/number table */
1651: int names_found; /* Number of entries so far */
1652: int name_entry_size; /* Size of each entry */
1653: int bracount; /* Count of capturing parens as we compile */
1654: int final_bracount; /* Saved value after first pass */
1655: int top_backref; /* Maximum back reference */
1656: unsigned int backref_map; /* Bitmap of low back refs */
1657: int external_options; /* External (initial) options */
1658: int external_flags; /* External flag bits to be set */
1659: int req_varyopt; /* "After variable item" flag for reqbyte */
1660: BOOL had_accept; /* (*ACCEPT) encountered */
1661: BOOL check_lookbehind; /* Lookbehinds need later checking */
1662: int nltype; /* Newline type */
1663: int nllen; /* Newline string length */
1664: uschar nl[4]; /* Newline string when fixed length; room for 4 bytes */
1665: } compile_data;
1666:
1667: /* Structure for maintaining a chain of pointers to the currently incomplete
1668: branches, for testing for left recursion. */
1669:
1670: typedef struct branch_chain {
1671: struct branch_chain *outer;
1672: uschar *current_branch;
1673: } branch_chain;
1674:
1675: /* Structure for items in a linked list that represents an explicit recursive
1676: call within the pattern. */
1677:
1678: typedef struct recursion_info {
1679: struct recursion_info *prevrec; /* Previous recursion record (or NULL) */
1680: int group_num; /* Number of group that was called */
1681: const uschar *after_call; /* "Return value": points after the call in the expr */
1682: int *offset_save; /* Pointer to start of saved offsets */
1683: int saved_max; /* Number of saved offsets */
1684: int save_offset_top; /* Current value of offset_top */
1685: } recursion_info;
1686:
1687: /* Structure for building a chain of data for holding the values of the subject
1688: pointer at the start of each subpattern, so as to detect when an empty string
1689: has been matched by a subpattern - to break infinite loops. */
1690:
1691: typedef struct eptrblock {
1692: struct eptrblock *epb_prev;
1693: USPTR epb_saved_eptr;
1694: } eptrblock;
1695:
1696:
1697: /* Structure for passing "static" information around between the functions
1698: doing traditional NFA matching, so that they are thread-safe. */
1699:
1700: typedef struct match_data {
1701: unsigned long int match_call_count; /* As it says */
1702: unsigned long int match_limit; /* As it says */
1703: unsigned long int match_limit_recursion; /* As it says */
1704: int *offset_vector; /* Offset vector */
1705: int offset_end; /* One past the end */
1706: int offset_max; /* The maximum usable for return data */
1707: int nltype; /* Newline type */
1708: int nllen; /* Newline string length */
1709: int name_count; /* Number of names in name table */
1710: int name_entry_size; /* Size of entry in names table */
1711: uschar *name_table; /* Table of names */
1712: uschar nl[4]; /* Newline string when fixed */
1713: const uschar *lcc; /* Points to lower casing table */
1714: const uschar *ctypes; /* Points to table of type maps */
1715: BOOL offset_overflow; /* Set if too many extractions */
1716: BOOL notbol; /* NOTBOL flag */
1717: BOOL noteol; /* NOTEOL flag */
1718: BOOL utf8; /* UTF8 flag */
1719: BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */
1720: BOOL use_ucp; /* PCRE_UCP flag */
1721: BOOL endonly; /* Dollar not before final \n */
1722: BOOL notempty; /* Empty string match not wanted */
1723: BOOL notempty_atstart; /* Empty string match at start not wanted */
1724: BOOL hitend; /* Hit the end of the subject at some point */
1725: BOOL bsr_anycrlf; /* \R is just any CRLF, not full Unicode */
1726: const uschar *start_code; /* For use when recursing */
1727: USPTR start_subject; /* Start of the subject string */
1728: USPTR end_subject; /* End of the subject string */
1729: USPTR start_match_ptr; /* Start of matched string */
1730: USPTR end_match_ptr; /* Subject position at end match */
1731: USPTR start_used_ptr; /* Earliest consulted character */
1732: int partial; /* PARTIAL options */
1733: int end_offset_top; /* Highwater mark at end of match */
1734: int capture_last; /* Most recent capture number */
1735: int start_offset; /* The start offset value */
1736: eptrblock *eptrchain; /* Chain of eptrblocks for tail recursions */
1737: int eptrn; /* Next free eptrblock */
1738: recursion_info *recursive; /* Linked list of recursion data */
1739: void *callout_data; /* To pass back to callouts */
1740: const uschar *mark; /* Mark pointer to pass back */
1741: } match_data;
1742:
1743: /* A similar structure is used for the same purpose by the DFA matching
1744: functions. */
1745:
1746: typedef struct dfa_match_data {
1747: const uschar *start_code; /* Start of the compiled pattern */
1748: const uschar *start_subject; /* Start of the subject string */
1749: const uschar *end_subject; /* End of subject string */
1750: const uschar *start_used_ptr; /* Earliest consulted character */
1751: const uschar *tables; /* Character tables */
1752: int start_offset; /* The start offset value */
1753: int moptions; /* Match options */
1754: int poptions; /* Pattern options */
1755: int nltype; /* Newline type */
1756: int nllen; /* Newline string length */
1757: uschar nl[4]; /* Newline string when fixed */
1758: void *callout_data; /* To pass back to callouts */
1759: } dfa_match_data;
1760:
/* Flag bits used in the entries of the pcre_ctypes table. Each value is a
single distinct bit, so several may be set in one entry. */

#define ctype_space    0x01
#define ctype_letter   0x02
#define ctype_digit    0x04
#define ctype_xdigit   0x08
#define ctype_word     0x10   /* alphanumeric or '_' */
#define ctype_meta     0x80   /* regexp meta char or zero (end pattern) */
1769:
/* Where each bitmap table starts within pcre_cbits. Every table holds the set
of bits for one class map, and consecutive tables are 32 apart. Some classes
are built by combining these tables. */

#define cbit_space       0   /* [:space:] or \s */
#define cbit_xdigit     32   /* [:xdigit:] */
#define cbit_digit      64   /* [:digit:] or \d */
#define cbit_upper      96   /* [:upper:] */
#define cbit_lower     128   /* [:lower:] */
#define cbit_word      160   /* [:word:] or \w */
#define cbit_graph     192   /* [:graph:] */
#define cbit_print     224   /* [:print:] */
#define cbit_punct     256   /* [:punct:] */
#define cbit_cntrl     288   /* [:cntrl:] */
#define cbit_length    320   /* Total length of the cbits table */
1784:
/* Positions of the individual tables relative to the base tables pointer,
followed by the overall length. Each offset is written as the previous offset
plus the size of the table that precedes it. */

#define lcc_offset      0
#define fcc_offset      (lcc_offset + 256)
#define cbits_offset    (fcc_offset + 256)
#define ctypes_offset   (cbits_offset + cbit_length)
#define tables_length   (ctypes_offset + 256)
1793:
1794: /* Layout of the UCP type table that translates property names into types and
1795: codes. Each entry used to point directly to a name, but to reduce the number of
1796: relocations in shared libraries, it now has an offset into a single string
1797: instead. */
1798:
1799: typedef struct {
1800: pcre_uint16 name_offset;
1801: pcre_uint16 type;
1802: pcre_uint16 value;
1803: } ucp_type_table;
1804:
1805:
1806: /* Internal shared data tables. These are tables that are used by more than one
1807: of the exported public functions. They have to be "external" in the C sense,
1808: but are not part of the PCRE public API. The data for these tables is in the
1809: pcre_tables.c module. */
1810:
1811: extern const int _pcre_utf8_table1[];
1812: extern const int _pcre_utf8_table2[];
1813: extern const int _pcre_utf8_table3[];
1814: extern const uschar _pcre_utf8_table4[];
1815:
1816: extern const int _pcre_utf8_table1_size;
1817:
1818: extern const char _pcre_utt_names[];
1819: extern const ucp_type_table _pcre_utt[];
1820: extern const int _pcre_utt_size;
1821:
1822: extern const uschar _pcre_default_tables[];
1823:
1824: extern const uschar _pcre_OP_lengths[];
1825:
1826:
1827: /* Internal shared functions. These are functions that are used by more than
1828: one of the exported public functions. They have to be "external" in the C
1829: sense, but are not part of the PCRE public API. */
1830:
1831: extern const uschar *_pcre_find_bracket(const uschar *, BOOL, int);
1832: extern BOOL _pcre_is_newline(USPTR, int, USPTR, int *, BOOL);
1833: extern int _pcre_ord2utf8(int, uschar *);
1834: extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *,
1835: const pcre_study_data *, pcre_study_data *);
1836: extern int _pcre_valid_utf8(USPTR, int);
1837: extern BOOL _pcre_was_newline(USPTR, int, USPTR, int *, BOOL);
1838: extern BOOL _pcre_xclass(int, const uschar *);
1839:
1840:
1841: /* Unicode character database (UCD) */
1842:
1843: typedef struct {
1844: uschar script;
1845: uschar chartype;
1846: pcre_int32 other_case;
1847: } ucd_record;
1848:
1849: extern const ucd_record _pcre_ucd_records[];
1850: extern const uschar _pcre_ucd_stage1[];
1851: extern const pcre_uint16 _pcre_ucd_stage2[];
1852: extern const int _pcre_ucp_gentype[];
1853:
1854:
/* UCD access macros.

GET_UCD(ch) yields a pointer to the ucd_record for character ch. The lookup
has two stages: _pcre_ucd_stage1 is indexed by the character's block number
(ch / UCD_BLOCK_SIZE) and gives a block index; that index times
UCD_BLOCK_SIZE, plus the character's position within its block
(ch % UCD_BLOCK_SIZE), indexes _pcre_ucd_stage2, whose value in turn indexes
_pcre_ucd_records.

Every use of the macro argument is parenthesized, so that an argument which is
itself an expression (for example c + 1 or a | b) is taken as a whole instead
of being split up by operator precedence. The argument is still evaluated more
than once, so it must not have side effects. */

#define UCD_BLOCK_SIZE 128
#define GET_UCD(ch) (_pcre_ucd_records + \
        _pcre_ucd_stage2[_pcre_ucd_stage1[(ch) / UCD_BLOCK_SIZE] * \
        UCD_BLOCK_SIZE + (ch) % UCD_BLOCK_SIZE])

/* Field accessors built on GET_UCD. UCD_CATEGORY maps the character type
through _pcre_ucp_gentype; UCD_OTHERCASE adds the record's other_case value
to the character itself. */

#define UCD_CHARTYPE(ch)  GET_UCD(ch)->chartype
#define UCD_SCRIPT(ch)    GET_UCD(ch)->script
#define UCD_CATEGORY(ch)  _pcre_ucp_gentype[UCD_CHARTYPE(ch)]
#define UCD_OTHERCASE(ch) ((ch) + GET_UCD(ch)->other_case)
1866:
1867: #endif
1868:
1869: /* End of pcre_internal.h */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>