Annotation of embedaddon/pcre/pcre_printint.src, revision 1.1.1.1
1.1 misho 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /* PCRE is a library of functions to support regular expressions whose syntax
6: and semantics are as close as possible to those of the Perl 5 language.
7:
8: Written by Philip Hazel
9: Copyright (c) 1997-2010 University of Cambridge
10:
11: -----------------------------------------------------------------------------
12: Redistribution and use in source and binary forms, with or without
13: modification, are permitted provided that the following conditions are met:
14:
15: * Redistributions of source code must retain the above copyright notice,
16: this list of conditions and the following disclaimer.
17:
18: * Redistributions in binary form must reproduce the above copyright
19: notice, this list of conditions and the following disclaimer in the
20: documentation and/or other materials provided with the distribution.
21:
22: * Neither the name of the University of Cambridge nor the names of its
23: contributors may be used to endorse or promote products derived from
24: this software without specific prior written permission.
25:
26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36: POSSIBILITY OF SUCH DAMAGE.
37: -----------------------------------------------------------------------------
38: */
39:
40:
41: /* This module contains a PCRE private debugging function for printing out the
42: internal form of a compiled regular expression, along with some supporting
43: local functions. This source file is used in two places:
44:
45: (1) It is #included by pcre_compile.c when it is compiled in debugging mode
46: (PCRE_DEBUG defined in pcre_internal.h). It is not included in production
47: compiles.
48:
49: (2) It is always #included by pcretest.c, which can be asked to print out a
50: compiled regex for debugging purposes. */
51:
52:
53: /* Macro that decides whether a character should be output as a literal or in
54: hexadecimal. We don't use isprint() because that can vary from system to system
55: (even without the use of locales) and we want the output always to be the same,
56: for testing purposes. This macro is used in pcretest as well as in this file. */
57:
/* PRINTABLE(c) is non-zero when c lies in the fixed range that is output as a
literal character. The range depends only on whether EBCDIC is defined, never
on the locale. Note that the argument may be evaluated twice. */

#ifndef EBCDIC
#define PRINTABLE(c) ((c) >= 32 && (c) < 127)    /* 32..126 */
#else
#define PRINTABLE(c) ((c) >= 64 && (c) < 255)    /* 64..254 */
#endif
63:
64: /* The table of operator names. */
65:
/* Indexed by opcode value: pcre_printint() looks names up as OP_names[*code]
and OP_names[code[n]]. The initialisers come from the OP_NAME_LIST macro, which
is not defined in the part of the file shown here. */
66: static const char *OP_names[] = { OP_NAME_LIST };
67:
68:
69:
70: /*************************************************
71: * Print single- or multi-byte character *
72: *************************************************/
73:
74: static int
75: print_char(FILE *f, uschar *ptr, BOOL utf8)
76: {
77: int c = *ptr;
78:
79: #ifndef SUPPORT_UTF8
80: utf8 = utf8; /* Avoid compiler warning */
81: if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
82: return 0;
83:
84: #else
85: if (!utf8 || (c & 0xc0) != 0xc0)
86: {
87: if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
88: return 0;
89: }
90: else
91: {
92: int i;
93: int a = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */
94: int s = 6*a;
95: c = (c & _pcre_utf8_table3[a]) << s;
96: for (i = 1; i <= a; i++)
97: {
98: /* This is a check for malformed UTF-8; it should only occur if the sanity
99: check has been turned off. Rather than swallow random bytes, just stop if
100: we hit a bad one. Print it with \X instead of \x as an indication. */
101:
102: if ((ptr[i] & 0xc0) != 0x80)
103: {
104: fprintf(f, "\\X{%x}", c);
105: return i - 1;
106: }
107:
108: /* The byte is OK */
109:
110: s -= 6;
111: c |= (ptr[i] & 0x3f) << s;
112: }
113: if (c < 128) fprintf(f, "\\x%02x", c); else fprintf(f, "\\x{%x}", c);
114: return a;
115: }
116: #endif
117: }
118:
119:
120:
121: /*************************************************
122: * Find Unicode property name *
123: *************************************************/
124:
/* Find the name of a Unicode property from its type and value.

Arguments:
  ptype    the property type
  pvalue   the property value

Returns: a pointer to the name held in _pcre_utt_names when SUPPORT_UCP is
         defined and a table entry matches both arguments; otherwise the
         string "??". */

static const char *
get_ucpname(int ptype, int pvalue)
{
#ifdef SUPPORT_UCP
int i;

/* Search from the end of the table towards the start, so that when several
entries match, the one with the highest index is the one reported. */

for (i = _pcre_utt_size - 1; i >= 0; i--)
  {
  if (ptype == _pcre_utt[i].type && pvalue == _pcre_utt[i].value)
    return _pcre_utt_names + _pcre_utt[i].name_offset;
  }
return "??";
#else
/* The arguments are unused without UCP support. Casting them to void keeps
compilers quiet without doing arithmetic on them; a signed multiplication of
two arbitrary ints could overflow, which is undefined behaviour. */
(void)ptype;
(void)pvalue;
return "??";
#endif
}
141:
142:
143:
144: /*************************************************
145: * Print compiled regex *
146: *************************************************/
147:
148: /* Make this function work for a regex with integers in either byte order.
149: However, we assume that what we are passed is a compiled regex. The
150: print_lengths flag controls whether offsets and lengths of items are printed.
151: They can be turned off from pcretest so that automatic tests on bytecode can be
152: written that do not depend on the value of LINK_SIZE. */
153:
/* Arguments:
  external_re    the compiled pattern; it is cast to real_pcre to reach the
                 header fields and the bytecode that follows the name table
  f              the stream that receives the listing
  print_lengths  when TRUE, each line starts with the item's offset from the
                 start of the code, and bracket-like items also show the value
                 read by GET(code, 1); when FALSE, padding is printed instead

Returns: nothing. The function returns after printing the OP_END item. */
154: static void
155: pcre_printint(pcre *external_re, FILE *f, BOOL print_lengths)
156: {
157: real_pcre *re = (real_pcre *)external_re;
158: uschar *codestart, *code;
159: BOOL utf8;
160:
161: unsigned int options = re->options;
162: int offset = re->name_table_offset;
163: int count = re->name_count;
164: int size = re->name_entry_size;
165:
/* A magic number that does not match is treated as meaning that the header was
written with the opposite byte order: the low two bytes of offset, count and
size are exchanged, and the four bytes of options are reversed. */
166: if (re->magic_number != MAGIC_NUMBER)
167: {
168: offset = ((offset << 8) & 0xff00) | ((offset >> 8) & 0xff);
169: count = ((count << 8) & 0xff00) | ((count >> 8) & 0xff);
170: size = ((size << 8) & 0xff00) | ((size >> 8) & 0xff);
171: options = ((options << 24) & 0xff000000) |
172: ((options << 8) & 0x00ff0000) |
173: ((options >> 8) & 0x0000ff00) |
174: ((options >> 24) & 0x000000ff);
175: }
176:
/* The bytecode begins straight after the name table, which occupies count
entries of size bytes each, starting offset bytes into the structure. */
177: code = codestart = (uschar *)re + offset + count * size;
178: utf8 = (options & PCRE_UTF8) != 0;
179:
/* One listing line is produced per iteration. The loop is left only through
the return in the OP_END case. */
180: for(;;)
181: {
182: uschar *ccode;
183: const char *flag = " ";
184: int c;
/* extra counts bytes to step over in addition to _pcre_OP_lengths[*code]. */
185: int extra = 0;
186:
/* Left-hand column: the offset of this item from the start of the code, or
padding when lengths are suppressed. */
187: if (print_lengths)
188: fprintf(f, "%3d ", (int)(code - codestart));
189: else
190: fprintf(f, " ");
191:
192: switch(*code)
193: {
194: /* ========================================================================== */
195: /* These cases are never obeyed. This is a fudge that causes a compile-
196: time error if the vectors OP_names or _pcre_OP_lengths, which are indexed
197: by opcode, are not the correct length. It seems to be the only way to do
198: such a check at compile time, as the sizeof() operator does not work in
199: the C preprocessor. We do this while compiling pcretest, because that
200: #includes pcre_tables.c, which holds _pcre_OP_lengths. We can't do this
201: when building pcre_compile.c with PCRE_DEBUG set, because it doesn't then
202: know the size of _pcre_OP_lengths. */
203:
/* If both size tests hold, the second label is OP_TABLE_LENGTH + 1 and the two
labels are distinct. If either fails, it is OP_TABLE_LENGTH + 0, which
duplicates the first label and is rejected by the compiler. */
204: #ifdef COMPILING_PCRETEST
205: case OP_TABLE_LENGTH:
206: case OP_TABLE_LENGTH +
207: ((sizeof(OP_names)/sizeof(const char *) == OP_TABLE_LENGTH) &&
208: (sizeof(_pcre_OP_lengths) == OP_TABLE_LENGTH)):
209: break;
210: #endif
211: /* ========================================================================== */
212:
/* End of the code: print the name and a closing rule, then finish. */
213: case OP_END:
214: fprintf(f, " %s\n", OP_names[*code]);
215: fprintf(f, "------------------------------------------------------------------\n");
216: return;
217:
/* Consecutive OP_CHAR items are merged onto one line. Each step skips the
opcode byte and then the character: one byte plus whatever print_char reports
as additional bytes. The "continue" bypasses the generic advance at the bottom
of the loop because code already points at the next item. */
218: case OP_CHAR:
219: fprintf(f, " ");
220: do
221: {
222: code++;
223: code += 1 + print_char(f, code, utf8);
224: }
225: while (*code == OP_CHAR);
226: fprintf(f, "\n");
227: continue;
228:
/* As for OP_CHAR, but with a "/i" marker in front of the characters. */
229: case OP_CHARI:
230: fprintf(f, " /i ");
231: do
232: {
233: code++;
234: code += 1 + print_char(f, code, utf8);
235: }
236: while (*code == OP_CHARI);
237: fprintf(f, "\n");
238: continue;
239:
/* These bracket opcodes print the value at code+1 (when lengths are wanted),
the opcode name, and the 2-byte number stored just after the link-sized field
(presumably the capture group number - confirm against the compiler). */
240: case OP_CBRA:
241: case OP_CBRAPOS:
242: case OP_SCBRA:
243: case OP_SCBRAPOS:
244: if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
245: else fprintf(f, " ");
246: fprintf(f, "%s %d", OP_names[*code], GET2(code, 1+LINK_SIZE));
247: break;
248:
/* Items that carry only a link-sized value at code+1 besides the opcode. */
249: case OP_BRA:
250: case OP_BRAPOS:
251: case OP_SBRA:
252: case OP_SBRAPOS:
253: case OP_KETRMAX:
254: case OP_KETRMIN:
255: case OP_KETRPOS:
256: case OP_ALT:
257: case OP_KET:
258: case OP_ASSERT:
259: case OP_ASSERT_NOT:
260: case OP_ASSERTBACK:
261: case OP_ASSERTBACK_NOT:
262: case OP_ONCE:
263: case OP_ONCE_NC:
264: case OP_COND:
265: case OP_SCOND:
266: case OP_REVERSE:
267: if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
268: else fprintf(f, " ");
269: fprintf(f, "%s", OP_names[*code]);
270: break;
271:
272: case OP_CLOSE:
273: fprintf(f, " %s %d", OP_names[*code], GET2(code, 1));
274: break;
275:
/* Here the 2-byte number is printed in the left-hand column, before the opcode
name, whatever the setting of print_lengths. */
276: case OP_CREF:
277: case OP_NCREF:
278: fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]);
279: break;
280:
/* The special value RREF_ANY is printed as "any" instead of as a number. */
281: case OP_RREF:
282: c = GET2(code, 1);
283: if (c == RREF_ANY)
284: fprintf(f, " Cond recurse any");
285: else
286: fprintf(f, " Cond recurse %d", c);
287: break;
288:
289: case OP_NRREF:
290: c = GET2(code, 1);
291: if (c == RREF_ANY)
292: fprintf(f, " Cond nrecurse any");
293: else
294: fprintf(f, " Cond nrecurse %d", c);
295: break;
296:
297: case OP_DEF:
298: fprintf(f, " Cond def");
299: break;
300:
/* The caseless single-character repeats set the flag column to "/i" and then
share the code for the caseful and character-type repeats below. */
301: case OP_STARI:
302: case OP_MINSTARI:
303: case OP_POSSTARI:
304: case OP_PLUSI:
305: case OP_MINPLUSI:
306: case OP_POSPLUSI:
307: case OP_QUERYI:
308: case OP_MINQUERYI:
309: case OP_POSQUERYI:
310: flag = "/i";
311: /* Fall through */
312: case OP_STAR:
313: case OP_MINSTAR:
314: case OP_POSSTAR:
315: case OP_PLUS:
316: case OP_MINPLUS:
317: case OP_POSPLUS:
318: case OP_QUERY:
319: case OP_MINQUERY:
320: case OP_POSQUERY:
321: case OP_TYPESTAR:
322: case OP_TYPEMINSTAR:
323: case OP_TYPEPOSSTAR:
324: case OP_TYPEPLUS:
325: case OP_TYPEMINPLUS:
326: case OP_TYPEPOSPLUS:
327: case OP_TYPEQUERY:
328: case OP_TYPEMINQUERY:
329: case OP_TYPEPOSQUERY:
330: fprintf(f, " %s ", flag);
/* Opcodes numerically at or above OP_TYPESTAR repeat a character type whose
opcode is in code[1]. A property type is followed by two more bytes, which are
passed to get_ucpname, hence extra = 2. The remaining opcodes repeat a literal
character, and extra becomes the number of additional bytes that print_char
consumed. */
331: if (*code >= OP_TYPESTAR)
332: {
333: fprintf(f, "%s", OP_names[code[1]]);
334: if (code[1] == OP_PROP || code[1] == OP_NOTPROP)
335: {
336: fprintf(f, " %s ", get_ucpname(code[2], code[3]));
337: extra = 2;
338: }
339: }
340: else extra = print_char(f, code+1, utf8);
341: fprintf(f, "%s", OP_names[*code]);
342: break;
343:
/* Counted repeats of a literal character. The count is the 2-byte value at
code+1 and the character starts at code+3. "{n}" is printed for the EXACT
opcodes and "{0,n}" for the UPTO forms, followed by "?" for the MINUPTO
variants or "+" for the POSUPTO variants. */
344: case OP_EXACTI:
345: case OP_UPTOI:
346: case OP_MINUPTOI:
347: case OP_POSUPTOI:
348: flag = "/i";
349: /* Fall through */
350: case OP_EXACT:
351: case OP_UPTO:
352: case OP_MINUPTO:
353: case OP_POSUPTO:
354: fprintf(f, " %s ", flag);
355: extra = print_char(f, code+3, utf8);
356: fprintf(f, "{");
357: if (*code != OP_EXACT && *code != OP_EXACTI) fprintf(f, "0,");
358: fprintf(f, "%d}", GET2(code,1));
359: if (*code == OP_MINUPTO || *code == OP_MINUPTOI) fprintf(f, "?");
360: else if (*code == OP_POSUPTO || *code == OP_POSUPTOI) fprintf(f, "+");
361: break;
362:
/* Counted repeats of a character type. The type opcode is at code[3]; for a
property type, the two bytes after it go to get_ucpname and are added to the
skip through extra. */
363: case OP_TYPEEXACT:
364: case OP_TYPEUPTO:
365: case OP_TYPEMINUPTO:
366: case OP_TYPEPOSUPTO:
367: fprintf(f, " %s", OP_names[code[3]]);
368: if (code[3] == OP_PROP || code[3] == OP_NOTPROP)
369: {
370: fprintf(f, " %s ", get_ucpname(code[4], code[5]));
371: extra = 2;
372: }
373: fprintf(f, "{");
374: if (*code != OP_TYPEEXACT) fprintf(f, "0,");
375: fprintf(f, "%d}", GET2(code,1));
376: if (*code == OP_TYPEMINUPTO) fprintf(f, "?");
377: else if (*code == OP_TYPEPOSUPTO) fprintf(f, "+");
378: break;
379:
/* A negated single character is shown as a one-member negated class. Only the
single byte at code[1] is read here; print_char is not used. */
380: case OP_NOTI:
381: flag = "/i";
382: /* Fall through */
383: case OP_NOT:
384: c = code[1];
385: if (PRINTABLE(c)) fprintf(f, " %s [^%c]", flag, c);
386: else fprintf(f, " %s [^\\x%02x]", flag, c);
387: break;
388:
/* Repeats of a negated character: the same "[^c]" form, followed by the
opcode name. */
389: case OP_NOTSTARI:
390: case OP_NOTMINSTARI:
391: case OP_NOTPOSSTARI:
392: case OP_NOTPLUSI:
393: case OP_NOTMINPLUSI:
394: case OP_NOTPOSPLUSI:
395: case OP_NOTQUERYI:
396: case OP_NOTMINQUERYI:
397: case OP_NOTPOSQUERYI:
398: flag = "/i";
399: /* Fall through */
400:
401: case OP_NOTSTAR:
402: case OP_NOTMINSTAR:
403: case OP_NOTPOSSTAR:
404: case OP_NOTPLUS:
405: case OP_NOTMINPLUS:
406: case OP_NOTPOSPLUS:
407: case OP_NOTQUERY:
408: case OP_NOTMINQUERY:
409: case OP_NOTPOSQUERY:
410: c = code[1];
411: if (PRINTABLE(c)) fprintf(f, " %s [^%c]", flag, c);
412: else fprintf(f, " %s [^\\x%02x]", flag, c);
413: fprintf(f, "%s", OP_names[*code]);
414: break;
415:
/* Counted repeats of a negated character: the count is at code+1 and the
character byte at code[3]. The "{...}" suffix follows the same rules as for
the non-negated counted repeats. */
416: case OP_NOTEXACTI:
417: case OP_NOTUPTOI:
418: case OP_NOTMINUPTOI:
419: case OP_NOTPOSUPTOI:
420: flag = "/i";
421: /* Fall through */
422:
423: case OP_NOTEXACT:
424: case OP_NOTUPTO:
425: case OP_NOTMINUPTO:
426: case OP_NOTPOSUPTO:
427: c = code[3];
428: if (PRINTABLE(c)) fprintf(f, " %s [^%c]{", flag, c);
429: else fprintf(f, " %s [^\\x%02x]{", flag, c);
430: if (*code != OP_NOTEXACT && *code != OP_NOTEXACTI) fprintf(f, "0,");
431: fprintf(f, "%d}", GET2(code,1));
432: if (*code == OP_NOTMINUPTO || *code == OP_NOTMINUPTOI) fprintf(f, "?");
433: else
434: if (*code == OP_NOTPOSUPTO || *code == OP_NOTPOSUPTOI) fprintf(f, "+");
435: break;
436:
437: case OP_RECURSE:
438: if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
439: else fprintf(f, " ");
440: fprintf(f, "%s", OP_names[*code]);
441: break;
442:
/* Print the reference number, then point ccode at the item that follows so
that the shared repeat-printing code can inspect it. The goto jumps into the
class block below, past its declarations; min and max are assigned there
before they are read. */
443: case OP_REFI:
444: flag = "/i";
445: /* Fall through */
446: case OP_REF:
447: fprintf(f, " %s \\%d", flag, GET2(code,1));
448: ccode = code + _pcre_OP_lengths[*code];
449: goto CLASS_REF_REPEAT;
450:
/* Three numbers are printed: the byte at code[1] and the two link-sized values
that follow it. */
451: case OP_CALLOUT:
452: fprintf(f, " %s %d %d %d", OP_names[*code], code[1], GET(code,2),
453: GET(code, 2 + LINK_SIZE));
454: break;
455:
456: case OP_PROP:
457: case OP_NOTPROP:
458: fprintf(f, " %s %s", OP_names[*code], get_ucpname(code[1], code[2]));
459: break;
460:
461: /* OP_XCLASS can only occur in UTF-8 mode. However, there's no harm in
462: having this code always here, and it makes it less messy without all those
463: #ifdefs. */
464:
465: case OP_CLASS:
466: case OP_NCLASS:
467: case OP_XCLASS:
468: {
469: int i, min, max;
470: BOOL printmap;
471:
472: fprintf(f, " [");
473:
/* For XCLASS, extra is taken from the link-sized field at code+1 (presumably
the length of the whole item - confirm against _pcre_OP_lengths). The flags
byte comes next: XCL_MAP says whether a bit map is present and XCL_NOT marks
a negated class. */
474: if (*code == OP_XCLASS)
475: {
476: extra = GET(code, 1);
477: ccode = code + LINK_SIZE + 1;
478: printmap = (*ccode & XCL_MAP) != 0;
479: if ((*ccode++ & XCL_NOT) != 0) fprintf(f, "^");
480: }
/* CLASS and NCLASS always have a bit map, directly after the opcode. */
481: else
482: {
483: printmap = TRUE;
484: ccode = code + 1;
485: }
486:
487: /* Print a bit map */
488:
/* Walk the 256 bits, printing each run of consecutive set bits as a single
character, a pair of characters, or a "lo-hi" range. */
489: if (printmap)
490: {
491: for (i = 0; i < 256; i++)
492: {
493: if ((ccode[i/8] & (1 << (i&7))) != 0)
494: {
495: int j;
/* j stops at the first clear bit after i, or at 256 */
496: for (j = i+1; j < 256; j++)
497: if ((ccode[j/8] & (1 << (j&7))) == 0) break;
498: if (i == '-' || i == ']') fprintf(f, "\\");
499: if (PRINTABLE(i)) fprintf(f, "%c", i);
500: else fprintf(f, "\\x%02x", i);
/* --j makes j the last set bit of the run; the "-" is left out when the run
has exactly two members */
501: if (--j > i)
502: {
503: if (j != i + 1) fprintf(f, "-");
504: if (j == '-' || j == ']') fprintf(f, "\\");
505: if (PRINTABLE(j)) fprintf(f, "%c", j);
506: else fprintf(f, "\\x%02x", j);
507: }
/* resume scanning after the run; the for loop then increments i */
508: i = j;
509: }
510: }
/* step over the 32-byte (256-bit) map */
511: ccode += 32;
512: }
513:
514: /* For an XCLASS there is always some additional data */
515:
/* Items are read until XCL_END. Each is either a property (a type byte and a
value byte) or one character, or two characters for XCL_RANGE. Characters are
always decoded as UTF-8 here, because print_char is called with TRUE. */
516: if (*code == OP_XCLASS)
517: {
518: int ch;
519: while ((ch = *ccode++) != XCL_END)
520: {
521: if (ch == XCL_PROP)
522: {
523: int ptype = *ccode++;
524: int pvalue = *ccode++;
525: fprintf(f, "\\p{%s}", get_ucpname(ptype, pvalue));
526: }
527: else if (ch == XCL_NOTPROP)
528: {
529: int ptype = *ccode++;
530: int pvalue = *ccode++;
531: fprintf(f, "\\P{%s}", get_ucpname(ptype, pvalue));
532: }
533: else
534: {
535: ccode += 1 + print_char(f, ccode, TRUE);
536: if (ch == XCL_RANGE)
537: {
538: fprintf(f, "-");
539: ccode += 1 + print_char(f, ccode, TRUE);
540: }
541: }
542: }
543: }
544:
545: /* Indicate a non-UTF8 class which was created by negation */
546:
547: fprintf(f, "]%s", (*code == OP_NCLASS)? " (neg)" : "");
548:
549: /* Handle repeats after a class or a back reference */
550:
/* ccode points at the item that follows the class or back reference. If it is
a repeat, it is printed on the same line and its length is added to extra so
that the main loop steps over it as well. */
551: CLASS_REF_REPEAT:
552: switch(*ccode)
553: {
554: case OP_CRSTAR:
555: case OP_CRMINSTAR:
556: case OP_CRPLUS:
557: case OP_CRMINPLUS:
558: case OP_CRQUERY:
559: case OP_CRMINQUERY:
560: fprintf(f, "%s", OP_names[*ccode]);
561: extra += _pcre_OP_lengths[*ccode];
562: break;
563:
/* A max of zero is printed as the open-ended form "{min,}". */
564: case OP_CRRANGE:
565: case OP_CRMINRANGE:
566: min = GET2(ccode,1);
567: max = GET2(ccode,3);
568: if (max == 0) fprintf(f, "{%d,}", min);
569: else fprintf(f, "{%d,%d}", min, max);
570: if (*ccode == OP_CRMINRANGE) fprintf(f, "?");
571: extra += _pcre_OP_lengths[*ccode];
572: break;
573:
574: /* Do nothing if it's not a repeat; this code stops picky compilers
575: warning about the lack of a default code path. */
576:
577: default:
578: break;
579: }
580: }
581: break;
582:
/* The argument is printed as a C string starting at code+2, and code[1] is
added to the skip (presumably the length of that string - confirm against the
compiler). */
583: case OP_MARK:
584: case OP_PRUNE_ARG:
585: case OP_SKIP_ARG:
586: fprintf(f, " %s %s", OP_names[*code], code + 2);
587: extra += code[1];
588: break;
589:
590: case OP_THEN:
591: fprintf(f, " %s", OP_names[*code]);
592: break;
593:
/* Same layout as the OP_MARK group: a string at code+2 and a byte at code[1]
that is added to the skip. */
594: case OP_THEN_ARG:
595: fprintf(f, " %s %s", OP_names[*code], code + 2);
596: extra += code[1];
597: break;
598:
599: case OP_CIRCM:
600: case OP_DOLLM:
601: flag = "/m";
602: /* Fall through */
603:
604: /* Anything else is just an item with no data, but possibly a flag. */
605:
606: default:
607: fprintf(f, " %s %s", flag, OP_names[*code]);
608: break;
609: }
610:
/* Advance past this item: its length from the table plus any variable part
accumulated in extra. */
611: code += _pcre_OP_lengths[*code] + extra;
612: fprintf(f, "\n");
613: }
614: }
615:
616: /* End of pcre_printint.src */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>