1: /*
2: * regexp.c: generic and extensible Regular Expression engine
3: *
4: * Basically designed with the purpose of compiling regexps for
5: * the variety of validation/shemas mechanisms now available in
6: * XML related specifications these include:
7: * - XML-1.0 DTD validation
8: * - XML Schemas structure part 1
9: * - XML Schemas Datatypes part 2 especially Appendix F
10: * - RELAX-NG/TREX i.e. the counter proposal
11: *
12: * See Copyright for the status of this software.
13: *
14: * Daniel Veillard <veillard@redhat.com>
15: */
16:
17: #define IN_LIBXML
18: #include "libxml.h"
19:
20: #ifdef LIBXML_REGEXP_ENABLED
21:
22: /* #define DEBUG_ERR */
23:
24: #include <stdio.h>
25: #include <string.h>
26: #ifdef HAVE_LIMITS_H
27: #include <limits.h>
28: #endif
29:
30: #include <libxml/tree.h>
31: #include <libxml/parserInternals.h>
32: #include <libxml/xmlregexp.h>
33: #include <libxml/xmlautomata.h>
34: #include <libxml/xmlunicode.h>
35:
36: #ifndef INT_MAX
37: #define INT_MAX 123456789 /* easy to flag and big enough for our needs */
38: #endif
39:
40: /* #define DEBUG_REGEXP_GRAPH */
41: /* #define DEBUG_REGEXP_EXEC */
42: /* #define DEBUG_PUSH */
43: /* #define DEBUG_COMPACTION */
44:
45: #define MAX_PUSH 10000000
46:
47: #ifdef ERROR
48: #undef ERROR
49: #endif
50: #define ERROR(str) \
51: ctxt->error = XML_REGEXP_COMPILE_ERROR; \
52: xmlRegexpErrCompile(ctxt, str);
53: #define NEXT ctxt->cur++
54: #define CUR (*(ctxt->cur))
55: #define NXT(index) (ctxt->cur[index])
56:
57: #define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
58: #define NEXTL(l) ctxt->cur += l;
59: #define XML_REG_STRING_SEPARATOR '|'
60: /*
61: * Need PREV to check on a '-' within a Character Group. May only be used
62: * when it's guaranteed that cur is not at the beginning of ctxt->string!
63: */
64: #define PREV (ctxt->cur[-1])
65:
66: /**
67: * TODO:
68: *
69: * macro to flag unimplemented blocks
70: */
71: #define TODO \
72: xmlGenericError(xmlGenericErrorContext, \
73: "Unimplemented block at %s:%d\n", \
74: __FILE__, __LINE__);
75:
76: /************************************************************************
77: * *
78: * Datatypes and structures *
79: * *
80: ************************************************************************/
81:
82: /*
83: * Note: the order of the enums below is significant, do not shuffle
84: */
85: typedef enum {
86: XML_REGEXP_EPSILON = 1,
87: XML_REGEXP_CHARVAL,
88: XML_REGEXP_RANGES,
89: XML_REGEXP_SUBREG, /* used for () sub regexps */
90: XML_REGEXP_STRING,
91: XML_REGEXP_ANYCHAR, /* . */
92: XML_REGEXP_ANYSPACE, /* \s */
93: XML_REGEXP_NOTSPACE, /* \S */
94: XML_REGEXP_INITNAME, /* \l */
95: XML_REGEXP_NOTINITNAME, /* \L */
96: XML_REGEXP_NAMECHAR, /* \c */
97: XML_REGEXP_NOTNAMECHAR, /* \C */
98: XML_REGEXP_DECIMAL, /* \d */
99: XML_REGEXP_NOTDECIMAL, /* \D */
100: XML_REGEXP_REALCHAR, /* \w */
101: XML_REGEXP_NOTREALCHAR, /* \W */
102: XML_REGEXP_LETTER = 100,
103: XML_REGEXP_LETTER_UPPERCASE,
104: XML_REGEXP_LETTER_LOWERCASE,
105: XML_REGEXP_LETTER_TITLECASE,
106: XML_REGEXP_LETTER_MODIFIER,
107: XML_REGEXP_LETTER_OTHERS,
108: XML_REGEXP_MARK,
109: XML_REGEXP_MARK_NONSPACING,
110: XML_REGEXP_MARK_SPACECOMBINING,
111: XML_REGEXP_MARK_ENCLOSING,
112: XML_REGEXP_NUMBER,
113: XML_REGEXP_NUMBER_DECIMAL,
114: XML_REGEXP_NUMBER_LETTER,
115: XML_REGEXP_NUMBER_OTHERS,
116: XML_REGEXP_PUNCT,
117: XML_REGEXP_PUNCT_CONNECTOR,
118: XML_REGEXP_PUNCT_DASH,
119: XML_REGEXP_PUNCT_OPEN,
120: XML_REGEXP_PUNCT_CLOSE,
121: XML_REGEXP_PUNCT_INITQUOTE,
122: XML_REGEXP_PUNCT_FINQUOTE,
123: XML_REGEXP_PUNCT_OTHERS,
124: XML_REGEXP_SEPAR,
125: XML_REGEXP_SEPAR_SPACE,
126: XML_REGEXP_SEPAR_LINE,
127: XML_REGEXP_SEPAR_PARA,
128: XML_REGEXP_SYMBOL,
129: XML_REGEXP_SYMBOL_MATH,
130: XML_REGEXP_SYMBOL_CURRENCY,
131: XML_REGEXP_SYMBOL_MODIFIER,
132: XML_REGEXP_SYMBOL_OTHERS,
133: XML_REGEXP_OTHER,
134: XML_REGEXP_OTHER_CONTROL,
135: XML_REGEXP_OTHER_FORMAT,
136: XML_REGEXP_OTHER_PRIVATE,
137: XML_REGEXP_OTHER_NA,
138: XML_REGEXP_BLOCK_NAME
139: } xmlRegAtomType;
140:
141: typedef enum {
142: XML_REGEXP_QUANT_EPSILON = 1,
143: XML_REGEXP_QUANT_ONCE,
144: XML_REGEXP_QUANT_OPT,
145: XML_REGEXP_QUANT_MULT,
146: XML_REGEXP_QUANT_PLUS,
147: XML_REGEXP_QUANT_ONCEONLY,
148: XML_REGEXP_QUANT_ALL,
149: XML_REGEXP_QUANT_RANGE
150: } xmlRegQuantType;
151:
152: typedef enum {
153: XML_REGEXP_START_STATE = 1,
154: XML_REGEXP_FINAL_STATE,
155: XML_REGEXP_TRANS_STATE,
156: XML_REGEXP_SINK_STATE,
157: XML_REGEXP_UNREACH_STATE
158: } xmlRegStateType;
159:
160: typedef enum {
161: XML_REGEXP_MARK_NORMAL = 0,
162: XML_REGEXP_MARK_START,
163: XML_REGEXP_MARK_VISITED
164: } xmlRegMarkedType;
165:
166: typedef struct _xmlRegRange xmlRegRange;
167: typedef xmlRegRange *xmlRegRangePtr;
168:
169: struct _xmlRegRange {
170: int neg; /* 0 normal, 1 not, 2 exclude */
171: xmlRegAtomType type;
172: int start;
173: int end;
174: xmlChar *blockName;
175: };
176:
177: typedef struct _xmlRegAtom xmlRegAtom;
178: typedef xmlRegAtom *xmlRegAtomPtr;
179:
180: typedef struct _xmlAutomataState xmlRegState;
181: typedef xmlRegState *xmlRegStatePtr;
182:
183: struct _xmlRegAtom {
184: int no;
185: xmlRegAtomType type;
186: xmlRegQuantType quant;
187: int min;
188: int max;
189:
190: void *valuep;
191: void *valuep2;
192: int neg;
193: int codepoint;
194: xmlRegStatePtr start;
195: xmlRegStatePtr start0;
196: xmlRegStatePtr stop;
197: int maxRanges;
198: int nbRanges;
199: xmlRegRangePtr *ranges;
200: void *data;
201: };
202:
203: typedef struct _xmlRegCounter xmlRegCounter;
204: typedef xmlRegCounter *xmlRegCounterPtr;
205:
206: struct _xmlRegCounter {
207: int min;
208: int max;
209: };
210:
211: typedef struct _xmlRegTrans xmlRegTrans;
212: typedef xmlRegTrans *xmlRegTransPtr;
213:
214: struct _xmlRegTrans {
215: xmlRegAtomPtr atom;
216: int to;
217: int counter;
218: int count;
219: int nd;
220: };
221:
222: struct _xmlAutomataState {
223: xmlRegStateType type;
224: xmlRegMarkedType mark;
225: xmlRegMarkedType markd;
226: xmlRegMarkedType reached;
227: int no;
228: int maxTrans;
229: int nbTrans;
230: xmlRegTrans *trans;
231: /* knowing states ponting to us can speed things up */
232: int maxTransTo;
233: int nbTransTo;
234: int *transTo;
235: };
236:
237: typedef struct _xmlAutomata xmlRegParserCtxt;
238: typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
239:
240: #define AM_AUTOMATA_RNG 1
241:
242: struct _xmlAutomata {
243: xmlChar *string;
244: xmlChar *cur;
245:
246: int error;
247: int neg;
248:
249: xmlRegStatePtr start;
250: xmlRegStatePtr end;
251: xmlRegStatePtr state;
252:
253: xmlRegAtomPtr atom;
254:
255: int maxAtoms;
256: int nbAtoms;
257: xmlRegAtomPtr *atoms;
258:
259: int maxStates;
260: int nbStates;
261: xmlRegStatePtr *states;
262:
263: int maxCounters;
264: int nbCounters;
265: xmlRegCounter *counters;
266:
267: int determinist;
268: int negs;
269: int flags;
270: };
271:
272: struct _xmlRegexp {
273: xmlChar *string;
274: int nbStates;
275: xmlRegStatePtr *states;
276: int nbAtoms;
277: xmlRegAtomPtr *atoms;
278: int nbCounters;
279: xmlRegCounter *counters;
280: int determinist;
281: int flags;
282: /*
283: * That's the compact form for determinists automatas
284: */
285: int nbstates;
286: int *compact;
287: void **transdata;
288: int nbstrings;
289: xmlChar **stringMap;
290: };
291:
292: typedef struct _xmlRegExecRollback xmlRegExecRollback;
293: typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
294:
295: struct _xmlRegExecRollback {
296: xmlRegStatePtr state;/* the current state */
297: int index; /* the index in the input stack */
298: int nextbranch; /* the next transition to explore in that state */
299: int *counts; /* save the automata state if it has some */
300: };
301:
302: typedef struct _xmlRegInputToken xmlRegInputToken;
303: typedef xmlRegInputToken *xmlRegInputTokenPtr;
304:
305: struct _xmlRegInputToken {
306: xmlChar *value;
307: void *data;
308: };
309:
310: struct _xmlRegExecCtxt {
311: int status; /* execution status != 0 indicate an error */
312: int determinist; /* did we find an indeterministic behaviour */
313: xmlRegexpPtr comp; /* the compiled regexp */
314: xmlRegExecCallbacks callback;
315: void *data;
316:
317: xmlRegStatePtr state;/* the current state */
318: int transno; /* the current transition on that state */
319: int transcount; /* the number of chars in char counted transitions */
320:
321: /*
322: * A stack of rollback states
323: */
324: int maxRollbacks;
325: int nbRollbacks;
326: xmlRegExecRollback *rollbacks;
327:
328: /*
329: * The state of the automata if any
330: */
331: int *counts;
332:
333: /*
334: * The input stack
335: */
336: int inputStackMax;
337: int inputStackNr;
338: int index;
339: int *charStack;
340: const xmlChar *inputString; /* when operating on characters */
341: xmlRegInputTokenPtr inputStack;/* when operating on strings */
342:
343: /*
344: * error handling
345: */
346: int errStateNo; /* the error state number */
347: xmlRegStatePtr errState; /* the error state */
348: xmlChar *errString; /* the string raising the error */
349: int *errCounts; /* counters at the error state */
350: int nbPush;
351: };
352:
353: #define REGEXP_ALL_COUNTER 0x123456
354: #define REGEXP_ALL_LAX_COUNTER 0x123457
355:
356: static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
357: static void xmlRegFreeState(xmlRegStatePtr state);
358: static void xmlRegFreeAtom(xmlRegAtomPtr atom);
359: static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr);
360: static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint);
361: static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint,
362: int neg, int start, int end, const xmlChar *blockName);
363:
364: void xmlAutomataSetFlags(xmlAutomataPtr am, int flags);
365:
366: /************************************************************************
367: * *
368: * Regexp memory error handler *
369: * *
370: ************************************************************************/
371: /**
372: * xmlRegexpErrMemory:
373: * @extra: extra information
374: *
375: * Handle an out of memory condition
376: */
377: static void
378: xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
379: {
380: const char *regexp = NULL;
381: if (ctxt != NULL) {
382: regexp = (const char *) ctxt->string;
383: ctxt->error = XML_ERR_NO_MEMORY;
384: }
385: __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
386: XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
387: regexp, NULL, 0, 0,
388: "Memory allocation failed : %s\n", extra);
389: }
390:
391: /**
392: * xmlRegexpErrCompile:
393: * @extra: extra information
394: *
395: * Handle a compilation failure
396: */
397: static void
398: xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
399: {
400: const char *regexp = NULL;
401: int idx = 0;
402:
403: if (ctxt != NULL) {
404: regexp = (const char *) ctxt->string;
405: idx = ctxt->cur - ctxt->string;
406: ctxt->error = XML_REGEXP_COMPILE_ERROR;
407: }
408: __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
409: XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra,
410: regexp, NULL, idx, 0,
411: "failed to compile: %s\n", extra);
412: }
413:
414: /************************************************************************
415: * *
416: * Allocation/Deallocation *
417: * *
418: ************************************************************************/
419:
420: static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
421: /**
422: * xmlRegEpxFromParse:
423: * @ctxt: the parser context used to build it
424: *
425: * Allocate a new regexp and fill it with the result from the parser
426: *
427: * Returns the new regexp or NULL in case of error
428: */
429: static xmlRegexpPtr
430: xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
431: xmlRegexpPtr ret;
432:
433: ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
434: if (ret == NULL) {
435: xmlRegexpErrMemory(ctxt, "compiling regexp");
436: return(NULL);
437: }
438: memset(ret, 0, sizeof(xmlRegexp));
439: ret->string = ctxt->string;
440: ret->nbStates = ctxt->nbStates;
441: ret->states = ctxt->states;
442: ret->nbAtoms = ctxt->nbAtoms;
443: ret->atoms = ctxt->atoms;
444: ret->nbCounters = ctxt->nbCounters;
445: ret->counters = ctxt->counters;
446: ret->determinist = ctxt->determinist;
447: ret->flags = ctxt->flags;
448: if (ret->determinist == -1) {
449: xmlRegexpIsDeterminist(ret);
450: }
451:
452: if ((ret->determinist != 0) &&
453: (ret->nbCounters == 0) &&
454: (ctxt->negs == 0) &&
455: (ret->atoms != NULL) &&
456: (ret->atoms[0] != NULL) &&
457: (ret->atoms[0]->type == XML_REGEXP_STRING)) {
458: int i, j, nbstates = 0, nbatoms = 0;
459: int *stateRemap;
460: int *stringRemap;
461: int *transitions;
462: void **transdata;
463: xmlChar **stringMap;
464: xmlChar *value;
465:
466: /*
467: * Switch to a compact representation
468: * 1/ counting the effective number of states left
469: * 2/ counting the unique number of atoms, and check that
470: * they are all of the string type
471: * 3/ build a table state x atom for the transitions
472: */
473:
474: stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
475: if (stateRemap == NULL) {
476: xmlRegexpErrMemory(ctxt, "compiling regexp");
477: xmlFree(ret);
478: return(NULL);
479: }
480: for (i = 0;i < ret->nbStates;i++) {
481: if (ret->states[i] != NULL) {
482: stateRemap[i] = nbstates;
483: nbstates++;
484: } else {
485: stateRemap[i] = -1;
486: }
487: }
488: #ifdef DEBUG_COMPACTION
489: printf("Final: %d states\n", nbstates);
490: #endif
491: stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
492: if (stringMap == NULL) {
493: xmlRegexpErrMemory(ctxt, "compiling regexp");
494: xmlFree(stateRemap);
495: xmlFree(ret);
496: return(NULL);
497: }
498: stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
499: if (stringRemap == NULL) {
500: xmlRegexpErrMemory(ctxt, "compiling regexp");
501: xmlFree(stringMap);
502: xmlFree(stateRemap);
503: xmlFree(ret);
504: return(NULL);
505: }
506: for (i = 0;i < ret->nbAtoms;i++) {
507: if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
508: (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
509: value = ret->atoms[i]->valuep;
510: for (j = 0;j < nbatoms;j++) {
511: if (xmlStrEqual(stringMap[j], value)) {
512: stringRemap[i] = j;
513: break;
514: }
515: }
516: if (j >= nbatoms) {
517: stringRemap[i] = nbatoms;
518: stringMap[nbatoms] = xmlStrdup(value);
519: if (stringMap[nbatoms] == NULL) {
520: for (i = 0;i < nbatoms;i++)
521: xmlFree(stringMap[i]);
522: xmlFree(stringRemap);
523: xmlFree(stringMap);
524: xmlFree(stateRemap);
525: xmlFree(ret);
526: return(NULL);
527: }
528: nbatoms++;
529: }
530: } else {
531: xmlFree(stateRemap);
532: xmlFree(stringRemap);
533: for (i = 0;i < nbatoms;i++)
534: xmlFree(stringMap[i]);
535: xmlFree(stringMap);
536: xmlFree(ret);
537: return(NULL);
538: }
539: }
540: #ifdef DEBUG_COMPACTION
541: printf("Final: %d atoms\n", nbatoms);
542: #endif
543: transitions = (int *) xmlMalloc((nbstates + 1) *
544: (nbatoms + 1) * sizeof(int));
545: if (transitions == NULL) {
546: xmlFree(stateRemap);
547: xmlFree(stringRemap);
548: xmlFree(stringMap);
549: xmlFree(ret);
550: return(NULL);
551: }
552: memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int));
553:
554: /*
555: * Allocate the transition table. The first entry for each
556: * state corresponds to the state type.
557: */
558: transdata = NULL;
559:
560: for (i = 0;i < ret->nbStates;i++) {
561: int stateno, atomno, targetno, prev;
562: xmlRegStatePtr state;
563: xmlRegTransPtr trans;
564:
565: stateno = stateRemap[i];
566: if (stateno == -1)
567: continue;
568: state = ret->states[i];
569:
570: transitions[stateno * (nbatoms + 1)] = state->type;
571:
572: for (j = 0;j < state->nbTrans;j++) {
573: trans = &(state->trans[j]);
574: if ((trans->to == -1) || (trans->atom == NULL))
575: continue;
576: atomno = stringRemap[trans->atom->no];
577: if ((trans->atom->data != NULL) && (transdata == NULL)) {
578: transdata = (void **) xmlMalloc(nbstates * nbatoms *
579: sizeof(void *));
580: if (transdata != NULL)
581: memset(transdata, 0,
582: nbstates * nbatoms * sizeof(void *));
583: else {
584: xmlRegexpErrMemory(ctxt, "compiling regexp");
585: break;
586: }
587: }
588: targetno = stateRemap[trans->to];
589: /*
590: * if the same atom can generate transitions to 2 different
591: * states then it means the automata is not determinist and
592: * the compact form can't be used !
593: */
594: prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
595: if (prev != 0) {
596: if (prev != targetno + 1) {
597: ret->determinist = 0;
598: #ifdef DEBUG_COMPACTION
599: printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
600: i, j, trans->atom->no, trans->to, atomno, targetno);
601: printf(" previous to is %d\n", prev);
602: #endif
603: if (transdata != NULL)
604: xmlFree(transdata);
605: xmlFree(transitions);
606: xmlFree(stateRemap);
607: xmlFree(stringRemap);
608: for (i = 0;i < nbatoms;i++)
609: xmlFree(stringMap[i]);
610: xmlFree(stringMap);
611: goto not_determ;
612: }
613: } else {
614: #if 0
615: printf("State %d trans %d: atom %d to %d : %d to %d\n",
616: i, j, trans->atom->no, trans->to, atomno, targetno);
617: #endif
618: transitions[stateno * (nbatoms + 1) + atomno + 1] =
619: targetno + 1; /* to avoid 0 */
620: if (transdata != NULL)
621: transdata[stateno * nbatoms + atomno] =
622: trans->atom->data;
623: }
624: }
625: }
626: ret->determinist = 1;
627: #ifdef DEBUG_COMPACTION
628: /*
629: * Debug
630: */
631: for (i = 0;i < nbstates;i++) {
632: for (j = 0;j < nbatoms + 1;j++) {
633: printf("%02d ", transitions[i * (nbatoms + 1) + j]);
634: }
635: printf("\n");
636: }
637: printf("\n");
638: #endif
639: /*
640: * Cleanup of the old data
641: */
642: if (ret->states != NULL) {
643: for (i = 0;i < ret->nbStates;i++)
644: xmlRegFreeState(ret->states[i]);
645: xmlFree(ret->states);
646: }
647: ret->states = NULL;
648: ret->nbStates = 0;
649: if (ret->atoms != NULL) {
650: for (i = 0;i < ret->nbAtoms;i++)
651: xmlRegFreeAtom(ret->atoms[i]);
652: xmlFree(ret->atoms);
653: }
654: ret->atoms = NULL;
655: ret->nbAtoms = 0;
656:
657: ret->compact = transitions;
658: ret->transdata = transdata;
659: ret->stringMap = stringMap;
660: ret->nbstrings = nbatoms;
661: ret->nbstates = nbstates;
662: xmlFree(stateRemap);
663: xmlFree(stringRemap);
664: }
665: not_determ:
666: ctxt->string = NULL;
667: ctxt->nbStates = 0;
668: ctxt->states = NULL;
669: ctxt->nbAtoms = 0;
670: ctxt->atoms = NULL;
671: ctxt->nbCounters = 0;
672: ctxt->counters = NULL;
673: return(ret);
674: }
675:
676: /**
677: * xmlRegNewParserCtxt:
678: * @string: the string to parse
679: *
680: * Allocate a new regexp parser context
681: *
682: * Returns the new context or NULL in case of error
683: */
684: static xmlRegParserCtxtPtr
685: xmlRegNewParserCtxt(const xmlChar *string) {
686: xmlRegParserCtxtPtr ret;
687:
688: ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
689: if (ret == NULL)
690: return(NULL);
691: memset(ret, 0, sizeof(xmlRegParserCtxt));
692: if (string != NULL)
693: ret->string = xmlStrdup(string);
694: ret->cur = ret->string;
695: ret->neg = 0;
696: ret->negs = 0;
697: ret->error = 0;
698: ret->determinist = -1;
699: return(ret);
700: }
701:
702: /**
703: * xmlRegNewRange:
704: * @ctxt: the regexp parser context
705: * @neg: is that negative
706: * @type: the type of range
707: * @start: the start codepoint
708: * @end: the end codepoint
709: *
710: * Allocate a new regexp range
711: *
712: * Returns the new range or NULL in case of error
713: */
714: static xmlRegRangePtr
715: xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
716: int neg, xmlRegAtomType type, int start, int end) {
717: xmlRegRangePtr ret;
718:
719: ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
720: if (ret == NULL) {
721: xmlRegexpErrMemory(ctxt, "allocating range");
722: return(NULL);
723: }
724: ret->neg = neg;
725: ret->type = type;
726: ret->start = start;
727: ret->end = end;
728: return(ret);
729: }
730:
731: /**
732: * xmlRegFreeRange:
733: * @range: the regexp range
734: *
735: * Free a regexp range
736: */
737: static void
738: xmlRegFreeRange(xmlRegRangePtr range) {
739: if (range == NULL)
740: return;
741:
742: if (range->blockName != NULL)
743: xmlFree(range->blockName);
744: xmlFree(range);
745: }
746:
747: /**
748: * xmlRegCopyRange:
749: * @range: the regexp range
750: *
751: * Copy a regexp range
752: *
753: * Returns the new copy or NULL in case of error.
754: */
755: static xmlRegRangePtr
756: xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) {
757: xmlRegRangePtr ret;
758:
759: if (range == NULL)
760: return(NULL);
761:
762: ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start,
763: range->end);
764: if (ret == NULL)
765: return(NULL);
766: if (range->blockName != NULL) {
767: ret->blockName = xmlStrdup(range->blockName);
768: if (ret->blockName == NULL) {
769: xmlRegexpErrMemory(ctxt, "allocating range");
770: xmlRegFreeRange(ret);
771: return(NULL);
772: }
773: }
774: return(ret);
775: }
776:
777: /**
778: * xmlRegNewAtom:
779: * @ctxt: the regexp parser context
780: * @type: the type of atom
781: *
782: * Allocate a new atom
783: *
784: * Returns the new atom or NULL in case of error
785: */
786: static xmlRegAtomPtr
787: xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
788: xmlRegAtomPtr ret;
789:
790: ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
791: if (ret == NULL) {
792: xmlRegexpErrMemory(ctxt, "allocating atom");
793: return(NULL);
794: }
795: memset(ret, 0, sizeof(xmlRegAtom));
796: ret->type = type;
797: ret->quant = XML_REGEXP_QUANT_ONCE;
798: ret->min = 0;
799: ret->max = 0;
800: return(ret);
801: }
802:
803: /**
804: * xmlRegFreeAtom:
805: * @atom: the regexp atom
806: *
807: * Free a regexp atom
808: */
809: static void
810: xmlRegFreeAtom(xmlRegAtomPtr atom) {
811: int i;
812:
813: if (atom == NULL)
814: return;
815:
816: for (i = 0;i < atom->nbRanges;i++)
817: xmlRegFreeRange(atom->ranges[i]);
818: if (atom->ranges != NULL)
819: xmlFree(atom->ranges);
820: if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL))
821: xmlFree(atom->valuep);
822: if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL))
823: xmlFree(atom->valuep2);
824: if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL))
825: xmlFree(atom->valuep);
826: xmlFree(atom);
827: }
828:
829: /**
830: * xmlRegCopyAtom:
831: * @ctxt: the regexp parser context
832: * @atom: the oiginal atom
833: *
834: * Allocate a new regexp range
835: *
836: * Returns the new atom or NULL in case of error
837: */
838: static xmlRegAtomPtr
839: xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
840: xmlRegAtomPtr ret;
841:
842: ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
843: if (ret == NULL) {
844: xmlRegexpErrMemory(ctxt, "copying atom");
845: return(NULL);
846: }
847: memset(ret, 0, sizeof(xmlRegAtom));
848: ret->type = atom->type;
849: ret->quant = atom->quant;
850: ret->min = atom->min;
851: ret->max = atom->max;
852: if (atom->nbRanges > 0) {
853: int i;
854:
855: ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) *
856: atom->nbRanges);
857: if (ret->ranges == NULL) {
858: xmlRegexpErrMemory(ctxt, "copying atom");
859: goto error;
860: }
861: for (i = 0;i < atom->nbRanges;i++) {
862: ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]);
863: if (ret->ranges[i] == NULL)
864: goto error;
865: ret->nbRanges = i + 1;
866: }
867: }
868: return(ret);
869:
870: error:
871: xmlRegFreeAtom(ret);
872: return(NULL);
873: }
874:
875: static xmlRegStatePtr
876: xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
877: xmlRegStatePtr ret;
878:
879: ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
880: if (ret == NULL) {
881: xmlRegexpErrMemory(ctxt, "allocating state");
882: return(NULL);
883: }
884: memset(ret, 0, sizeof(xmlRegState));
885: ret->type = XML_REGEXP_TRANS_STATE;
886: ret->mark = XML_REGEXP_MARK_NORMAL;
887: return(ret);
888: }
889:
890: /**
891: * xmlRegFreeState:
892: * @state: the regexp state
893: *
894: * Free a regexp state
895: */
896: static void
897: xmlRegFreeState(xmlRegStatePtr state) {
898: if (state == NULL)
899: return;
900:
901: if (state->trans != NULL)
902: xmlFree(state->trans);
903: if (state->transTo != NULL)
904: xmlFree(state->transTo);
905: xmlFree(state);
906: }
907:
908: /**
909: * xmlRegFreeParserCtxt:
910: * @ctxt: the regexp parser context
911: *
912: * Free a regexp parser context
913: */
914: static void
915: xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
916: int i;
917: if (ctxt == NULL)
918: return;
919:
920: if (ctxt->string != NULL)
921: xmlFree(ctxt->string);
922: if (ctxt->states != NULL) {
923: for (i = 0;i < ctxt->nbStates;i++)
924: xmlRegFreeState(ctxt->states[i]);
925: xmlFree(ctxt->states);
926: }
927: if (ctxt->atoms != NULL) {
928: for (i = 0;i < ctxt->nbAtoms;i++)
929: xmlRegFreeAtom(ctxt->atoms[i]);
930: xmlFree(ctxt->atoms);
931: }
932: if (ctxt->counters != NULL)
933: xmlFree(ctxt->counters);
934: xmlFree(ctxt);
935: }
936:
937: /************************************************************************
938: * *
939: * Display of Data structures *
940: * *
941: ************************************************************************/
942:
943: static void
944: xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
945: switch (type) {
946: case XML_REGEXP_EPSILON:
947: fprintf(output, "epsilon "); break;
948: case XML_REGEXP_CHARVAL:
949: fprintf(output, "charval "); break;
950: case XML_REGEXP_RANGES:
951: fprintf(output, "ranges "); break;
952: case XML_REGEXP_SUBREG:
953: fprintf(output, "subexpr "); break;
954: case XML_REGEXP_STRING:
955: fprintf(output, "string "); break;
956: case XML_REGEXP_ANYCHAR:
957: fprintf(output, "anychar "); break;
958: case XML_REGEXP_ANYSPACE:
959: fprintf(output, "anyspace "); break;
960: case XML_REGEXP_NOTSPACE:
961: fprintf(output, "notspace "); break;
962: case XML_REGEXP_INITNAME:
963: fprintf(output, "initname "); break;
964: case XML_REGEXP_NOTINITNAME:
965: fprintf(output, "notinitname "); break;
966: case XML_REGEXP_NAMECHAR:
967: fprintf(output, "namechar "); break;
968: case XML_REGEXP_NOTNAMECHAR:
969: fprintf(output, "notnamechar "); break;
970: case XML_REGEXP_DECIMAL:
971: fprintf(output, "decimal "); break;
972: case XML_REGEXP_NOTDECIMAL:
973: fprintf(output, "notdecimal "); break;
974: case XML_REGEXP_REALCHAR:
975: fprintf(output, "realchar "); break;
976: case XML_REGEXP_NOTREALCHAR:
977: fprintf(output, "notrealchar "); break;
978: case XML_REGEXP_LETTER:
979: fprintf(output, "LETTER "); break;
980: case XML_REGEXP_LETTER_UPPERCASE:
981: fprintf(output, "LETTER_UPPERCASE "); break;
982: case XML_REGEXP_LETTER_LOWERCASE:
983: fprintf(output, "LETTER_LOWERCASE "); break;
984: case XML_REGEXP_LETTER_TITLECASE:
985: fprintf(output, "LETTER_TITLECASE "); break;
986: case XML_REGEXP_LETTER_MODIFIER:
987: fprintf(output, "LETTER_MODIFIER "); break;
988: case XML_REGEXP_LETTER_OTHERS:
989: fprintf(output, "LETTER_OTHERS "); break;
990: case XML_REGEXP_MARK:
991: fprintf(output, "MARK "); break;
992: case XML_REGEXP_MARK_NONSPACING:
993: fprintf(output, "MARK_NONSPACING "); break;
994: case XML_REGEXP_MARK_SPACECOMBINING:
995: fprintf(output, "MARK_SPACECOMBINING "); break;
996: case XML_REGEXP_MARK_ENCLOSING:
997: fprintf(output, "MARK_ENCLOSING "); break;
998: case XML_REGEXP_NUMBER:
999: fprintf(output, "NUMBER "); break;
1000: case XML_REGEXP_NUMBER_DECIMAL:
1001: fprintf(output, "NUMBER_DECIMAL "); break;
1002: case XML_REGEXP_NUMBER_LETTER:
1003: fprintf(output, "NUMBER_LETTER "); break;
1004: case XML_REGEXP_NUMBER_OTHERS:
1005: fprintf(output, "NUMBER_OTHERS "); break;
1006: case XML_REGEXP_PUNCT:
1007: fprintf(output, "PUNCT "); break;
1008: case XML_REGEXP_PUNCT_CONNECTOR:
1009: fprintf(output, "PUNCT_CONNECTOR "); break;
1010: case XML_REGEXP_PUNCT_DASH:
1011: fprintf(output, "PUNCT_DASH "); break;
1012: case XML_REGEXP_PUNCT_OPEN:
1013: fprintf(output, "PUNCT_OPEN "); break;
1014: case XML_REGEXP_PUNCT_CLOSE:
1015: fprintf(output, "PUNCT_CLOSE "); break;
1016: case XML_REGEXP_PUNCT_INITQUOTE:
1017: fprintf(output, "PUNCT_INITQUOTE "); break;
1018: case XML_REGEXP_PUNCT_FINQUOTE:
1019: fprintf(output, "PUNCT_FINQUOTE "); break;
1020: case XML_REGEXP_PUNCT_OTHERS:
1021: fprintf(output, "PUNCT_OTHERS "); break;
1022: case XML_REGEXP_SEPAR:
1023: fprintf(output, "SEPAR "); break;
1024: case XML_REGEXP_SEPAR_SPACE:
1025: fprintf(output, "SEPAR_SPACE "); break;
1026: case XML_REGEXP_SEPAR_LINE:
1027: fprintf(output, "SEPAR_LINE "); break;
1028: case XML_REGEXP_SEPAR_PARA:
1029: fprintf(output, "SEPAR_PARA "); break;
1030: case XML_REGEXP_SYMBOL:
1031: fprintf(output, "SYMBOL "); break;
1032: case XML_REGEXP_SYMBOL_MATH:
1033: fprintf(output, "SYMBOL_MATH "); break;
1034: case XML_REGEXP_SYMBOL_CURRENCY:
1035: fprintf(output, "SYMBOL_CURRENCY "); break;
1036: case XML_REGEXP_SYMBOL_MODIFIER:
1037: fprintf(output, "SYMBOL_MODIFIER "); break;
1038: case XML_REGEXP_SYMBOL_OTHERS:
1039: fprintf(output, "SYMBOL_OTHERS "); break;
1040: case XML_REGEXP_OTHER:
1041: fprintf(output, "OTHER "); break;
1042: case XML_REGEXP_OTHER_CONTROL:
1043: fprintf(output, "OTHER_CONTROL "); break;
1044: case XML_REGEXP_OTHER_FORMAT:
1045: fprintf(output, "OTHER_FORMAT "); break;
1046: case XML_REGEXP_OTHER_PRIVATE:
1047: fprintf(output, "OTHER_PRIVATE "); break;
1048: case XML_REGEXP_OTHER_NA:
1049: fprintf(output, "OTHER_NA "); break;
1050: case XML_REGEXP_BLOCK_NAME:
1051: fprintf(output, "BLOCK "); break;
1052: }
1053: }
1054:
1055: static void
1056: xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
1057: switch (type) {
1058: case XML_REGEXP_QUANT_EPSILON:
1059: fprintf(output, "epsilon "); break;
1060: case XML_REGEXP_QUANT_ONCE:
1061: fprintf(output, "once "); break;
1062: case XML_REGEXP_QUANT_OPT:
1063: fprintf(output, "? "); break;
1064: case XML_REGEXP_QUANT_MULT:
1065: fprintf(output, "* "); break;
1066: case XML_REGEXP_QUANT_PLUS:
1067: fprintf(output, "+ "); break;
1068: case XML_REGEXP_QUANT_RANGE:
1069: fprintf(output, "range "); break;
1070: case XML_REGEXP_QUANT_ONCEONLY:
1071: fprintf(output, "onceonly "); break;
1072: case XML_REGEXP_QUANT_ALL:
1073: fprintf(output, "all "); break;
1074: }
1075: }
1076: static void
1077: xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
1078: fprintf(output, " range: ");
1079: if (range->neg)
1080: fprintf(output, "negative ");
1081: xmlRegPrintAtomType(output, range->type);
1082: fprintf(output, "%c - %c\n", range->start, range->end);
1083: }
1084:
1085: static void
1086: xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
1087: fprintf(output, " atom: ");
1088: if (atom == NULL) {
1089: fprintf(output, "NULL\n");
1090: return;
1091: }
1092: if (atom->neg)
1093: fprintf(output, "not ");
1094: xmlRegPrintAtomType(output, atom->type);
1095: xmlRegPrintQuantType(output, atom->quant);
1096: if (atom->quant == XML_REGEXP_QUANT_RANGE)
1097: fprintf(output, "%d-%d ", atom->min, atom->max);
1098: if (atom->type == XML_REGEXP_STRING)
1099: fprintf(output, "'%s' ", (char *) atom->valuep);
1100: if (atom->type == XML_REGEXP_CHARVAL)
1101: fprintf(output, "char %c\n", atom->codepoint);
1102: else if (atom->type == XML_REGEXP_RANGES) {
1103: int i;
1104: fprintf(output, "%d entries\n", atom->nbRanges);
1105: for (i = 0; i < atom->nbRanges;i++)
1106: xmlRegPrintRange(output, atom->ranges[i]);
1107: } else if (atom->type == XML_REGEXP_SUBREG) {
1108: fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
1109: } else {
1110: fprintf(output, "\n");
1111: }
1112: }
1113:
1114: static void
1115: xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
1116: fprintf(output, " trans: ");
1117: if (trans == NULL) {
1118: fprintf(output, "NULL\n");
1119: return;
1120: }
1121: if (trans->to < 0) {
1122: fprintf(output, "removed\n");
1123: return;
1124: }
1125: if (trans->nd != 0) {
1126: if (trans->nd == 2)
1127: fprintf(output, "last not determinist, ");
1128: else
1129: fprintf(output, "not determinist, ");
1130: }
1131: if (trans->counter >= 0) {
1132: fprintf(output, "counted %d, ", trans->counter);
1133: }
1134: if (trans->count == REGEXP_ALL_COUNTER) {
1135: fprintf(output, "all transition, ");
1136: } else if (trans->count >= 0) {
1137: fprintf(output, "count based %d, ", trans->count);
1138: }
1139: if (trans->atom == NULL) {
1140: fprintf(output, "epsilon to %d\n", trans->to);
1141: return;
1142: }
1143: if (trans->atom->type == XML_REGEXP_CHARVAL)
1144: fprintf(output, "char %c ", trans->atom->codepoint);
1145: fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1146: }
1147:
1148: static void
1149: xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1150: int i;
1151:
1152: fprintf(output, " state: ");
1153: if (state == NULL) {
1154: fprintf(output, "NULL\n");
1155: return;
1156: }
1157: if (state->type == XML_REGEXP_START_STATE)
1158: fprintf(output, "START ");
1159: if (state->type == XML_REGEXP_FINAL_STATE)
1160: fprintf(output, "FINAL ");
1161:
1162: fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1163: for (i = 0;i < state->nbTrans; i++) {
1164: xmlRegPrintTrans(output, &(state->trans[i]));
1165: }
1166: }
1167:
1168: #ifdef DEBUG_REGEXP_GRAPH
1169: static void
1170: xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
1171: int i;
1172:
1173: fprintf(output, " ctxt: ");
1174: if (ctxt == NULL) {
1175: fprintf(output, "NULL\n");
1176: return;
1177: }
1178: fprintf(output, "'%s' ", ctxt->string);
1179: if (ctxt->error)
1180: fprintf(output, "error ");
1181: if (ctxt->neg)
1182: fprintf(output, "neg ");
1183: fprintf(output, "\n");
1184: fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
1185: for (i = 0;i < ctxt->nbAtoms; i++) {
1186: fprintf(output, " %02d ", i);
1187: xmlRegPrintAtom(output, ctxt->atoms[i]);
1188: }
1189: if (ctxt->atom != NULL) {
1190: fprintf(output, "current atom:\n");
1191: xmlRegPrintAtom(output, ctxt->atom);
1192: }
1193: fprintf(output, "%d states:", ctxt->nbStates);
1194: if (ctxt->start != NULL)
1195: fprintf(output, " start: %d", ctxt->start->no);
1196: if (ctxt->end != NULL)
1197: fprintf(output, " end: %d", ctxt->end->no);
1198: fprintf(output, "\n");
1199: for (i = 0;i < ctxt->nbStates; i++) {
1200: xmlRegPrintState(output, ctxt->states[i]);
1201: }
1202: fprintf(output, "%d counters:\n", ctxt->nbCounters);
1203: for (i = 0;i < ctxt->nbCounters; i++) {
1204: fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min,
1205: ctxt->counters[i].max);
1206: }
1207: }
1208: #endif
1209:
1210: /************************************************************************
1211: * *
1212: * Finite Automata structures manipulations *
1213: * *
1214: ************************************************************************/
1215:
1216: static void
1217: xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1218: int neg, xmlRegAtomType type, int start, int end,
1219: xmlChar *blockName) {
1220: xmlRegRangePtr range;
1221:
1222: if (atom == NULL) {
1223: ERROR("add range: atom is NULL");
1224: return;
1225: }
1226: if (atom->type != XML_REGEXP_RANGES) {
1227: ERROR("add range: atom is not ranges");
1228: return;
1229: }
1230: if (atom->maxRanges == 0) {
1231: atom->maxRanges = 4;
1232: atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
1233: sizeof(xmlRegRangePtr));
1234: if (atom->ranges == NULL) {
1235: xmlRegexpErrMemory(ctxt, "adding ranges");
1236: atom->maxRanges = 0;
1237: return;
1238: }
1239: } else if (atom->nbRanges >= atom->maxRanges) {
1240: xmlRegRangePtr *tmp;
1241: atom->maxRanges *= 2;
1242: tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1243: sizeof(xmlRegRangePtr));
1244: if (tmp == NULL) {
1245: xmlRegexpErrMemory(ctxt, "adding ranges");
1246: atom->maxRanges /= 2;
1247: return;
1248: }
1249: atom->ranges = tmp;
1250: }
1251: range = xmlRegNewRange(ctxt, neg, type, start, end);
1252: if (range == NULL)
1253: return;
1254: range->blockName = blockName;
1255: atom->ranges[atom->nbRanges++] = range;
1256:
1257: }
1258:
1259: static int
1260: xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1261: if (ctxt->maxCounters == 0) {
1262: ctxt->maxCounters = 4;
1263: ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1264: sizeof(xmlRegCounter));
1265: if (ctxt->counters == NULL) {
1266: xmlRegexpErrMemory(ctxt, "allocating counter");
1267: ctxt->maxCounters = 0;
1268: return(-1);
1269: }
1270: } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1271: xmlRegCounter *tmp;
1272: ctxt->maxCounters *= 2;
1273: tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1274: sizeof(xmlRegCounter));
1275: if (tmp == NULL) {
1276: xmlRegexpErrMemory(ctxt, "allocating counter");
1277: ctxt->maxCounters /= 2;
1278: return(-1);
1279: }
1280: ctxt->counters = tmp;
1281: }
1282: ctxt->counters[ctxt->nbCounters].min = -1;
1283: ctxt->counters[ctxt->nbCounters].max = -1;
1284: return(ctxt->nbCounters++);
1285: }
1286:
1287: static int
1288: xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1289: if (atom == NULL) {
1290: ERROR("atom push: atom is NULL");
1291: return(-1);
1292: }
1293: if (ctxt->maxAtoms == 0) {
1294: ctxt->maxAtoms = 4;
1295: ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
1296: sizeof(xmlRegAtomPtr));
1297: if (ctxt->atoms == NULL) {
1298: xmlRegexpErrMemory(ctxt, "pushing atom");
1299: ctxt->maxAtoms = 0;
1300: return(-1);
1301: }
1302: } else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1303: xmlRegAtomPtr *tmp;
1304: ctxt->maxAtoms *= 2;
1305: tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
1306: sizeof(xmlRegAtomPtr));
1307: if (tmp == NULL) {
1308: xmlRegexpErrMemory(ctxt, "allocating counter");
1309: ctxt->maxAtoms /= 2;
1310: return(-1);
1311: }
1312: ctxt->atoms = tmp;
1313: }
1314: atom->no = ctxt->nbAtoms;
1315: ctxt->atoms[ctxt->nbAtoms++] = atom;
1316: return(0);
1317: }
1318:
1319: static void
1320: xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target,
1321: int from) {
1322: if (target->maxTransTo == 0) {
1323: target->maxTransTo = 8;
1324: target->transTo = (int *) xmlMalloc(target->maxTransTo *
1325: sizeof(int));
1326: if (target->transTo == NULL) {
1327: xmlRegexpErrMemory(ctxt, "adding transition");
1328: target->maxTransTo = 0;
1329: return;
1330: }
1331: } else if (target->nbTransTo >= target->maxTransTo) {
1332: int *tmp;
1333: target->maxTransTo *= 2;
1334: tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo *
1335: sizeof(int));
1336: if (tmp == NULL) {
1337: xmlRegexpErrMemory(ctxt, "adding transition");
1338: target->maxTransTo /= 2;
1339: return;
1340: }
1341: target->transTo = tmp;
1342: }
1343: target->transTo[target->nbTransTo] = from;
1344: target->nbTransTo++;
1345: }
1346:
1347: static void
1348: xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1349: xmlRegAtomPtr atom, xmlRegStatePtr target,
1350: int counter, int count) {
1351:
1352: int nrtrans;
1353:
1354: if (state == NULL) {
1355: ERROR("add state: state is NULL");
1356: return;
1357: }
1358: if (target == NULL) {
1359: ERROR("add state: target is NULL");
1360: return;
1361: }
1362: /*
1363: * Other routines follow the philosophy 'When in doubt, add a transition'
1364: * so we check here whether such a transition is already present and, if
1365: * so, silently ignore this request.
1366: */
1367:
1368: for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) {
1369: xmlRegTransPtr trans = &(state->trans[nrtrans]);
1370: if ((trans->atom == atom) &&
1371: (trans->to == target->no) &&
1372: (trans->counter == counter) &&
1373: (trans->count == count)) {
1374: #ifdef DEBUG_REGEXP_GRAPH
1375: printf("Ignoring duplicate transition from %d to %d\n",
1376: state->no, target->no);
1377: #endif
1378: return;
1379: }
1380: }
1381:
1382: if (state->maxTrans == 0) {
1383: state->maxTrans = 8;
1384: state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1385: sizeof(xmlRegTrans));
1386: if (state->trans == NULL) {
1387: xmlRegexpErrMemory(ctxt, "adding transition");
1388: state->maxTrans = 0;
1389: return;
1390: }
1391: } else if (state->nbTrans >= state->maxTrans) {
1392: xmlRegTrans *tmp;
1393: state->maxTrans *= 2;
1394: tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1395: sizeof(xmlRegTrans));
1396: if (tmp == NULL) {
1397: xmlRegexpErrMemory(ctxt, "adding transition");
1398: state->maxTrans /= 2;
1399: return;
1400: }
1401: state->trans = tmp;
1402: }
1403: #ifdef DEBUG_REGEXP_GRAPH
1404: printf("Add trans from %d to %d ", state->no, target->no);
1405: if (count == REGEXP_ALL_COUNTER)
1406: printf("all transition\n");
1407: else if (count >= 0)
1408: printf("count based %d\n", count);
1409: else if (counter >= 0)
1410: printf("counted %d\n", counter);
1411: else if (atom == NULL)
1412: printf("epsilon transition\n");
1413: else if (atom != NULL)
1414: xmlRegPrintAtom(stdout, atom);
1415: #endif
1416:
1417: state->trans[state->nbTrans].atom = atom;
1418: state->trans[state->nbTrans].to = target->no;
1419: state->trans[state->nbTrans].counter = counter;
1420: state->trans[state->nbTrans].count = count;
1421: state->trans[state->nbTrans].nd = 0;
1422: state->nbTrans++;
1423: xmlRegStateAddTransTo(ctxt, target, state->no);
1424: }
1425:
1426: static int
1427: xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
1428: if (state == NULL) return(-1);
1429: if (ctxt->maxStates == 0) {
1430: ctxt->maxStates = 4;
1431: ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
1432: sizeof(xmlRegStatePtr));
1433: if (ctxt->states == NULL) {
1434: xmlRegexpErrMemory(ctxt, "adding state");
1435: ctxt->maxStates = 0;
1436: return(-1);
1437: }
1438: } else if (ctxt->nbStates >= ctxt->maxStates) {
1439: xmlRegStatePtr *tmp;
1440: ctxt->maxStates *= 2;
1441: tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
1442: sizeof(xmlRegStatePtr));
1443: if (tmp == NULL) {
1444: xmlRegexpErrMemory(ctxt, "adding state");
1445: ctxt->maxStates /= 2;
1446: return(-1);
1447: }
1448: ctxt->states = tmp;
1449: }
1450: state->no = ctxt->nbStates;
1451: ctxt->states[ctxt->nbStates++] = state;
1452: return(0);
1453: }
1454:
1455: /**
1456: * xmlFAGenerateAllTransition:
1457: * @ctxt: a regexp parser context
1458: * @from: the from state
1459: * @to: the target state or NULL for building a new one
1460: * @lax:
1461: *
1462: */
1463: static void
1464: xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
1465: xmlRegStatePtr from, xmlRegStatePtr to,
1466: int lax) {
1467: if (to == NULL) {
1468: to = xmlRegNewState(ctxt);
1469: xmlRegStatePush(ctxt, to);
1470: ctxt->state = to;
1471: }
1472: if (lax)
1473: xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
1474: else
1475: xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
1476: }
1477:
1478: /**
1479: * xmlFAGenerateEpsilonTransition:
1480: * @ctxt: a regexp parser context
1481: * @from: the from state
1482: * @to: the target state or NULL for building a new one
1483: *
1484: */
1485: static void
1486: xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1487: xmlRegStatePtr from, xmlRegStatePtr to) {
1488: if (to == NULL) {
1489: to = xmlRegNewState(ctxt);
1490: xmlRegStatePush(ctxt, to);
1491: ctxt->state = to;
1492: }
1493: xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
1494: }
1495:
1496: /**
1497: * xmlFAGenerateCountedEpsilonTransition:
1498: * @ctxt: a regexp parser context
1499: * @from: the from state
1500: * @to: the target state or NULL for building a new one
1501: * counter: the counter for that transition
1502: *
1503: */
1504: static void
1505: xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1506: xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1507: if (to == NULL) {
1508: to = xmlRegNewState(ctxt);
1509: xmlRegStatePush(ctxt, to);
1510: ctxt->state = to;
1511: }
1512: xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
1513: }
1514:
1515: /**
1516: * xmlFAGenerateCountedTransition:
1517: * @ctxt: a regexp parser context
1518: * @from: the from state
1519: * @to: the target state or NULL for building a new one
1520: * counter: the counter for that transition
1521: *
1522: */
1523: static void
1524: xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1525: xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1526: if (to == NULL) {
1527: to = xmlRegNewState(ctxt);
1528: xmlRegStatePush(ctxt, to);
1529: ctxt->state = to;
1530: }
1531: xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
1532: }
1533:
1534: /**
1535: * xmlFAGenerateTransitions:
1536: * @ctxt: a regexp parser context
1537: * @from: the from state
1538: * @to: the target state or NULL for building a new one
1539: * @atom: the atom generating the transition
1540: *
1541: * Returns 0 if success and -1 in case of error.
1542: */
1543: static int
1544: xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1545: xmlRegStatePtr to, xmlRegAtomPtr atom) {
1546: xmlRegStatePtr end;
1547:
1548: if (atom == NULL) {
1549: ERROR("genrate transition: atom == NULL");
1550: return(-1);
1551: }
1552: if (atom->type == XML_REGEXP_SUBREG) {
1553: /*
1554: * this is a subexpression handling one should not need to
1555: * create a new node except for XML_REGEXP_QUANT_RANGE.
1556: */
1557: if (xmlRegAtomPush(ctxt, atom) < 0) {
1558: return(-1);
1559: }
1560: if ((to != NULL) && (atom->stop != to) &&
1561: (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1562: /*
1563: * Generate an epsilon transition to link to the target
1564: */
1565: xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1566: #ifdef DV
1567: } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) &&
1568: (atom->quant != XML_REGEXP_QUANT_ONCE)) {
1569: to = xmlRegNewState(ctxt);
1570: xmlRegStatePush(ctxt, to);
1571: ctxt->state = to;
1572: xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1573: #endif
1574: }
1575: switch (atom->quant) {
1576: case XML_REGEXP_QUANT_OPT:
1577: atom->quant = XML_REGEXP_QUANT_ONCE;
1578: /*
1579: * transition done to the state after end of atom.
1580: * 1. set transition from atom start to new state
1581: * 2. set transition from atom end to this state.
1582: */
1583: if (to == NULL) {
1584: xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0);
1585: xmlFAGenerateEpsilonTransition(ctxt, atom->stop,
1586: ctxt->state);
1587: } else {
1588: xmlFAGenerateEpsilonTransition(ctxt, atom->start, to);
1589: }
1590: break;
1591: case XML_REGEXP_QUANT_MULT:
1592: atom->quant = XML_REGEXP_QUANT_ONCE;
1593: xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1594: xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1595: break;
1596: case XML_REGEXP_QUANT_PLUS:
1597: atom->quant = XML_REGEXP_QUANT_ONCE;
1598: xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1599: break;
1600: case XML_REGEXP_QUANT_RANGE: {
1601: int counter;
1602: xmlRegStatePtr inter, newstate;
1603:
1604: /*
1605: * create the final state now if needed
1606: */
1607: if (to != NULL) {
1608: newstate = to;
1609: } else {
1610: newstate = xmlRegNewState(ctxt);
1611: xmlRegStatePush(ctxt, newstate);
1612: }
1613:
1614: /*
1615: * The principle here is to use counted transition
1616: * to avoid explosion in the number of states in the
1617: * graph. This is clearly more complex but should not
1618: * be exploitable at runtime.
1619: */
1620: if ((atom->min == 0) && (atom->start0 == NULL)) {
1621: xmlRegAtomPtr copy;
1622: /*
1623: * duplicate a transition based on atom to count next
1624: * occurences after 1. We cannot loop to atom->start
1625: * directly because we need an epsilon transition to
1626: * newstate.
1627: */
1628: /* ???? For some reason it seems we never reach that
1629: case, I suppose this got optimized out before when
1630: building the automata */
1631: copy = xmlRegCopyAtom(ctxt, atom);
1632: if (copy == NULL)
1633: return(-1);
1634: copy->quant = XML_REGEXP_QUANT_ONCE;
1635: copy->min = 0;
1636: copy->max = 0;
1637:
1638: if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy)
1639: < 0)
1640: return(-1);
1641: inter = ctxt->state;
1642: counter = xmlRegGetCounter(ctxt);
1643: ctxt->counters[counter].min = atom->min - 1;
1644: ctxt->counters[counter].max = atom->max - 1;
1645: /* count the number of times we see it again */
1646: xmlFAGenerateCountedEpsilonTransition(ctxt, inter,
1647: atom->stop, counter);
1648: /* allow a way out based on the count */
1649: xmlFAGenerateCountedTransition(ctxt, inter,
1650: newstate, counter);
1651: /* and also allow a direct exit for 0 */
1652: xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1653: newstate);
1654: } else {
1655: /*
1656: * either we need the atom at least once or there
1657: * is an atom->start0 allowing to easilly plug the
1658: * epsilon transition.
1659: */
1660: counter = xmlRegGetCounter(ctxt);
1661: ctxt->counters[counter].min = atom->min - 1;
1662: ctxt->counters[counter].max = atom->max - 1;
1663: /* count the number of times we see it again */
1664: xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1665: atom->start, counter);
1666: /* allow a way out based on the count */
1667: xmlFAGenerateCountedTransition(ctxt, atom->stop,
1668: newstate, counter);
1669: /* and if needed allow a direct exit for 0 */
1670: if (atom->min == 0)
1671: xmlFAGenerateEpsilonTransition(ctxt, atom->start0,
1672: newstate);
1673:
1674: }
1675: atom->min = 0;
1676: atom->max = 0;
1677: atom->quant = XML_REGEXP_QUANT_ONCE;
1678: ctxt->state = newstate;
1679: }
1680: default:
1681: break;
1682: }
1683: return(0);
1684: }
1685: if ((atom->min == 0) && (atom->max == 0) &&
1686: (atom->quant == XML_REGEXP_QUANT_RANGE)) {
1687: /*
1688: * we can discard the atom and generate an epsilon transition instead
1689: */
1690: if (to == NULL) {
1691: to = xmlRegNewState(ctxt);
1692: if (to != NULL)
1693: xmlRegStatePush(ctxt, to);
1694: else {
1695: return(-1);
1696: }
1697: }
1698: xmlFAGenerateEpsilonTransition(ctxt, from, to);
1699: ctxt->state = to;
1700: xmlRegFreeAtom(atom);
1701: return(0);
1702: }
1703: if (to == NULL) {
1704: to = xmlRegNewState(ctxt);
1705: if (to != NULL)
1706: xmlRegStatePush(ctxt, to);
1707: else {
1708: return(-1);
1709: }
1710: }
1711: end = to;
1712: if ((atom->quant == XML_REGEXP_QUANT_MULT) ||
1713: (atom->quant == XML_REGEXP_QUANT_PLUS)) {
1714: /*
1715: * Do not pollute the target state by adding transitions from
1716: * it as it is likely to be the shared target of multiple branches.
1717: * So isolate with an epsilon transition.
1718: */
1719: xmlRegStatePtr tmp;
1720:
1721: tmp = xmlRegNewState(ctxt);
1722: if (tmp != NULL)
1723: xmlRegStatePush(ctxt, tmp);
1724: else {
1725: return(-1);
1726: }
1727: xmlFAGenerateEpsilonTransition(ctxt, tmp, to);
1728: to = tmp;
1729: }
1730: if (xmlRegAtomPush(ctxt, atom) < 0) {
1731: return(-1);
1732: }
1733: xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
1734: ctxt->state = end;
1735: switch (atom->quant) {
1736: case XML_REGEXP_QUANT_OPT:
1737: atom->quant = XML_REGEXP_QUANT_ONCE;
1738: xmlFAGenerateEpsilonTransition(ctxt, from, to);
1739: break;
1740: case XML_REGEXP_QUANT_MULT:
1741: atom->quant = XML_REGEXP_QUANT_ONCE;
1742: xmlFAGenerateEpsilonTransition(ctxt, from, to);
1743: xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1744: break;
1745: case XML_REGEXP_QUANT_PLUS:
1746: atom->quant = XML_REGEXP_QUANT_ONCE;
1747: xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1748: break;
1749: case XML_REGEXP_QUANT_RANGE:
1750: #if DV_test
1751: if (atom->min == 0) {
1752: xmlFAGenerateEpsilonTransition(ctxt, from, to);
1753: }
1754: #endif
1755: break;
1756: default:
1757: break;
1758: }
1759: return(0);
1760: }
1761:
1762: /**
1763: * xmlFAReduceEpsilonTransitions:
1764: * @ctxt: a regexp parser context
1765: * @fromnr: the from state
1766: * @tonr: the to state
1767: * @counter: should that transition be associated to a counted
1768: *
1769: */
1770: static void
1771: xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1772: int tonr, int counter) {
1773: int transnr;
1774: xmlRegStatePtr from;
1775: xmlRegStatePtr to;
1776:
1777: #ifdef DEBUG_REGEXP_GRAPH
1778: printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1779: #endif
1780: from = ctxt->states[fromnr];
1781: if (from == NULL)
1782: return;
1783: to = ctxt->states[tonr];
1784: if (to == NULL)
1785: return;
1786: if ((to->mark == XML_REGEXP_MARK_START) ||
1787: (to->mark == XML_REGEXP_MARK_VISITED))
1788: return;
1789:
1790: to->mark = XML_REGEXP_MARK_VISITED;
1791: if (to->type == XML_REGEXP_FINAL_STATE) {
1792: #ifdef DEBUG_REGEXP_GRAPH
1793: printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1794: #endif
1795: from->type = XML_REGEXP_FINAL_STATE;
1796: }
1797: for (transnr = 0;transnr < to->nbTrans;transnr++) {
1798: if (to->trans[transnr].to < 0)
1799: continue;
1800: if (to->trans[transnr].atom == NULL) {
1801: /*
1802: * Don't remove counted transitions
1803: * Don't loop either
1804: */
1805: if (to->trans[transnr].to != fromnr) {
1806: if (to->trans[transnr].count >= 0) {
1807: int newto = to->trans[transnr].to;
1808:
1809: xmlRegStateAddTrans(ctxt, from, NULL,
1810: ctxt->states[newto],
1811: -1, to->trans[transnr].count);
1812: } else {
1813: #ifdef DEBUG_REGEXP_GRAPH
1814: printf("Found epsilon trans %d from %d to %d\n",
1815: transnr, tonr, to->trans[transnr].to);
1816: #endif
1817: if (to->trans[transnr].counter >= 0) {
1818: xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1819: to->trans[transnr].to,
1820: to->trans[transnr].counter);
1821: } else {
1822: xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1823: to->trans[transnr].to,
1824: counter);
1825: }
1826: }
1827: }
1828: } else {
1829: int newto = to->trans[transnr].to;
1830:
1831: if (to->trans[transnr].counter >= 0) {
1832: xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1833: ctxt->states[newto],
1834: to->trans[transnr].counter, -1);
1835: } else {
1836: xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1837: ctxt->states[newto], counter, -1);
1838: }
1839: }
1840: }
1841: to->mark = XML_REGEXP_MARK_NORMAL;
1842: }
1843:
1844: /**
1845: * xmlFAEliminateSimpleEpsilonTransitions:
1846: * @ctxt: a regexp parser context
1847: *
1848: * Eliminating general epsilon transitions can get costly in the general
1849: * algorithm due to the large amount of generated new transitions and
1850: * associated comparisons. However for simple epsilon transition used just
1851: * to separate building blocks when generating the automata this can be
1852: * reduced to state elimination:
1853: * - if there exists an epsilon from X to Y
1854: * - if there is no other transition from X
1855: * then X and Y are semantically equivalent and X can be eliminated
1856: * If X is the start state then make Y the start state, else replace the
1857: * target of all transitions to X by transitions to Y.
1858: */
1859: static void
1860: xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1861: int statenr, i, j, newto;
1862: xmlRegStatePtr state, tmp;
1863:
1864: for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1865: state = ctxt->states[statenr];
1866: if (state == NULL)
1867: continue;
1868: if (state->nbTrans != 1)
1869: continue;
1870: if (state->type == XML_REGEXP_UNREACH_STATE)
1871: continue;
1872: /* is the only transition out a basic transition */
1873: if ((state->trans[0].atom == NULL) &&
1874: (state->trans[0].to >= 0) &&
1875: (state->trans[0].to != statenr) &&
1876: (state->trans[0].counter < 0) &&
1877: (state->trans[0].count < 0)) {
1878: newto = state->trans[0].to;
1879:
1880: if (state->type == XML_REGEXP_START_STATE) {
1881: #ifdef DEBUG_REGEXP_GRAPH
1882: printf("Found simple epsilon trans from start %d to %d\n",
1883: statenr, newto);
1884: #endif
1885: } else {
1886: #ifdef DEBUG_REGEXP_GRAPH
1887: printf("Found simple epsilon trans from %d to %d\n",
1888: statenr, newto);
1889: #endif
1890: for (i = 0;i < state->nbTransTo;i++) {
1891: tmp = ctxt->states[state->transTo[i]];
1892: for (j = 0;j < tmp->nbTrans;j++) {
1893: if (tmp->trans[j].to == statenr) {
1894: #ifdef DEBUG_REGEXP_GRAPH
1895: printf("Changed transition %d on %d to go to %d\n",
1896: j, tmp->no, newto);
1897: #endif
1898: tmp->trans[j].to = -1;
1899: xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom,
1900: ctxt->states[newto],
1901: tmp->trans[j].counter,
1902: tmp->trans[j].count);
1903: }
1904: }
1905: }
1906: if (state->type == XML_REGEXP_FINAL_STATE)
1907: ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE;
1908: /* eliminate the transition completely */
1909: state->nbTrans = 0;
1910:
1911: state->type = XML_REGEXP_UNREACH_STATE;
1912:
1913: }
1914:
1915: }
1916: }
1917: }
1918: /**
1919: * xmlFAEliminateEpsilonTransitions:
1920: * @ctxt: a regexp parser context
1921: *
1922: */
1923: static void
1924: xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1925: int statenr, transnr;
1926: xmlRegStatePtr state;
1927: int has_epsilon;
1928:
1929: if (ctxt->states == NULL) return;
1930:
1931: /*
1932: * Eliminate simple epsilon transition and the associated unreachable
1933: * states.
1934: */
1935: xmlFAEliminateSimpleEpsilonTransitions(ctxt);
1936: for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1937: state = ctxt->states[statenr];
1938: if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) {
1939: #ifdef DEBUG_REGEXP_GRAPH
1940: printf("Removed unreachable state %d\n", statenr);
1941: #endif
1942: xmlRegFreeState(state);
1943: ctxt->states[statenr] = NULL;
1944: }
1945: }
1946:
1947: has_epsilon = 0;
1948:
1949: /*
1950: * Build the completed transitions bypassing the epsilons
1951: * Use a marking algorithm to avoid loops
1952: * Mark sink states too.
1953: * Process from the latests states backward to the start when
1954: * there is long cascading epsilon chains this minimize the
1955: * recursions and transition compares when adding the new ones
1956: */
1957: for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) {
1958: state = ctxt->states[statenr];
1959: if (state == NULL)
1960: continue;
1961: if ((state->nbTrans == 0) &&
1962: (state->type != XML_REGEXP_FINAL_STATE)) {
1963: state->type = XML_REGEXP_SINK_STATE;
1964: }
1965: for (transnr = 0;transnr < state->nbTrans;transnr++) {
1966: if ((state->trans[transnr].atom == NULL) &&
1967: (state->trans[transnr].to >= 0)) {
1968: if (state->trans[transnr].to == statenr) {
1969: state->trans[transnr].to = -1;
1970: #ifdef DEBUG_REGEXP_GRAPH
1971: printf("Removed loopback epsilon trans %d on %d\n",
1972: transnr, statenr);
1973: #endif
1974: } else if (state->trans[transnr].count < 0) {
1975: int newto = state->trans[transnr].to;
1976:
1977: #ifdef DEBUG_REGEXP_GRAPH
1978: printf("Found epsilon trans %d from %d to %d\n",
1979: transnr, statenr, newto);
1980: #endif
1981: has_epsilon = 1;
1982: state->trans[transnr].to = -2;
1983: state->mark = XML_REGEXP_MARK_START;
1984: xmlFAReduceEpsilonTransitions(ctxt, statenr,
1985: newto, state->trans[transnr].counter);
1986: state->mark = XML_REGEXP_MARK_NORMAL;
1987: #ifdef DEBUG_REGEXP_GRAPH
1988: } else {
1989: printf("Found counted transition %d on %d\n",
1990: transnr, statenr);
1991: #endif
1992: }
1993: }
1994: }
1995: }
1996: /*
1997: * Eliminate the epsilon transitions
1998: */
1999: if (has_epsilon) {
2000: for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2001: state = ctxt->states[statenr];
2002: if (state == NULL)
2003: continue;
2004: for (transnr = 0;transnr < state->nbTrans;transnr++) {
2005: xmlRegTransPtr trans = &(state->trans[transnr]);
2006: if ((trans->atom == NULL) &&
2007: (trans->count < 0) &&
2008: (trans->to >= 0)) {
2009: trans->to = -1;
2010: }
2011: }
2012: }
2013: }
2014:
2015: /*
2016: * Use this pass to detect unreachable states too
2017: */
2018: for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2019: state = ctxt->states[statenr];
2020: if (state != NULL)
2021: state->reached = XML_REGEXP_MARK_NORMAL;
2022: }
2023: state = ctxt->states[0];
2024: if (state != NULL)
2025: state->reached = XML_REGEXP_MARK_START;
2026: while (state != NULL) {
2027: xmlRegStatePtr target = NULL;
2028: state->reached = XML_REGEXP_MARK_VISITED;
2029: /*
2030: * Mark all states reachable from the current reachable state
2031: */
2032: for (transnr = 0;transnr < state->nbTrans;transnr++) {
2033: if ((state->trans[transnr].to >= 0) &&
2034: ((state->trans[transnr].atom != NULL) ||
2035: (state->trans[transnr].count >= 0))) {
2036: int newto = state->trans[transnr].to;
2037:
2038: if (ctxt->states[newto] == NULL)
2039: continue;
2040: if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
2041: ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
2042: target = ctxt->states[newto];
2043: }
2044: }
2045: }
2046:
2047: /*
2048: * find the next accessible state not explored
2049: */
2050: if (target == NULL) {
2051: for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
2052: state = ctxt->states[statenr];
2053: if ((state != NULL) && (state->reached ==
2054: XML_REGEXP_MARK_START)) {
2055: target = state;
2056: break;
2057: }
2058: }
2059: }
2060: state = target;
2061: }
2062: for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2063: state = ctxt->states[statenr];
2064: if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
2065: #ifdef DEBUG_REGEXP_GRAPH
2066: printf("Removed unreachable state %d\n", statenr);
2067: #endif
2068: xmlRegFreeState(state);
2069: ctxt->states[statenr] = NULL;
2070: }
2071: }
2072:
2073: }
2074:
2075: static int
2076: xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) {
2077: int ret = 0;
2078:
2079: if ((range1->type == XML_REGEXP_RANGES) ||
2080: (range2->type == XML_REGEXP_RANGES) ||
2081: (range2->type == XML_REGEXP_SUBREG) ||
2082: (range1->type == XML_REGEXP_SUBREG) ||
2083: (range1->type == XML_REGEXP_STRING) ||
2084: (range2->type == XML_REGEXP_STRING))
2085: return(-1);
2086:
2087: /* put them in order */
2088: if (range1->type > range2->type) {
2089: xmlRegRangePtr tmp;
2090:
2091: tmp = range1;
2092: range1 = range2;
2093: range2 = tmp;
2094: }
2095: if ((range1->type == XML_REGEXP_ANYCHAR) ||
2096: (range2->type == XML_REGEXP_ANYCHAR)) {
2097: ret = 1;
2098: } else if ((range1->type == XML_REGEXP_EPSILON) ||
2099: (range2->type == XML_REGEXP_EPSILON)) {
2100: return(0);
2101: } else if (range1->type == range2->type) {
2102: if (range1->type != XML_REGEXP_CHARVAL)
2103: ret = 1;
2104: else if ((range1->end < range2->start) ||
2105: (range2->end < range1->start))
2106: ret = 0;
2107: else
2108: ret = 1;
2109: } else if (range1->type == XML_REGEXP_CHARVAL) {
2110: int codepoint;
2111: int neg = 0;
2112:
2113: /*
2114: * just check all codepoints in the range for acceptance,
2115: * this is usually way cheaper since done only once at
2116: * compilation than testing over and over at runtime or
2117: * pushing too many states when evaluating.
2118: */
2119: if (((range1->neg == 0) && (range2->neg != 0)) ||
2120: ((range1->neg != 0) && (range2->neg == 0)))
2121: neg = 1;
2122:
2123: for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) {
2124: ret = xmlRegCheckCharacterRange(range2->type, codepoint,
2125: 0, range2->start, range2->end,
2126: range2->blockName);
2127: if (ret < 0)
2128: return(-1);
2129: if (((neg == 1) && (ret == 0)) ||
2130: ((neg == 0) && (ret == 1)))
2131: return(1);
2132: }
2133: return(0);
2134: } else if ((range1->type == XML_REGEXP_BLOCK_NAME) ||
2135: (range2->type == XML_REGEXP_BLOCK_NAME)) {
2136: if (range1->type == range2->type) {
2137: ret = xmlStrEqual(range1->blockName, range2->blockName);
2138: } else {
2139: /*
2140: * comparing a block range with anything else is way
2141: * too costly, and maintining the table is like too much
2142: * memory too, so let's force the automata to save state
2143: * here.
2144: */
2145: return(1);
2146: }
2147: } else if ((range1->type < XML_REGEXP_LETTER) ||
2148: (range2->type < XML_REGEXP_LETTER)) {
2149: if ((range1->type == XML_REGEXP_ANYSPACE) &&
2150: (range2->type == XML_REGEXP_NOTSPACE))
2151: ret = 0;
2152: else if ((range1->type == XML_REGEXP_INITNAME) &&
2153: (range2->type == XML_REGEXP_NOTINITNAME))
2154: ret = 0;
2155: else if ((range1->type == XML_REGEXP_NAMECHAR) &&
2156: (range2->type == XML_REGEXP_NOTNAMECHAR))
2157: ret = 0;
2158: else if ((range1->type == XML_REGEXP_DECIMAL) &&
2159: (range2->type == XML_REGEXP_NOTDECIMAL))
2160: ret = 0;
2161: else if ((range1->type == XML_REGEXP_REALCHAR) &&
2162: (range2->type == XML_REGEXP_NOTREALCHAR))
2163: ret = 0;
2164: else {
2165: /* same thing to limit complexity */
2166: return(1);
2167: }
2168: } else {
2169: ret = 0;
2170: /* range1->type < range2->type here */
2171: switch (range1->type) {
2172: case XML_REGEXP_LETTER:
2173: /* all disjoint except in the subgroups */
2174: if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) ||
2175: (range2->type == XML_REGEXP_LETTER_LOWERCASE) ||
2176: (range2->type == XML_REGEXP_LETTER_TITLECASE) ||
2177: (range2->type == XML_REGEXP_LETTER_MODIFIER) ||
2178: (range2->type == XML_REGEXP_LETTER_OTHERS))
2179: ret = 1;
2180: break;
2181: case XML_REGEXP_MARK:
2182: if ((range2->type == XML_REGEXP_MARK_NONSPACING) ||
2183: (range2->type == XML_REGEXP_MARK_SPACECOMBINING) ||
2184: (range2->type == XML_REGEXP_MARK_ENCLOSING))
2185: ret = 1;
2186: break;
2187: case XML_REGEXP_NUMBER:
2188: if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) ||
2189: (range2->type == XML_REGEXP_NUMBER_LETTER) ||
2190: (range2->type == XML_REGEXP_NUMBER_OTHERS))
2191: ret = 1;
2192: break;
2193: case XML_REGEXP_PUNCT:
2194: if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) ||
2195: (range2->type == XML_REGEXP_PUNCT_DASH) ||
2196: (range2->type == XML_REGEXP_PUNCT_OPEN) ||
2197: (range2->type == XML_REGEXP_PUNCT_CLOSE) ||
2198: (range2->type == XML_REGEXP_PUNCT_INITQUOTE) ||
2199: (range2->type == XML_REGEXP_PUNCT_FINQUOTE) ||
2200: (range2->type == XML_REGEXP_PUNCT_OTHERS))
2201: ret = 1;
2202: break;
2203: case XML_REGEXP_SEPAR:
2204: if ((range2->type == XML_REGEXP_SEPAR_SPACE) ||
2205: (range2->type == XML_REGEXP_SEPAR_LINE) ||
2206: (range2->type == XML_REGEXP_SEPAR_PARA))
2207: ret = 1;
2208: break;
2209: case XML_REGEXP_SYMBOL:
2210: if ((range2->type == XML_REGEXP_SYMBOL_MATH) ||
2211: (range2->type == XML_REGEXP_SYMBOL_CURRENCY) ||
2212: (range2->type == XML_REGEXP_SYMBOL_MODIFIER) ||
2213: (range2->type == XML_REGEXP_SYMBOL_OTHERS))
2214: ret = 1;
2215: break;
2216: case XML_REGEXP_OTHER:
2217: if ((range2->type == XML_REGEXP_OTHER_CONTROL) ||
2218: (range2->type == XML_REGEXP_OTHER_FORMAT) ||
2219: (range2->type == XML_REGEXP_OTHER_PRIVATE))
2220: ret = 1;
2221: break;
2222: default:
2223: if ((range2->type >= XML_REGEXP_LETTER) &&
2224: (range2->type < XML_REGEXP_BLOCK_NAME))
2225: ret = 0;
2226: else {
2227: /* safety net ! */
2228: return(1);
2229: }
2230: }
2231: }
2232: if (((range1->neg == 0) && (range2->neg != 0)) ||
2233: ((range1->neg != 0) && (range2->neg == 0)))
2234: ret = !ret;
2235: return(ret);
2236: }
2237:
2238: /**
2239: * xmlFACompareAtomTypes:
2240: * @type1: an atom type
2241: * @type2: an atom type
2242: *
2243: * Compares two atoms type to check whether they intersect in some ways,
2244: * this is used by xmlFACompareAtoms only
2245: *
2246: * Returns 1 if they may intersect and 0 otherwise
2247: */
2248: static int
2249: xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) {
2250: if ((type1 == XML_REGEXP_EPSILON) ||
2251: (type1 == XML_REGEXP_CHARVAL) ||
2252: (type1 == XML_REGEXP_RANGES) ||
2253: (type1 == XML_REGEXP_SUBREG) ||
2254: (type1 == XML_REGEXP_STRING) ||
2255: (type1 == XML_REGEXP_ANYCHAR))
2256: return(1);
2257: if ((type2 == XML_REGEXP_EPSILON) ||
2258: (type2 == XML_REGEXP_CHARVAL) ||
2259: (type2 == XML_REGEXP_RANGES) ||
2260: (type2 == XML_REGEXP_SUBREG) ||
2261: (type2 == XML_REGEXP_STRING) ||
2262: (type2 == XML_REGEXP_ANYCHAR))
2263: return(1);
2264:
2265: if (type1 == type2) return(1);
2266:
2267: /* simplify subsequent compares by making sure type1 < type2 */
2268: if (type1 > type2) {
2269: xmlRegAtomType tmp = type1;
2270: type1 = type2;
2271: type2 = tmp;
2272: }
2273: switch (type1) {
2274: case XML_REGEXP_ANYSPACE: /* \s */
2275: /* can't be a letter, number, mark, pontuation, symbol */
2276: if ((type2 == XML_REGEXP_NOTSPACE) ||
2277: ((type2 >= XML_REGEXP_LETTER) &&
2278: (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2279: ((type2 >= XML_REGEXP_NUMBER) &&
2280: (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2281: ((type2 >= XML_REGEXP_MARK) &&
2282: (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2283: ((type2 >= XML_REGEXP_PUNCT) &&
2284: (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2285: ((type2 >= XML_REGEXP_SYMBOL) &&
2286: (type2 <= XML_REGEXP_SYMBOL_OTHERS))
2287: ) return(0);
2288: break;
2289: case XML_REGEXP_NOTSPACE: /* \S */
2290: break;
2291: case XML_REGEXP_INITNAME: /* \l */
2292: /* can't be a number, mark, separator, pontuation, symbol or other */
2293: if ((type2 == XML_REGEXP_NOTINITNAME) ||
2294: ((type2 >= XML_REGEXP_NUMBER) &&
2295: (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2296: ((type2 >= XML_REGEXP_MARK) &&
2297: (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2298: ((type2 >= XML_REGEXP_SEPAR) &&
2299: (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2300: ((type2 >= XML_REGEXP_PUNCT) &&
2301: (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2302: ((type2 >= XML_REGEXP_SYMBOL) &&
2303: (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2304: ((type2 >= XML_REGEXP_OTHER) &&
2305: (type2 <= XML_REGEXP_OTHER_NA))
2306: ) return(0);
2307: break;
2308: case XML_REGEXP_NOTINITNAME: /* \L */
2309: break;
2310: case XML_REGEXP_NAMECHAR: /* \c */
2311: /* can't be a mark, separator, pontuation, symbol or other */
2312: if ((type2 == XML_REGEXP_NOTNAMECHAR) ||
2313: ((type2 >= XML_REGEXP_MARK) &&
2314: (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2315: ((type2 >= XML_REGEXP_PUNCT) &&
2316: (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2317: ((type2 >= XML_REGEXP_SEPAR) &&
2318: (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2319: ((type2 >= XML_REGEXP_SYMBOL) &&
2320: (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2321: ((type2 >= XML_REGEXP_OTHER) &&
2322: (type2 <= XML_REGEXP_OTHER_NA))
2323: ) return(0);
2324: break;
2325: case XML_REGEXP_NOTNAMECHAR: /* \C */
2326: break;
2327: case XML_REGEXP_DECIMAL: /* \d */
2328: /* can't be a letter, mark, separator, pontuation, symbol or other */
2329: if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2330: (type2 == XML_REGEXP_REALCHAR) ||
2331: ((type2 >= XML_REGEXP_LETTER) &&
2332: (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2333: ((type2 >= XML_REGEXP_MARK) &&
2334: (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2335: ((type2 >= XML_REGEXP_PUNCT) &&
2336: (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2337: ((type2 >= XML_REGEXP_SEPAR) &&
2338: (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2339: ((type2 >= XML_REGEXP_SYMBOL) &&
2340: (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2341: ((type2 >= XML_REGEXP_OTHER) &&
2342: (type2 <= XML_REGEXP_OTHER_NA))
2343: )return(0);
2344: break;
2345: case XML_REGEXP_NOTDECIMAL: /* \D */
2346: break;
2347: case XML_REGEXP_REALCHAR: /* \w */
2348: /* can't be a mark, separator, pontuation, symbol or other */
2349: if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2350: ((type2 >= XML_REGEXP_MARK) &&
2351: (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2352: ((type2 >= XML_REGEXP_PUNCT) &&
2353: (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2354: ((type2 >= XML_REGEXP_SEPAR) &&
2355: (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2356: ((type2 >= XML_REGEXP_SYMBOL) &&
2357: (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2358: ((type2 >= XML_REGEXP_OTHER) &&
2359: (type2 <= XML_REGEXP_OTHER_NA))
2360: )return(0);
2361: break;
2362: case XML_REGEXP_NOTREALCHAR: /* \W */
2363: break;
2364: /*
2365: * at that point we know both type 1 and type2 are from
2366: * character categories are ordered and are different,
2367: * it becomes simple because this is a partition
2368: */
2369: case XML_REGEXP_LETTER:
2370: if (type2 <= XML_REGEXP_LETTER_OTHERS)
2371: return(1);
2372: return(0);
2373: case XML_REGEXP_LETTER_UPPERCASE:
2374: case XML_REGEXP_LETTER_LOWERCASE:
2375: case XML_REGEXP_LETTER_TITLECASE:
2376: case XML_REGEXP_LETTER_MODIFIER:
2377: case XML_REGEXP_LETTER_OTHERS:
2378: return(0);
2379: case XML_REGEXP_MARK:
2380: if (type2 <= XML_REGEXP_MARK_ENCLOSING)
2381: return(1);
2382: return(0);
2383: case XML_REGEXP_MARK_NONSPACING:
2384: case XML_REGEXP_MARK_SPACECOMBINING:
2385: case XML_REGEXP_MARK_ENCLOSING:
2386: return(0);
2387: case XML_REGEXP_NUMBER:
2388: if (type2 <= XML_REGEXP_NUMBER_OTHERS)
2389: return(1);
2390: return(0);
2391: case XML_REGEXP_NUMBER_DECIMAL:
2392: case XML_REGEXP_NUMBER_LETTER:
2393: case XML_REGEXP_NUMBER_OTHERS:
2394: return(0);
2395: case XML_REGEXP_PUNCT:
2396: if (type2 <= XML_REGEXP_PUNCT_OTHERS)
2397: return(1);
2398: return(0);
2399: case XML_REGEXP_PUNCT_CONNECTOR:
2400: case XML_REGEXP_PUNCT_DASH:
2401: case XML_REGEXP_PUNCT_OPEN:
2402: case XML_REGEXP_PUNCT_CLOSE:
2403: case XML_REGEXP_PUNCT_INITQUOTE:
2404: case XML_REGEXP_PUNCT_FINQUOTE:
2405: case XML_REGEXP_PUNCT_OTHERS:
2406: return(0);
2407: case XML_REGEXP_SEPAR:
2408: if (type2 <= XML_REGEXP_SEPAR_PARA)
2409: return(1);
2410: return(0);
2411: case XML_REGEXP_SEPAR_SPACE:
2412: case XML_REGEXP_SEPAR_LINE:
2413: case XML_REGEXP_SEPAR_PARA:
2414: return(0);
2415: case XML_REGEXP_SYMBOL:
2416: if (type2 <= XML_REGEXP_SYMBOL_OTHERS)
2417: return(1);
2418: return(0);
2419: case XML_REGEXP_SYMBOL_MATH:
2420: case XML_REGEXP_SYMBOL_CURRENCY:
2421: case XML_REGEXP_SYMBOL_MODIFIER:
2422: case XML_REGEXP_SYMBOL_OTHERS:
2423: return(0);
2424: case XML_REGEXP_OTHER:
2425: if (type2 <= XML_REGEXP_OTHER_NA)
2426: return(1);
2427: return(0);
2428: case XML_REGEXP_OTHER_CONTROL:
2429: case XML_REGEXP_OTHER_FORMAT:
2430: case XML_REGEXP_OTHER_PRIVATE:
2431: case XML_REGEXP_OTHER_NA:
2432: return(0);
2433: default:
2434: break;
2435: }
2436: return(1);
2437: }
2438:
2439: /**
2440: * xmlFAEqualAtoms:
2441: * @atom1: an atom
2442: * @atom2: an atom
2443: * @deep: if not set only compare string pointers
2444: *
2445: * Compares two atoms to check whether they are the same exactly
2446: * this is used to remove equivalent transitions
2447: *
2448: * Returns 1 if same and 0 otherwise
2449: */
2450: static int
2451: xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2452: int ret = 0;
2453:
2454: if (atom1 == atom2)
2455: return(1);
2456: if ((atom1 == NULL) || (atom2 == NULL))
2457: return(0);
2458:
2459: if (atom1->type != atom2->type)
2460: return(0);
2461: switch (atom1->type) {
2462: case XML_REGEXP_EPSILON:
2463: ret = 0;
2464: break;
2465: case XML_REGEXP_STRING:
2466: if (!deep)
2467: ret = (atom1->valuep == atom2->valuep);
2468: else
2469: ret = xmlStrEqual((xmlChar *)atom1->valuep,
2470: (xmlChar *)atom2->valuep);
2471: break;
2472: case XML_REGEXP_CHARVAL:
2473: ret = (atom1->codepoint == atom2->codepoint);
2474: break;
2475: case XML_REGEXP_RANGES:
2476: /* too hard to do in the general case */
2477: ret = 0;
2478: default:
2479: break;
2480: }
2481: return(ret);
2482: }
2483:
2484: /**
2485: * xmlFACompareAtoms:
2486: * @atom1: an atom
2487: * @atom2: an atom
2488: * @deep: if not set only compare string pointers
2489: *
2490: * Compares two atoms to check whether they intersect in some ways,
2491: * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only
2492: *
2493: * Returns 1 if yes and 0 otherwise
2494: */
2495: static int
2496: xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2497: int ret = 1;
2498:
2499: if (atom1 == atom2)
2500: return(1);
2501: if ((atom1 == NULL) || (atom2 == NULL))
2502: return(0);
2503:
2504: if ((atom1->type == XML_REGEXP_ANYCHAR) ||
2505: (atom2->type == XML_REGEXP_ANYCHAR))
2506: return(1);
2507:
2508: if (atom1->type > atom2->type) {
2509: xmlRegAtomPtr tmp;
2510: tmp = atom1;
2511: atom1 = atom2;
2512: atom2 = tmp;
2513: }
2514: if (atom1->type != atom2->type) {
2515: ret = xmlFACompareAtomTypes(atom1->type, atom2->type);
2516: /* if they can't intersect at the type level break now */
2517: if (ret == 0)
2518: return(0);
2519: }
2520: switch (atom1->type) {
2521: case XML_REGEXP_STRING:
2522: if (!deep)
2523: ret = (atom1->valuep != atom2->valuep);
2524: else
2525: ret = xmlRegStrEqualWildcard((xmlChar *)atom1->valuep,
2526: (xmlChar *)atom2->valuep);
2527: break;
2528: case XML_REGEXP_EPSILON:
2529: goto not_determinist;
2530: case XML_REGEXP_CHARVAL:
2531: if (atom2->type == XML_REGEXP_CHARVAL) {
2532: ret = (atom1->codepoint == atom2->codepoint);
2533: } else {
2534: ret = xmlRegCheckCharacter(atom2, atom1->codepoint);
2535: if (ret < 0)
2536: ret = 1;
2537: }
2538: break;
2539: case XML_REGEXP_RANGES:
2540: if (atom2->type == XML_REGEXP_RANGES) {
2541: int i, j, res;
2542: xmlRegRangePtr r1, r2;
2543:
2544: /*
2545: * need to check that none of the ranges eventually matches
2546: */
2547: for (i = 0;i < atom1->nbRanges;i++) {
2548: for (j = 0;j < atom2->nbRanges;j++) {
2549: r1 = atom1->ranges[i];
2550: r2 = atom2->ranges[j];
2551: res = xmlFACompareRanges(r1, r2);
2552: if (res == 1) {
2553: ret = 1;
2554: goto done;
2555: }
2556: }
2557: }
2558: ret = 0;
2559: }
2560: break;
2561: default:
2562: goto not_determinist;
2563: }
2564: done:
2565: if (atom1->neg != atom2->neg) {
2566: ret = !ret;
2567: }
2568: if (ret == 0)
2569: return(0);
2570: not_determinist:
2571: return(1);
2572: }
2573:
2574: /**
2575: * xmlFARecurseDeterminism:
2576: * @ctxt: a regexp parser context
2577: *
2578: * Check whether the associated regexp is determinist,
2579: * should be called after xmlFAEliminateEpsilonTransitions()
2580: *
2581: */
2582: static int
2583: xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
2584: int to, xmlRegAtomPtr atom) {
2585: int ret = 1;
2586: int res;
2587: int transnr, nbTrans;
2588: xmlRegTransPtr t1;
2589: int deep = 1;
2590:
2591: if (state == NULL)
2592: return(ret);
2593: if (state->markd == XML_REGEXP_MARK_VISITED)
2594: return(ret);
2595:
2596: if (ctxt->flags & AM_AUTOMATA_RNG)
2597: deep = 0;
2598:
2599: /*
2600: * don't recurse on transitions potentially added in the course of
2601: * the elimination.
2602: */
2603: nbTrans = state->nbTrans;
2604: for (transnr = 0;transnr < nbTrans;transnr++) {
2605: t1 = &(state->trans[transnr]);
2606: /*
2607: * check transitions conflicting with the one looked at
2608: */
2609: if (t1->atom == NULL) {
2610: if (t1->to < 0)
2611: continue;
2612: state->markd = XML_REGEXP_MARK_VISITED;
2613: res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2614: to, atom);
2615: state->markd = 0;
2616: if (res == 0) {
2617: ret = 0;
2618: /* t1->nd = 1; */
2619: }
2620: continue;
2621: }
2622: if (t1->to != to)
2623: continue;
2624: if (xmlFACompareAtoms(t1->atom, atom, deep)) {
2625: ret = 0;
2626: /* mark the transition as non-deterministic */
2627: t1->nd = 1;
2628: }
2629: }
2630: return(ret);
2631: }
2632:
2633: /**
2634: * xmlFAComputesDeterminism:
2635: * @ctxt: a regexp parser context
2636: *
2637: * Check whether the associated regexp is determinist,
2638: * should be called after xmlFAEliminateEpsilonTransitions()
2639: *
2640: */
2641: static int
2642: xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
2643: int statenr, transnr;
2644: xmlRegStatePtr state;
2645: xmlRegTransPtr t1, t2, last;
2646: int i;
2647: int ret = 1;
2648: int deep = 1;
2649:
2650: #ifdef DEBUG_REGEXP_GRAPH
2651: printf("xmlFAComputesDeterminism\n");
2652: xmlRegPrintCtxt(stdout, ctxt);
2653: #endif
2654: if (ctxt->determinist != -1)
2655: return(ctxt->determinist);
2656:
2657: if (ctxt->flags & AM_AUTOMATA_RNG)
2658: deep = 0;
2659:
2660: /*
2661: * First cleanup the automata removing cancelled transitions
2662: */
2663: for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2664: state = ctxt->states[statenr];
2665: if (state == NULL)
2666: continue;
2667: if (state->nbTrans < 2)
2668: continue;
2669: for (transnr = 0;transnr < state->nbTrans;transnr++) {
2670: t1 = &(state->trans[transnr]);
2671: /*
2672: * Determinism checks in case of counted or all transitions
2673: * will have to be handled separately
2674: */
2675: if (t1->atom == NULL) {
2676: /* t1->nd = 1; */
2677: continue;
2678: }
2679: if (t1->to == -1) /* eliminated */
2680: continue;
2681: for (i = 0;i < transnr;i++) {
2682: t2 = &(state->trans[i]);
2683: if (t2->to == -1) /* eliminated */
2684: continue;
2685: if (t2->atom != NULL) {
2686: if (t1->to == t2->to) {
2687: /*
2688: * Here we use deep because we want to keep the
2689: * transitions which indicate a conflict
2690: */
2691: if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) &&
2692: (t1->counter == t2->counter) &&
2693: (t1->count == t2->count))
2694: t2->to = -1; /* eliminated */
2695: }
2696: }
2697: }
2698: }
2699: }
2700:
2701: /*
2702: * Check for all states that there aren't 2 transitions
2703: * with the same atom and a different target.
2704: */
2705: for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2706: state = ctxt->states[statenr];
2707: if (state == NULL)
2708: continue;
2709: if (state->nbTrans < 2)
2710: continue;
2711: last = NULL;
2712: for (transnr = 0;transnr < state->nbTrans;transnr++) {
2713: t1 = &(state->trans[transnr]);
2714: /*
2715: * Determinism checks in case of counted or all transitions
2716: * will have to be handled separately
2717: */
2718: if (t1->atom == NULL) {
2719: continue;
2720: }
2721: if (t1->to == -1) /* eliminated */
2722: continue;
2723: for (i = 0;i < transnr;i++) {
2724: t2 = &(state->trans[i]);
2725: if (t2->to == -1) /* eliminated */
2726: continue;
2727: if (t2->atom != NULL) {
2728: /*
2729: * But here we don't use deep because we want to
2730: * find transitions which indicate a conflict
2731: */
2732: if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) {
2733: ret = 0;
2734: /* mark the transitions as non-deterministic ones */
2735: t1->nd = 1;
2736: t2->nd = 1;
2737: last = t1;
2738: }
2739: } else if (t1->to != -1) {
2740: /*
2741: * do the closure in case of remaining specific
2742: * epsilon transitions like choices or all
2743: */
2744: ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2745: t2->to, t2->atom);
2746: /* don't shortcut the computation so all non deterministic
2747: transition get marked down
2748: if (ret == 0)
2749: return(0);
2750: */
2751: if (ret == 0) {
2752: t1->nd = 1;
2753: /* t2->nd = 1; */
2754: last = t1;
2755: }
2756: }
2757: }
2758: /* don't shortcut the computation so all non deterministic
2759: transition get marked down
2760: if (ret == 0)
2761: break; */
2762: }
2763:
2764: /*
2765: * mark specifically the last non-deterministic transition
2766: * from a state since there is no need to set-up rollback
2767: * from it
2768: */
2769: if (last != NULL) {
2770: last->nd = 2;
2771: }
2772:
2773: /* don't shortcut the computation so all non deterministic
2774: transition get marked down
2775: if (ret == 0)
2776: break; */
2777: }
2778:
2779: ctxt->determinist = ret;
2780: return(ret);
2781: }
2782:
2783: /************************************************************************
2784: * *
2785: * Routines to check input against transition atoms *
2786: * *
2787: ************************************************************************/
2788:
2789: static int
2790: xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
2791: int start, int end, const xmlChar *blockName) {
2792: int ret = 0;
2793:
2794: switch (type) {
2795: case XML_REGEXP_STRING:
2796: case XML_REGEXP_SUBREG:
2797: case XML_REGEXP_RANGES:
2798: case XML_REGEXP_EPSILON:
2799: return(-1);
2800: case XML_REGEXP_ANYCHAR:
2801: ret = ((codepoint != '\n') && (codepoint != '\r'));
2802: break;
2803: case XML_REGEXP_CHARVAL:
2804: ret = ((codepoint >= start) && (codepoint <= end));
2805: break;
2806: case XML_REGEXP_NOTSPACE:
2807: neg = !neg;
2808: case XML_REGEXP_ANYSPACE:
2809: ret = ((codepoint == '\n') || (codepoint == '\r') ||
2810: (codepoint == '\t') || (codepoint == ' '));
2811: break;
2812: case XML_REGEXP_NOTINITNAME:
2813: neg = !neg;
2814: case XML_REGEXP_INITNAME:
2815: ret = (IS_LETTER(codepoint) ||
2816: (codepoint == '_') || (codepoint == ':'));
2817: break;
2818: case XML_REGEXP_NOTNAMECHAR:
2819: neg = !neg;
2820: case XML_REGEXP_NAMECHAR:
2821: ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
2822: (codepoint == '.') || (codepoint == '-') ||
2823: (codepoint == '_') || (codepoint == ':') ||
2824: IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
2825: break;
2826: case XML_REGEXP_NOTDECIMAL:
2827: neg = !neg;
2828: case XML_REGEXP_DECIMAL:
2829: ret = xmlUCSIsCatNd(codepoint);
2830: break;
2831: case XML_REGEXP_REALCHAR:
2832: neg = !neg;
2833: case XML_REGEXP_NOTREALCHAR:
2834: ret = xmlUCSIsCatP(codepoint);
2835: if (ret == 0)
2836: ret = xmlUCSIsCatZ(codepoint);
2837: if (ret == 0)
2838: ret = xmlUCSIsCatC(codepoint);
2839: break;
2840: case XML_REGEXP_LETTER:
2841: ret = xmlUCSIsCatL(codepoint);
2842: break;
2843: case XML_REGEXP_LETTER_UPPERCASE:
2844: ret = xmlUCSIsCatLu(codepoint);
2845: break;
2846: case XML_REGEXP_LETTER_LOWERCASE:
2847: ret = xmlUCSIsCatLl(codepoint);
2848: break;
2849: case XML_REGEXP_LETTER_TITLECASE:
2850: ret = xmlUCSIsCatLt(codepoint);
2851: break;
2852: case XML_REGEXP_LETTER_MODIFIER:
2853: ret = xmlUCSIsCatLm(codepoint);
2854: break;
2855: case XML_REGEXP_LETTER_OTHERS:
2856: ret = xmlUCSIsCatLo(codepoint);
2857: break;
2858: case XML_REGEXP_MARK:
2859: ret = xmlUCSIsCatM(codepoint);
2860: break;
2861: case XML_REGEXP_MARK_NONSPACING:
2862: ret = xmlUCSIsCatMn(codepoint);
2863: break;
2864: case XML_REGEXP_MARK_SPACECOMBINING:
2865: ret = xmlUCSIsCatMc(codepoint);
2866: break;
2867: case XML_REGEXP_MARK_ENCLOSING:
2868: ret = xmlUCSIsCatMe(codepoint);
2869: break;
2870: case XML_REGEXP_NUMBER:
2871: ret = xmlUCSIsCatN(codepoint);
2872: break;
2873: case XML_REGEXP_NUMBER_DECIMAL:
2874: ret = xmlUCSIsCatNd(codepoint);
2875: break;
2876: case XML_REGEXP_NUMBER_LETTER:
2877: ret = xmlUCSIsCatNl(codepoint);
2878: break;
2879: case XML_REGEXP_NUMBER_OTHERS:
2880: ret = xmlUCSIsCatNo(codepoint);
2881: break;
2882: case XML_REGEXP_PUNCT:
2883: ret = xmlUCSIsCatP(codepoint);
2884: break;
2885: case XML_REGEXP_PUNCT_CONNECTOR:
2886: ret = xmlUCSIsCatPc(codepoint);
2887: break;
2888: case XML_REGEXP_PUNCT_DASH:
2889: ret = xmlUCSIsCatPd(codepoint);
2890: break;
2891: case XML_REGEXP_PUNCT_OPEN:
2892: ret = xmlUCSIsCatPs(codepoint);
2893: break;
2894: case XML_REGEXP_PUNCT_CLOSE:
2895: ret = xmlUCSIsCatPe(codepoint);
2896: break;
2897: case XML_REGEXP_PUNCT_INITQUOTE:
2898: ret = xmlUCSIsCatPi(codepoint);
2899: break;
2900: case XML_REGEXP_PUNCT_FINQUOTE:
2901: ret = xmlUCSIsCatPf(codepoint);
2902: break;
2903: case XML_REGEXP_PUNCT_OTHERS:
2904: ret = xmlUCSIsCatPo(codepoint);
2905: break;
2906: case XML_REGEXP_SEPAR:
2907: ret = xmlUCSIsCatZ(codepoint);
2908: break;
2909: case XML_REGEXP_SEPAR_SPACE:
2910: ret = xmlUCSIsCatZs(codepoint);
2911: break;
2912: case XML_REGEXP_SEPAR_LINE:
2913: ret = xmlUCSIsCatZl(codepoint);
2914: break;
2915: case XML_REGEXP_SEPAR_PARA:
2916: ret = xmlUCSIsCatZp(codepoint);
2917: break;
2918: case XML_REGEXP_SYMBOL:
2919: ret = xmlUCSIsCatS(codepoint);
2920: break;
2921: case XML_REGEXP_SYMBOL_MATH:
2922: ret = xmlUCSIsCatSm(codepoint);
2923: break;
2924: case XML_REGEXP_SYMBOL_CURRENCY:
2925: ret = xmlUCSIsCatSc(codepoint);
2926: break;
2927: case XML_REGEXP_SYMBOL_MODIFIER:
2928: ret = xmlUCSIsCatSk(codepoint);
2929: break;
2930: case XML_REGEXP_SYMBOL_OTHERS:
2931: ret = xmlUCSIsCatSo(codepoint);
2932: break;
2933: case XML_REGEXP_OTHER:
2934: ret = xmlUCSIsCatC(codepoint);
2935: break;
2936: case XML_REGEXP_OTHER_CONTROL:
2937: ret = xmlUCSIsCatCc(codepoint);
2938: break;
2939: case XML_REGEXP_OTHER_FORMAT:
2940: ret = xmlUCSIsCatCf(codepoint);
2941: break;
2942: case XML_REGEXP_OTHER_PRIVATE:
2943: ret = xmlUCSIsCatCo(codepoint);
2944: break;
2945: case XML_REGEXP_OTHER_NA:
2946: /* ret = xmlUCSIsCatCn(codepoint); */
2947: /* Seems it doesn't exist anymore in recent Unicode releases */
2948: ret = 0;
2949: break;
2950: case XML_REGEXP_BLOCK_NAME:
2951: ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
2952: break;
2953: }
2954: if (neg)
2955: return(!ret);
2956: return(ret);
2957: }
2958:
2959: static int
2960: xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
2961: int i, ret = 0;
2962: xmlRegRangePtr range;
2963:
2964: if ((atom == NULL) || (!IS_CHAR(codepoint)))
2965: return(-1);
2966:
2967: switch (atom->type) {
2968: case XML_REGEXP_SUBREG:
2969: case XML_REGEXP_EPSILON:
2970: return(-1);
2971: case XML_REGEXP_CHARVAL:
2972: return(codepoint == atom->codepoint);
2973: case XML_REGEXP_RANGES: {
2974: int accept = 0;
2975:
2976: for (i = 0;i < atom->nbRanges;i++) {
2977: range = atom->ranges[i];
2978: if (range->neg == 2) {
2979: ret = xmlRegCheckCharacterRange(range->type, codepoint,
2980: 0, range->start, range->end,
2981: range->blockName);
2982: if (ret != 0)
2983: return(0); /* excluded char */
2984: } else if (range->neg) {
2985: ret = xmlRegCheckCharacterRange(range->type, codepoint,
2986: 0, range->start, range->end,
2987: range->blockName);
2988: if (ret == 0)
2989: accept = 1;
2990: else
2991: return(0);
2992: } else {
2993: ret = xmlRegCheckCharacterRange(range->type, codepoint,
2994: 0, range->start, range->end,
2995: range->blockName);
2996: if (ret != 0)
2997: accept = 1; /* might still be excluded */
2998: }
2999: }
3000: return(accept);
3001: }
3002: case XML_REGEXP_STRING:
3003: printf("TODO: XML_REGEXP_STRING\n");
3004: return(-1);
3005: case XML_REGEXP_ANYCHAR:
3006: case XML_REGEXP_ANYSPACE:
3007: case XML_REGEXP_NOTSPACE:
3008: case XML_REGEXP_INITNAME:
3009: case XML_REGEXP_NOTINITNAME:
3010: case XML_REGEXP_NAMECHAR:
3011: case XML_REGEXP_NOTNAMECHAR:
3012: case XML_REGEXP_DECIMAL:
3013: case XML_REGEXP_NOTDECIMAL:
3014: case XML_REGEXP_REALCHAR:
3015: case XML_REGEXP_NOTREALCHAR:
3016: case XML_REGEXP_LETTER:
3017: case XML_REGEXP_LETTER_UPPERCASE:
3018: case XML_REGEXP_LETTER_LOWERCASE:
3019: case XML_REGEXP_LETTER_TITLECASE:
3020: case XML_REGEXP_LETTER_MODIFIER:
3021: case XML_REGEXP_LETTER_OTHERS:
3022: case XML_REGEXP_MARK:
3023: case XML_REGEXP_MARK_NONSPACING:
3024: case XML_REGEXP_MARK_SPACECOMBINING:
3025: case XML_REGEXP_MARK_ENCLOSING:
3026: case XML_REGEXP_NUMBER:
3027: case XML_REGEXP_NUMBER_DECIMAL:
3028: case XML_REGEXP_NUMBER_LETTER:
3029: case XML_REGEXP_NUMBER_OTHERS:
3030: case XML_REGEXP_PUNCT:
3031: case XML_REGEXP_PUNCT_CONNECTOR:
3032: case XML_REGEXP_PUNCT_DASH:
3033: case XML_REGEXP_PUNCT_OPEN:
3034: case XML_REGEXP_PUNCT_CLOSE:
3035: case XML_REGEXP_PUNCT_INITQUOTE:
3036: case XML_REGEXP_PUNCT_FINQUOTE:
3037: case XML_REGEXP_PUNCT_OTHERS:
3038: case XML_REGEXP_SEPAR:
3039: case XML_REGEXP_SEPAR_SPACE:
3040: case XML_REGEXP_SEPAR_LINE:
3041: case XML_REGEXP_SEPAR_PARA:
3042: case XML_REGEXP_SYMBOL:
3043: case XML_REGEXP_SYMBOL_MATH:
3044: case XML_REGEXP_SYMBOL_CURRENCY:
3045: case XML_REGEXP_SYMBOL_MODIFIER:
3046: case XML_REGEXP_SYMBOL_OTHERS:
3047: case XML_REGEXP_OTHER:
3048: case XML_REGEXP_OTHER_CONTROL:
3049: case XML_REGEXP_OTHER_FORMAT:
3050: case XML_REGEXP_OTHER_PRIVATE:
3051: case XML_REGEXP_OTHER_NA:
3052: case XML_REGEXP_BLOCK_NAME:
3053: ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
3054: (const xmlChar *)atom->valuep);
3055: if (atom->neg)
3056: ret = !ret;
3057: break;
3058: }
3059: return(ret);
3060: }
3061:
3062: /************************************************************************
3063: * *
3064: * Saving and restoring state of an execution context *
3065: * *
3066: ************************************************************************/
3067:
3068: #ifdef DEBUG_REGEXP_EXEC
3069: static void
3070: xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
3071: printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
3072: if (exec->inputStack != NULL) {
3073: int i;
3074: printf(": ");
3075: for (i = 0;(i < 3) && (i < exec->inputStackNr);i++)
3076: printf("%s ", (const char *)
3077: exec->inputStack[exec->inputStackNr - (i + 1)].value);
3078: } else {
3079: printf(": %s", &(exec->inputString[exec->index]));
3080: }
3081: printf("\n");
3082: }
3083: #endif
3084:
3085: static void
3086: xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
3087: #ifdef DEBUG_REGEXP_EXEC
3088: printf("saving ");
3089: exec->transno++;
3090: xmlFARegDebugExec(exec);
3091: exec->transno--;
3092: #endif
3093: #ifdef MAX_PUSH
3094: if (exec->nbPush > MAX_PUSH) {
3095: return;
3096: }
3097: exec->nbPush++;
3098: #endif
3099:
3100: if (exec->maxRollbacks == 0) {
3101: exec->maxRollbacks = 4;
3102: exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
3103: sizeof(xmlRegExecRollback));
3104: if (exec->rollbacks == NULL) {
3105: xmlRegexpErrMemory(NULL, "saving regexp");
3106: exec->maxRollbacks = 0;
3107: return;
3108: }
3109: memset(exec->rollbacks, 0,
3110: exec->maxRollbacks * sizeof(xmlRegExecRollback));
3111: } else if (exec->nbRollbacks >= exec->maxRollbacks) {
3112: xmlRegExecRollback *tmp;
3113: int len = exec->maxRollbacks;
3114:
3115: exec->maxRollbacks *= 2;
3116: tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
3117: exec->maxRollbacks * sizeof(xmlRegExecRollback));
3118: if (tmp == NULL) {
3119: xmlRegexpErrMemory(NULL, "saving regexp");
3120: exec->maxRollbacks /= 2;
3121: return;
3122: }
3123: exec->rollbacks = tmp;
3124: tmp = &exec->rollbacks[len];
3125: memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
3126: }
3127: exec->rollbacks[exec->nbRollbacks].state = exec->state;
3128: exec->rollbacks[exec->nbRollbacks].index = exec->index;
3129: exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
3130: if (exec->comp->nbCounters > 0) {
3131: if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3132: exec->rollbacks[exec->nbRollbacks].counts = (int *)
3133: xmlMalloc(exec->comp->nbCounters * sizeof(int));
3134: if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3135: xmlRegexpErrMemory(NULL, "saving regexp");
3136: exec->status = -5;
3137: return;
3138: }
3139: }
3140: memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
3141: exec->comp->nbCounters * sizeof(int));
3142: }
3143: exec->nbRollbacks++;
3144: }
3145:
3146: static void
3147: xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
3148: if (exec->nbRollbacks <= 0) {
3149: exec->status = -1;
3150: #ifdef DEBUG_REGEXP_EXEC
3151: printf("rollback failed on empty stack\n");
3152: #endif
3153: return;
3154: }
3155: exec->nbRollbacks--;
3156: exec->state = exec->rollbacks[exec->nbRollbacks].state;
3157: exec->index = exec->rollbacks[exec->nbRollbacks].index;
3158: exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
3159: if (exec->comp->nbCounters > 0) {
3160: if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3161: fprintf(stderr, "exec save: allocation failed");
3162: exec->status = -6;
3163: return;
3164: }
3165: memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
3166: exec->comp->nbCounters * sizeof(int));
3167: }
3168:
3169: #ifdef DEBUG_REGEXP_EXEC
3170: printf("restored ");
3171: xmlFARegDebugExec(exec);
3172: #endif
3173: }
3174:
3175: /************************************************************************
3176: * *
3177: * Verifier, running an input against a compiled regexp *
3178: * *
3179: ************************************************************************/
3180:
3181: static int
3182: xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
3183: xmlRegExecCtxt execval;
3184: xmlRegExecCtxtPtr exec = &execval;
3185: int ret, codepoint = 0, len, deter;
3186:
3187: exec->inputString = content;
3188: exec->index = 0;
3189: exec->nbPush = 0;
3190: exec->determinist = 1;
3191: exec->maxRollbacks = 0;
3192: exec->nbRollbacks = 0;
3193: exec->rollbacks = NULL;
3194: exec->status = 0;
3195: exec->comp = comp;
3196: exec->state = comp->states[0];
3197: exec->transno = 0;
3198: exec->transcount = 0;
3199: exec->inputStack = NULL;
3200: exec->inputStackMax = 0;
3201: if (comp->nbCounters > 0) {
3202: exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
3203: if (exec->counts == NULL) {
3204: xmlRegexpErrMemory(NULL, "running regexp");
3205: return(-1);
3206: }
3207: memset(exec->counts, 0, comp->nbCounters * sizeof(int));
3208: } else
3209: exec->counts = NULL;
3210: while ((exec->status == 0) && (exec->state != NULL) &&
3211: ((exec->inputString[exec->index] != 0) ||
3212: ((exec->state != NULL) &&
3213: (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3214: xmlRegTransPtr trans;
3215: xmlRegAtomPtr atom;
3216:
3217: /*
3218: * If end of input on non-terminal state, rollback, however we may
3219: * still have epsilon like transition for counted transitions
3220: * on counters, in that case don't break too early. Additionally,
3221: * if we are working on a range like "AB{0,2}", where B is not present,
3222: * we don't want to break.
3223: */
3224: len = 1;
3225: if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
3226: /*
3227: * if there is a transition, we must check if
3228: * atom allows minOccurs of 0
3229: */
3230: if (exec->transno < exec->state->nbTrans) {
3231: trans = &exec->state->trans[exec->transno];
3232: if (trans->to >=0) {
3233: atom = trans->atom;
3234: if (!((atom->min == 0) && (atom->max > 0)))
3235: goto rollback;
3236: }
3237: } else
3238: goto rollback;
3239: }
3240:
3241: exec->transcount = 0;
3242: for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3243: trans = &exec->state->trans[exec->transno];
3244: if (trans->to < 0)
3245: continue;
3246: atom = trans->atom;
3247: ret = 0;
3248: deter = 1;
3249: if (trans->count >= 0) {
3250: int count;
3251: xmlRegCounterPtr counter;
3252:
3253: if (exec->counts == NULL) {
3254: exec->status = -1;
3255: goto error;
3256: }
3257: /*
3258: * A counted transition.
3259: */
3260:
3261: count = exec->counts[trans->count];
3262: counter = &exec->comp->counters[trans->count];
3263: #ifdef DEBUG_REGEXP_EXEC
3264: printf("testing count %d: val %d, min %d, max %d\n",
3265: trans->count, count, counter->min, counter->max);
3266: #endif
3267: ret = ((count >= counter->min) && (count <= counter->max));
3268: if ((ret) && (counter->min != counter->max))
3269: deter = 0;
3270: } else if (atom == NULL) {
3271: fprintf(stderr, "epsilon transition left at runtime\n");
3272: exec->status = -2;
3273: break;
3274: } else if (exec->inputString[exec->index] != 0) {
3275: codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
3276: ret = xmlRegCheckCharacter(atom, codepoint);
3277: if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
3278: xmlRegStatePtr to = comp->states[trans->to];
3279:
3280: /*
3281: * this is a multiple input sequence
3282: * If there is a counter associated increment it now.
3283: * before potentially saving and rollback
3284: * do not increment if the counter is already over the
3285: * maximum limit in which case get to next transition
3286: */
3287: if (trans->counter >= 0) {
3288: xmlRegCounterPtr counter;
3289:
3290: if ((exec->counts == NULL) ||
3291: (exec->comp == NULL) ||
3292: (exec->comp->counters == NULL)) {
3293: exec->status = -1;
3294: goto error;
3295: }
3296: counter = &exec->comp->counters[trans->counter];
3297: if (exec->counts[trans->counter] >= counter->max)
3298: continue; /* for loop on transitions */
3299:
3300: #ifdef DEBUG_REGEXP_EXEC
3301: printf("Increasing count %d\n", trans->counter);
3302: #endif
3303: exec->counts[trans->counter]++;
3304: }
3305: if (exec->state->nbTrans > exec->transno + 1) {
3306: xmlFARegExecSave(exec);
3307: }
3308: exec->transcount = 1;
3309: do {
3310: /*
3311: * Try to progress as much as possible on the input
3312: */
3313: if (exec->transcount == atom->max) {
3314: break;
3315: }
3316: exec->index += len;
3317: /*
3318: * End of input: stop here
3319: */
3320: if (exec->inputString[exec->index] == 0) {
3321: exec->index -= len;
3322: break;
3323: }
3324: if (exec->transcount >= atom->min) {
3325: int transno = exec->transno;
3326: xmlRegStatePtr state = exec->state;
3327:
3328: /*
3329: * The transition is acceptable save it
3330: */
3331: exec->transno = -1; /* trick */
3332: exec->state = to;
3333: xmlFARegExecSave(exec);
3334: exec->transno = transno;
3335: exec->state = state;
3336: }
3337: codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
3338: len);
3339: ret = xmlRegCheckCharacter(atom, codepoint);
3340: exec->transcount++;
3341: } while (ret == 1);
3342: if (exec->transcount < atom->min)
3343: ret = 0;
3344:
3345: /*
3346: * If the last check failed but one transition was found
3347: * possible, rollback
3348: */
3349: if (ret < 0)
3350: ret = 0;
3351: if (ret == 0) {
3352: goto rollback;
3353: }
3354: if (trans->counter >= 0) {
3355: if (exec->counts == NULL) {
3356: exec->status = -1;
3357: goto error;
3358: }
3359: #ifdef DEBUG_REGEXP_EXEC
3360: printf("Decreasing count %d\n", trans->counter);
3361: #endif
3362: exec->counts[trans->counter]--;
3363: }
3364: } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
3365: /*
3366: * we don't match on the codepoint, but minOccurs of 0
3367: * says that's ok. Setting len to 0 inhibits stepping
3368: * over the codepoint.
3369: */
3370: exec->transcount = 1;
3371: len = 0;
3372: ret = 1;
3373: }
3374: } else if ((atom->min == 0) && (atom->max > 0)) {
3375: /* another spot to match when minOccurs is 0 */
3376: exec->transcount = 1;
3377: len = 0;
3378: ret = 1;
3379: }
3380: if (ret == 1) {
3381: if ((trans->nd == 1) ||
3382: ((trans->count >= 0) && (deter == 0) &&
3383: (exec->state->nbTrans > exec->transno + 1))) {
3384: #ifdef DEBUG_REGEXP_EXEC
3385: if (trans->nd == 1)
3386: printf("Saving on nd transition atom %d for %c at %d\n",
3387: trans->atom->no, codepoint, exec->index);
3388: else
3389: printf("Saving on counted transition count %d for %c at %d\n",
3390: trans->count, codepoint, exec->index);
3391: #endif
3392: xmlFARegExecSave(exec);
3393: }
3394: if (trans->counter >= 0) {
3395: xmlRegCounterPtr counter;
3396:
3397: /* make sure we don't go over the counter maximum value */
3398: if ((exec->counts == NULL) ||
3399: (exec->comp == NULL) ||
3400: (exec->comp->counters == NULL)) {
3401: exec->status = -1;
3402: goto error;
3403: }
3404: counter = &exec->comp->counters[trans->counter];
3405: if (exec->counts[trans->counter] >= counter->max)
3406: continue; /* for loop on transitions */
3407: #ifdef DEBUG_REGEXP_EXEC
3408: printf("Increasing count %d\n", trans->counter);
3409: #endif
3410: exec->counts[trans->counter]++;
3411: }
3412: if ((trans->count >= 0) &&
3413: (trans->count < REGEXP_ALL_COUNTER)) {
3414: if (exec->counts == NULL) {
3415: exec->status = -1;
3416: goto error;
3417: }
3418: #ifdef DEBUG_REGEXP_EXEC
3419: printf("resetting count %d on transition\n",
3420: trans->count);
3421: #endif
3422: exec->counts[trans->count] = 0;
3423: }
3424: #ifdef DEBUG_REGEXP_EXEC
3425: printf("entering state %d\n", trans->to);
3426: #endif
3427: exec->state = comp->states[trans->to];
3428: exec->transno = 0;
3429: if (trans->atom != NULL) {
3430: exec->index += len;
3431: }
3432: goto progress;
3433: } else if (ret < 0) {
3434: exec->status = -4;
3435: break;
3436: }
3437: }
3438: if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3439: rollback:
3440: /*
3441: * Failed to find a way out
3442: */
3443: exec->determinist = 0;
3444: #ifdef DEBUG_REGEXP_EXEC
3445: printf("rollback from state %d on %d:%c\n", exec->state->no,
3446: codepoint,codepoint);
3447: #endif
3448: xmlFARegExecRollBack(exec);
3449: }
3450: progress:
3451: continue;
3452: }
3453: error:
3454: if (exec->rollbacks != NULL) {
3455: if (exec->counts != NULL) {
3456: int i;
3457:
3458: for (i = 0;i < exec->maxRollbacks;i++)
3459: if (exec->rollbacks[i].counts != NULL)
3460: xmlFree(exec->rollbacks[i].counts);
3461: }
3462: xmlFree(exec->rollbacks);
3463: }
3464: if (exec->state == NULL)
3465: return(-1);
3466: if (exec->counts != NULL)
3467: xmlFree(exec->counts);
3468: if (exec->status == 0)
3469: return(1);
3470: if (exec->status == -1) {
3471: if (exec->nbPush > MAX_PUSH)
3472: return(-1);
3473: return(0);
3474: }
3475: return(exec->status);
3476: }
3477:
3478: /************************************************************************
3479: * *
3480: * Progressive interface to the verifier one atom at a time *
3481: * *
3482: ************************************************************************/
3483: #ifdef DEBUG_ERR
3484: static void testerr(xmlRegExecCtxtPtr exec);
3485: #endif
3486:
3487: /**
3488: * xmlRegNewExecCtxt:
3489: * @comp: a precompiled regular expression
3490: * @callback: a callback function used for handling progresses in the
3491: * automata matching phase
3492: * @data: the context data associated to the callback in this context
3493: *
3494: * Build a context used for progressive evaluation of a regexp.
3495: *
3496: * Returns the new context
3497: */
3498: xmlRegExecCtxtPtr
3499: xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
3500: xmlRegExecCtxtPtr exec;
3501:
3502: if (comp == NULL)
3503: return(NULL);
3504: if ((comp->compact == NULL) && (comp->states == NULL))
3505: return(NULL);
3506: exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
3507: if (exec == NULL) {
3508: xmlRegexpErrMemory(NULL, "creating execution context");
3509: return(NULL);
3510: }
3511: memset(exec, 0, sizeof(xmlRegExecCtxt));
3512: exec->inputString = NULL;
3513: exec->index = 0;
3514: exec->determinist = 1;
3515: exec->maxRollbacks = 0;
3516: exec->nbRollbacks = 0;
3517: exec->rollbacks = NULL;
3518: exec->status = 0;
3519: exec->comp = comp;
3520: if (comp->compact == NULL)
3521: exec->state = comp->states[0];
3522: exec->transno = 0;
3523: exec->transcount = 0;
3524: exec->callback = callback;
3525: exec->data = data;
3526: if (comp->nbCounters > 0) {
3527: /*
3528: * For error handling, exec->counts is allocated twice the size
3529: * the second half is used to store the data in case of rollback
3530: */
3531: exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)
3532: * 2);
3533: if (exec->counts == NULL) {
3534: xmlRegexpErrMemory(NULL, "creating execution context");
3535: xmlFree(exec);
3536: return(NULL);
3537: }
3538: memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2);
3539: exec->errCounts = &exec->counts[comp->nbCounters];
3540: } else {
3541: exec->counts = NULL;
3542: exec->errCounts = NULL;
3543: }
3544: exec->inputStackMax = 0;
3545: exec->inputStackNr = 0;
3546: exec->inputStack = NULL;
3547: exec->errStateNo = -1;
3548: exec->errString = NULL;
3549: exec->nbPush = 0;
3550: return(exec);
3551: }
3552:
3553: /**
3554: * xmlRegFreeExecCtxt:
3555: * @exec: a regular expression evaulation context
3556: *
3557: * Free the structures associated to a regular expression evaulation context.
3558: */
3559: void
3560: xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
3561: if (exec == NULL)
3562: return;
3563:
3564: if (exec->rollbacks != NULL) {
3565: if (exec->counts != NULL) {
3566: int i;
3567:
3568: for (i = 0;i < exec->maxRollbacks;i++)
3569: if (exec->rollbacks[i].counts != NULL)
3570: xmlFree(exec->rollbacks[i].counts);
3571: }
3572: xmlFree(exec->rollbacks);
3573: }
3574: if (exec->counts != NULL)
3575: xmlFree(exec->counts);
3576: if (exec->inputStack != NULL) {
3577: int i;
3578:
3579: for (i = 0;i < exec->inputStackNr;i++) {
3580: if (exec->inputStack[i].value != NULL)
3581: xmlFree(exec->inputStack[i].value);
3582: }
3583: xmlFree(exec->inputStack);
3584: }
3585: if (exec->errString != NULL)
3586: xmlFree(exec->errString);
3587: xmlFree(exec);
3588: }
3589:
3590: static void
3591: xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
3592: void *data) {
3593: #ifdef DEBUG_PUSH
3594: printf("saving value: %d:%s\n", exec->inputStackNr, value);
3595: #endif
3596: if (exec->inputStackMax == 0) {
3597: exec->inputStackMax = 4;
3598: exec->inputStack = (xmlRegInputTokenPtr)
3599: xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
3600: if (exec->inputStack == NULL) {
3601: xmlRegexpErrMemory(NULL, "pushing input string");
3602: exec->inputStackMax = 0;
3603: return;
3604: }
3605: } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
3606: xmlRegInputTokenPtr tmp;
3607:
3608: exec->inputStackMax *= 2;
3609: tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
3610: exec->inputStackMax * sizeof(xmlRegInputToken));
3611: if (tmp == NULL) {
3612: xmlRegexpErrMemory(NULL, "pushing input string");
3613: exec->inputStackMax /= 2;
3614: return;
3615: }
3616: exec->inputStack = tmp;
3617: }
3618: exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
3619: exec->inputStack[exec->inputStackNr].data = data;
3620: exec->inputStackNr++;
3621: exec->inputStack[exec->inputStackNr].value = NULL;
3622: exec->inputStack[exec->inputStackNr].data = NULL;
3623: }
3624:
3625: /**
3626: * xmlRegStrEqualWildcard:
3627: * @expStr: the string to be evaluated
3628: * @valStr: the validation string
3629: *
3630: * Checks if both strings are equal or have the same content. "*"
3631: * can be used as a wildcard in @valStr; "|" is used as a seperator of
3632: * substrings in both @expStr and @valStr.
3633: *
3634: * Returns 1 if the comparison is satisfied and the number of substrings
3635: * is equal, 0 otherwise.
3636: */
3637:
3638: static int
3639: xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) {
3640: if (expStr == valStr) return(1);
3641: if (expStr == NULL) return(0);
3642: if (valStr == NULL) return(0);
3643: do {
3644: /*
3645: * Eval if we have a wildcard for the current item.
3646: */
3647: if (*expStr != *valStr) {
3648: /* if one of them starts with a wildcard make valStr be it */
3649: if (*valStr == '*') {
3650: const xmlChar *tmp;
3651:
3652: tmp = valStr;
3653: valStr = expStr;
3654: expStr = tmp;
3655: }
3656: if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) {
3657: do {
3658: if (*valStr == XML_REG_STRING_SEPARATOR)
3659: break;
3660: valStr++;
3661: } while (*valStr != 0);
3662: continue;
3663: } else
3664: return(0);
3665: }
3666: expStr++;
3667: valStr++;
3668: } while (*valStr != 0);
3669: if (*expStr != 0)
3670: return (0);
3671: else
3672: return (1);
3673: }
3674:
3675: /**
3676: * xmlRegCompactPushString:
3677: * @exec: a regexp execution context
3678: * @comp: the precompiled exec with a compact table
3679: * @value: a string token input
3680: * @data: data associated to the token to reuse in callbacks
3681: *
3682: * Push one input token in the execution context
3683: *
3684: * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3685: * a negative value in case of error.
3686: */
3687: static int
3688: xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
3689: xmlRegexpPtr comp,
3690: const xmlChar *value,
3691: void *data) {
3692: int state = exec->index;
3693: int i, target;
3694:
3695: if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
3696: return(-1);
3697:
3698: if (value == NULL) {
3699: /*
3700: * are we at a final state ?
3701: */
3702: if (comp->compact[state * (comp->nbstrings + 1)] ==
3703: XML_REGEXP_FINAL_STATE)
3704: return(1);
3705: return(0);
3706: }
3707:
3708: #ifdef DEBUG_PUSH
3709: printf("value pushed: %s\n", value);
3710: #endif
3711:
3712: /*
3713: * Examine all outside transitions from current state
3714: */
3715: for (i = 0;i < comp->nbstrings;i++) {
3716: target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
3717: if ((target > 0) && (target <= comp->nbstates)) {
3718: target--; /* to avoid 0 */
3719: if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) {
3720: exec->index = target;
3721: if ((exec->callback != NULL) && (comp->transdata != NULL)) {
3722: exec->callback(exec->data, value,
3723: comp->transdata[state * comp->nbstrings + i], data);
3724: }
3725: #ifdef DEBUG_PUSH
3726: printf("entering state %d\n", target);
3727: #endif
3728: if (comp->compact[target * (comp->nbstrings + 1)] ==
3729: XML_REGEXP_SINK_STATE)
3730: goto error;
3731:
3732: if (comp->compact[target * (comp->nbstrings + 1)] ==
3733: XML_REGEXP_FINAL_STATE)
3734: return(1);
3735: return(0);
3736: }
3737: }
3738: }
3739: /*
3740: * Failed to find an exit transition out from current state for the
3741: * current token
3742: */
3743: #ifdef DEBUG_PUSH
3744: printf("failed to find a transition for %s on state %d\n", value, state);
3745: #endif
3746: error:
3747: if (exec->errString != NULL)
3748: xmlFree(exec->errString);
3749: exec->errString = xmlStrdup(value);
3750: exec->errStateNo = state;
3751: exec->status = -1;
3752: #ifdef DEBUG_ERR
3753: testerr(exec);
3754: #endif
3755: return(-1);
3756: }
3757:
3758: /**
3759: * xmlRegExecPushStringInternal:
3760: * @exec: a regexp execution context or NULL to indicate the end
3761: * @value: a string token input
3762: * @data: data associated to the token to reuse in callbacks
3763: * @compound: value was assembled from 2 strings
3764: *
3765: * Push one input token in the execution context
3766: *
3767: * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3768: * a negative value in case of error.
3769: */
3770: static int
3771: xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
3772: void *data, int compound) {
3773: xmlRegTransPtr trans;
3774: xmlRegAtomPtr atom;
3775: int ret;
3776: int final = 0;
3777: int progress = 1;
3778:
3779: if (exec == NULL)
3780: return(-1);
3781: if (exec->comp == NULL)
3782: return(-1);
3783: if (exec->status != 0)
3784: return(exec->status);
3785:
3786: if (exec->comp->compact != NULL)
3787: return(xmlRegCompactPushString(exec, exec->comp, value, data));
3788:
3789: if (value == NULL) {
3790: if (exec->state->type == XML_REGEXP_FINAL_STATE)
3791: return(1);
3792: final = 1;
3793: }
3794:
3795: #ifdef DEBUG_PUSH
3796: printf("value pushed: %s\n", value);
3797: #endif
3798: /*
3799: * If we have an active rollback stack push the new value there
3800: * and get back to where we were left
3801: */
3802: if ((value != NULL) && (exec->inputStackNr > 0)) {
3803: xmlFARegExecSaveInputString(exec, value, data);
3804: value = exec->inputStack[exec->index].value;
3805: data = exec->inputStack[exec->index].data;
3806: #ifdef DEBUG_PUSH
3807: printf("value loaded: %s\n", value);
3808: #endif
3809: }
3810:
3811: while ((exec->status == 0) &&
3812: ((value != NULL) ||
3813: ((final == 1) &&
3814: (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3815:
3816: /*
3817: * End of input on non-terminal state, rollback, however we may
3818: * still have epsilon like transition for counted transitions
3819: * on counters, in that case don't break too early.
3820: */
3821: if ((value == NULL) && (exec->counts == NULL))
3822: goto rollback;
3823:
3824: exec->transcount = 0;
3825: for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3826: trans = &exec->state->trans[exec->transno];
3827: if (trans->to < 0)
3828: continue;
3829: atom = trans->atom;
3830: ret = 0;
3831: if (trans->count == REGEXP_ALL_LAX_COUNTER) {
3832: int i;
3833: int count;
3834: xmlRegTransPtr t;
3835: xmlRegCounterPtr counter;
3836:
3837: ret = 0;
3838:
3839: #ifdef DEBUG_PUSH
3840: printf("testing all lax %d\n", trans->count);
3841: #endif
3842: /*
3843: * Check all counted transitions from the current state
3844: */
3845: if ((value == NULL) && (final)) {
3846: ret = 1;
3847: } else if (value != NULL) {
3848: for (i = 0;i < exec->state->nbTrans;i++) {
3849: t = &exec->state->trans[i];
3850: if ((t->counter < 0) || (t == trans))
3851: continue;
3852: counter = &exec->comp->counters[t->counter];
3853: count = exec->counts[t->counter];
3854: if ((count < counter->max) &&
3855: (t->atom != NULL) &&
3856: (xmlStrEqual(value, t->atom->valuep))) {
3857: ret = 0;
3858: break;
3859: }
3860: if ((count >= counter->min) &&
3861: (count < counter->max) &&
3862: (t->atom != NULL) &&
3863: (xmlStrEqual(value, t->atom->valuep))) {
3864: ret = 1;
3865: break;
3866: }
3867: }
3868: }
3869: } else if (trans->count == REGEXP_ALL_COUNTER) {
3870: int i;
3871: int count;
3872: xmlRegTransPtr t;
3873: xmlRegCounterPtr counter;
3874:
3875: ret = 1;
3876:
3877: #ifdef DEBUG_PUSH
3878: printf("testing all %d\n", trans->count);
3879: #endif
3880: /*
3881: * Check all counted transitions from the current state
3882: */
3883: for (i = 0;i < exec->state->nbTrans;i++) {
3884: t = &exec->state->trans[i];
3885: if ((t->counter < 0) || (t == trans))
3886: continue;
3887: counter = &exec->comp->counters[t->counter];
3888: count = exec->counts[t->counter];
3889: if ((count < counter->min) || (count > counter->max)) {
3890: ret = 0;
3891: break;
3892: }
3893: }
3894: } else if (trans->count >= 0) {
3895: int count;
3896: xmlRegCounterPtr counter;
3897:
3898: /*
3899: * A counted transition.
3900: */
3901:
3902: count = exec->counts[trans->count];
3903: counter = &exec->comp->counters[trans->count];
3904: #ifdef DEBUG_PUSH
3905: printf("testing count %d: val %d, min %d, max %d\n",
3906: trans->count, count, counter->min, counter->max);
3907: #endif
3908: ret = ((count >= counter->min) && (count <= counter->max));
3909: } else if (atom == NULL) {
3910: fprintf(stderr, "epsilon transition left at runtime\n");
3911: exec->status = -2;
3912: break;
3913: } else if (value != NULL) {
3914: ret = xmlRegStrEqualWildcard(atom->valuep, value);
3915: if (atom->neg) {
3916: ret = !ret;
3917: if (!compound)
3918: ret = 0;
3919: }
3920: if ((ret == 1) && (trans->counter >= 0)) {
3921: xmlRegCounterPtr counter;
3922: int count;
3923:
3924: count = exec->counts[trans->counter];
3925: counter = &exec->comp->counters[trans->counter];
3926: if (count >= counter->max)
3927: ret = 0;
3928: }
3929:
3930: if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
3931: xmlRegStatePtr to = exec->comp->states[trans->to];
3932:
3933: /*
3934: * this is a multiple input sequence
3935: */
3936: if (exec->state->nbTrans > exec->transno + 1) {
3937: if (exec->inputStackNr <= 0) {
3938: xmlFARegExecSaveInputString(exec, value, data);
3939: }
3940: xmlFARegExecSave(exec);
3941: }
3942: exec->transcount = 1;
3943: do {
3944: /*
3945: * Try to progress as much as possible on the input
3946: */
3947: if (exec->transcount == atom->max) {
3948: break;
3949: }
3950: exec->index++;
3951: value = exec->inputStack[exec->index].value;
3952: data = exec->inputStack[exec->index].data;
3953: #ifdef DEBUG_PUSH
3954: printf("value loaded: %s\n", value);
3955: #endif
3956:
3957: /*
3958: * End of input: stop here
3959: */
3960: if (value == NULL) {
3961: exec->index --;
3962: break;
3963: }
3964: if (exec->transcount >= atom->min) {
3965: int transno = exec->transno;
3966: xmlRegStatePtr state = exec->state;
3967:
3968: /*
3969: * The transition is acceptable save it
3970: */
3971: exec->transno = -1; /* trick */
3972: exec->state = to;
3973: if (exec->inputStackNr <= 0) {
3974: xmlFARegExecSaveInputString(exec, value, data);
3975: }
3976: xmlFARegExecSave(exec);
3977: exec->transno = transno;
3978: exec->state = state;
3979: }
3980: ret = xmlStrEqual(value, atom->valuep);
3981: exec->transcount++;
3982: } while (ret == 1);
3983: if (exec->transcount < atom->min)
3984: ret = 0;
3985:
3986: /*
3987: * If the last check failed but one transition was found
3988: * possible, rollback
3989: */
3990: if (ret < 0)
3991: ret = 0;
3992: if (ret == 0) {
3993: goto rollback;
3994: }
3995: }
3996: }
3997: if (ret == 1) {
3998: if ((exec->callback != NULL) && (atom != NULL) &&
3999: (data != NULL)) {
4000: exec->callback(exec->data, atom->valuep,
4001: atom->data, data);
4002: }
4003: if (exec->state->nbTrans > exec->transno + 1) {
4004: if (exec->inputStackNr <= 0) {
4005: xmlFARegExecSaveInputString(exec, value, data);
4006: }
4007: xmlFARegExecSave(exec);
4008: }
4009: if (trans->counter >= 0) {
4010: #ifdef DEBUG_PUSH
4011: printf("Increasing count %d\n", trans->counter);
4012: #endif
4013: exec->counts[trans->counter]++;
4014: }
4015: if ((trans->count >= 0) &&
4016: (trans->count < REGEXP_ALL_COUNTER)) {
4017: #ifdef DEBUG_REGEXP_EXEC
4018: printf("resetting count %d on transition\n",
4019: trans->count);
4020: #endif
4021: exec->counts[trans->count] = 0;
4022: }
4023: #ifdef DEBUG_PUSH
4024: printf("entering state %d\n", trans->to);
4025: #endif
4026: if ((exec->comp->states[trans->to] != NULL) &&
4027: (exec->comp->states[trans->to]->type ==
4028: XML_REGEXP_SINK_STATE)) {
4029: /*
4030: * entering a sink state, save the current state as error
4031: * state.
4032: */
4033: if (exec->errString != NULL)
4034: xmlFree(exec->errString);
4035: exec->errString = xmlStrdup(value);
4036: exec->errState = exec->state;
4037: memcpy(exec->errCounts, exec->counts,
4038: exec->comp->nbCounters * sizeof(int));
4039: }
4040: exec->state = exec->comp->states[trans->to];
4041: exec->transno = 0;
4042: if (trans->atom != NULL) {
4043: if (exec->inputStack != NULL) {
4044: exec->index++;
4045: if (exec->index < exec->inputStackNr) {
4046: value = exec->inputStack[exec->index].value;
4047: data = exec->inputStack[exec->index].data;
4048: #ifdef DEBUG_PUSH
4049: printf("value loaded: %s\n", value);
4050: #endif
4051: } else {
4052: value = NULL;
4053: data = NULL;
4054: #ifdef DEBUG_PUSH
4055: printf("end of input\n");
4056: #endif
4057: }
4058: } else {
4059: value = NULL;
4060: data = NULL;
4061: #ifdef DEBUG_PUSH
4062: printf("end of input\n");
4063: #endif
4064: }
4065: }
4066: goto progress;
4067: } else if (ret < 0) {
4068: exec->status = -4;
4069: break;
4070: }
4071: }
4072: if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4073: rollback:
4074: /*
4075: * if we didn't yet rollback on the current input
4076: * store the current state as the error state.
4077: */
4078: if ((progress) && (exec->state != NULL) &&
4079: (exec->state->type != XML_REGEXP_SINK_STATE)) {
4080: progress = 0;
4081: if (exec->errString != NULL)
4082: xmlFree(exec->errString);
4083: exec->errString = xmlStrdup(value);
4084: exec->errState = exec->state;
4085: memcpy(exec->errCounts, exec->counts,
4086: exec->comp->nbCounters * sizeof(int));
4087: }
4088:
4089: /*
4090: * Failed to find a way out
4091: */
4092: exec->determinist = 0;
4093: xmlFARegExecRollBack(exec);
4094: if (exec->status == 0) {
4095: value = exec->inputStack[exec->index].value;
4096: data = exec->inputStack[exec->index].data;
4097: #ifdef DEBUG_PUSH
4098: printf("value loaded: %s\n", value);
4099: #endif
4100: }
4101: }
4102: continue;
4103: progress:
4104: progress = 1;
4105: continue;
4106: }
4107: if (exec->status == 0) {
4108: return(exec->state->type == XML_REGEXP_FINAL_STATE);
4109: }
4110: #ifdef DEBUG_ERR
4111: if (exec->status < 0) {
4112: testerr(exec);
4113: }
4114: #endif
4115: return(exec->status);
4116: }
4117:
4118: /**
4119: * xmlRegExecPushString:
4120: * @exec: a regexp execution context or NULL to indicate the end
4121: * @value: a string token input
4122: * @data: data associated to the token to reuse in callbacks
4123: *
4124: * Push one input token in the execution context
4125: *
4126: * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4127: * a negative value in case of error.
4128: */
4129: int
4130: xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
4131: void *data) {
4132: return(xmlRegExecPushStringInternal(exec, value, data, 0));
4133: }
4134:
4135: /**
4136: * xmlRegExecPushString2:
4137: * @exec: a regexp execution context or NULL to indicate the end
4138: * @value: the first string token input
4139: * @value2: the second string token input
4140: * @data: data associated to the token to reuse in callbacks
4141: *
4142: * Push one input token in the execution context
4143: *
4144: * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4145: * a negative value in case of error.
4146: */
4147: int
4148: xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
4149: const xmlChar *value2, void *data) {
4150: xmlChar buf[150];
4151: int lenn, lenp, ret;
4152: xmlChar *str;
4153:
4154: if (exec == NULL)
4155: return(-1);
4156: if (exec->comp == NULL)
4157: return(-1);
4158: if (exec->status != 0)
4159: return(exec->status);
4160:
4161: if (value2 == NULL)
4162: return(xmlRegExecPushString(exec, value, data));
4163:
4164: lenn = strlen((char *) value2);
4165: lenp = strlen((char *) value);
4166:
4167: if (150 < lenn + lenp + 2) {
4168: str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
4169: if (str == NULL) {
4170: exec->status = -1;
4171: return(-1);
4172: }
4173: } else {
4174: str = buf;
4175: }
4176: memcpy(&str[0], value, lenp);
4177: str[lenp] = XML_REG_STRING_SEPARATOR;
4178: memcpy(&str[lenp + 1], value2, lenn);
4179: str[lenn + lenp + 1] = 0;
4180:
4181: if (exec->comp->compact != NULL)
4182: ret = xmlRegCompactPushString(exec, exec->comp, str, data);
4183: else
4184: ret = xmlRegExecPushStringInternal(exec, str, data, 1);
4185:
4186: if (str != buf)
4187: xmlFree(str);
4188: return(ret);
4189: }
4190:
4191: /**
4192: * xmlRegExecGetValues:
4193: * @exec: a regexp execution context
4194: * @err: error extraction or normal one
4195: * @nbval: pointer to the number of accepted values IN/OUT
4196: * @nbneg: return number of negative transitions
4197: * @values: pointer to the array of acceptable values
4198: * @terminal: return value if this was a terminal state
4199: *
4200: * Extract informations from the regexp execution, internal routine to
4201: * implement xmlRegExecNextValues() and xmlRegExecErrInfo()
4202: *
4203: * Returns: 0 in case of success or -1 in case of error.
4204: */
4205: static int
4206: xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err,
4207: int *nbval, int *nbneg,
4208: xmlChar **values, int *terminal) {
4209: int maxval;
4210: int nb = 0;
4211:
4212: if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) ||
4213: (values == NULL) || (*nbval <= 0))
4214: return(-1);
4215:
4216: maxval = *nbval;
4217: *nbval = 0;
4218: *nbneg = 0;
4219: if ((exec->comp != NULL) && (exec->comp->compact != NULL)) {
4220: xmlRegexpPtr comp;
4221: int target, i, state;
4222:
4223: comp = exec->comp;
4224:
4225: if (err) {
4226: if (exec->errStateNo == -1) return(-1);
4227: state = exec->errStateNo;
4228: } else {
4229: state = exec->index;
4230: }
4231: if (terminal != NULL) {
4232: if (comp->compact[state * (comp->nbstrings + 1)] ==
4233: XML_REGEXP_FINAL_STATE)
4234: *terminal = 1;
4235: else
4236: *terminal = 0;
4237: }
4238: for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4239: target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4240: if ((target > 0) && (target <= comp->nbstates) &&
4241: (comp->compact[(target - 1) * (comp->nbstrings + 1)] !=
4242: XML_REGEXP_SINK_STATE)) {
4243: values[nb++] = comp->stringMap[i];
4244: (*nbval)++;
4245: }
4246: }
4247: for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4248: target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4249: if ((target > 0) && (target <= comp->nbstates) &&
4250: (comp->compact[(target - 1) * (comp->nbstrings + 1)] ==
4251: XML_REGEXP_SINK_STATE)) {
4252: values[nb++] = comp->stringMap[i];
4253: (*nbneg)++;
4254: }
4255: }
4256: } else {
4257: int transno;
4258: xmlRegTransPtr trans;
4259: xmlRegAtomPtr atom;
4260: xmlRegStatePtr state;
4261:
4262: if (terminal != NULL) {
4263: if (exec->state->type == XML_REGEXP_FINAL_STATE)
4264: *terminal = 1;
4265: else
4266: *terminal = 0;
4267: }
4268:
4269: if (err) {
4270: if (exec->errState == NULL) return(-1);
4271: state = exec->errState;
4272: } else {
4273: if (exec->state == NULL) return(-1);
4274: state = exec->state;
4275: }
4276: for (transno = 0;
4277: (transno < state->nbTrans) && (nb < maxval);
4278: transno++) {
4279: trans = &state->trans[transno];
4280: if (trans->to < 0)
4281: continue;
4282: atom = trans->atom;
4283: if ((atom == NULL) || (atom->valuep == NULL))
4284: continue;
4285: if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4286: /* this should not be reached but ... */
4287: TODO;
4288: } else if (trans->count == REGEXP_ALL_COUNTER) {
4289: /* this should not be reached but ... */
4290: TODO;
4291: } else if (trans->counter >= 0) {
4292: xmlRegCounterPtr counter = NULL;
4293: int count;
4294:
4295: if (err)
4296: count = exec->errCounts[trans->counter];
4297: else
4298: count = exec->counts[trans->counter];
4299: if (exec->comp != NULL)
4300: counter = &exec->comp->counters[trans->counter];
4301: if ((counter == NULL) || (count < counter->max)) {
4302: if (atom->neg)
4303: values[nb++] = (xmlChar *) atom->valuep2;
4304: else
4305: values[nb++] = (xmlChar *) atom->valuep;
4306: (*nbval)++;
4307: }
4308: } else {
4309: if ((exec->comp->states[trans->to] != NULL) &&
4310: (exec->comp->states[trans->to]->type !=
4311: XML_REGEXP_SINK_STATE)) {
4312: if (atom->neg)
4313: values[nb++] = (xmlChar *) atom->valuep2;
4314: else
4315: values[nb++] = (xmlChar *) atom->valuep;
4316: (*nbval)++;
4317: }
4318: }
4319: }
4320: for (transno = 0;
4321: (transno < state->nbTrans) && (nb < maxval);
4322: transno++) {
4323: trans = &state->trans[transno];
4324: if (trans->to < 0)
4325: continue;
4326: atom = trans->atom;
4327: if ((atom == NULL) || (atom->valuep == NULL))
4328: continue;
4329: if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4330: continue;
4331: } else if (trans->count == REGEXP_ALL_COUNTER) {
4332: continue;
4333: } else if (trans->counter >= 0) {
4334: continue;
4335: } else {
4336: if ((exec->comp->states[trans->to] != NULL) &&
4337: (exec->comp->states[trans->to]->type ==
4338: XML_REGEXP_SINK_STATE)) {
4339: if (atom->neg)
4340: values[nb++] = (xmlChar *) atom->valuep2;
4341: else
4342: values[nb++] = (xmlChar *) atom->valuep;
4343: (*nbneg)++;
4344: }
4345: }
4346: }
4347: }
4348: return(0);
4349: }
4350:
4351: /**
4352: * xmlRegExecNextValues:
4353: * @exec: a regexp execution context
4354: * @nbval: pointer to the number of accepted values IN/OUT
4355: * @nbneg: return number of negative transitions
4356: * @values: pointer to the array of acceptable values
4357: * @terminal: return value if this was a terminal state
4358: *
4359: * Extract informations from the regexp execution,
4360: * the parameter @values must point to an array of @nbval string pointers
4361: * on return nbval will contain the number of possible strings in that
4362: * state and the @values array will be updated with them. The string values
4363: * returned will be freed with the @exec context and don't need to be
4364: * deallocated.
4365: *
4366: * Returns: 0 in case of success or -1 in case of error.
4367: */
4368: int
4369: xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg,
4370: xmlChar **values, int *terminal) {
4371: return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal));
4372: }
4373:
4374: /**
4375: * xmlRegExecErrInfo:
4376: * @exec: a regexp execution context generating an error
4377: * @string: return value for the error string
4378: * @nbval: pointer to the number of accepted values IN/OUT
4379: * @nbneg: return number of negative transitions
4380: * @values: pointer to the array of acceptable values
4381: * @terminal: return value if this was a terminal state
4382: *
4383: * Extract error informations from the regexp execution, the parameter
4384: * @string will be updated with the value pushed and not accepted,
4385: * the parameter @values must point to an array of @nbval string pointers
4386: * on return nbval will contain the number of possible strings in that
4387: * state and the @values array will be updated with them. The string values
4388: * returned will be freed with the @exec context and don't need to be
4389: * deallocated.
4390: *
4391: * Returns: 0 in case of success or -1 in case of error.
4392: */
4393: int
4394: xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
4395: int *nbval, int *nbneg, xmlChar **values, int *terminal) {
4396: if (exec == NULL)
4397: return(-1);
4398: if (string != NULL) {
4399: if (exec->status != 0)
4400: *string = exec->errString;
4401: else
4402: *string = NULL;
4403: }
4404: return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal));
4405: }
4406:
4407: #ifdef DEBUG_ERR
4408: static void testerr(xmlRegExecCtxtPtr exec) {
4409: const xmlChar *string;
4410: xmlChar *values[5];
4411: int nb = 5;
4412: int nbneg;
4413: int terminal;
4414: xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal);
4415: }
4416: #endif
4417:
4418: #if 0
4419: static int
4420: xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
4421: xmlRegTransPtr trans;
4422: xmlRegAtomPtr atom;
4423: int ret;
4424: int codepoint, len;
4425:
4426: if (exec == NULL)
4427: return(-1);
4428: if (exec->status != 0)
4429: return(exec->status);
4430:
4431: while ((exec->status == 0) &&
4432: ((exec->inputString[exec->index] != 0) ||
4433: (exec->state->type != XML_REGEXP_FINAL_STATE))) {
4434:
4435: /*
4436: * End of input on non-terminal state, rollback, however we may
4437: * still have epsilon like transition for counted transitions
4438: * on counters, in that case don't break too early.
4439: */
4440: if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
4441: goto rollback;
4442:
4443: exec->transcount = 0;
4444: for (;exec->transno < exec->state->nbTrans;exec->transno++) {
4445: trans = &exec->state->trans[exec->transno];
4446: if (trans->to < 0)
4447: continue;
4448: atom = trans->atom;
4449: ret = 0;
4450: if (trans->count >= 0) {
4451: int count;
4452: xmlRegCounterPtr counter;
4453:
4454: /*
4455: * A counted transition.
4456: */
4457:
4458: count = exec->counts[trans->count];
4459: counter = &exec->comp->counters[trans->count];
4460: #ifdef DEBUG_REGEXP_EXEC
4461: printf("testing count %d: val %d, min %d, max %d\n",
4462: trans->count, count, counter->min, counter->max);
4463: #endif
4464: ret = ((count >= counter->min) && (count <= counter->max));
4465: } else if (atom == NULL) {
4466: fprintf(stderr, "epsilon transition left at runtime\n");
4467: exec->status = -2;
4468: break;
4469: } else if (exec->inputString[exec->index] != 0) {
4470: codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
4471: ret = xmlRegCheckCharacter(atom, codepoint);
4472: if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
4473: xmlRegStatePtr to = exec->comp->states[trans->to];
4474:
4475: /*
4476: * this is a multiple input sequence
4477: */
4478: if (exec->state->nbTrans > exec->transno + 1) {
4479: xmlFARegExecSave(exec);
4480: }
4481: exec->transcount = 1;
4482: do {
4483: /*
4484: * Try to progress as much as possible on the input
4485: */
4486: if (exec->transcount == atom->max) {
4487: break;
4488: }
4489: exec->index += len;
4490: /*
4491: * End of input: stop here
4492: */
4493: if (exec->inputString[exec->index] == 0) {
4494: exec->index -= len;
4495: break;
4496: }
4497: if (exec->transcount >= atom->min) {
4498: int transno = exec->transno;
4499: xmlRegStatePtr state = exec->state;
4500:
4501: /*
4502: * The transition is acceptable save it
4503: */
4504: exec->transno = -1; /* trick */
4505: exec->state = to;
4506: xmlFARegExecSave(exec);
4507: exec->transno = transno;
4508: exec->state = state;
4509: }
4510: codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
4511: len);
4512: ret = xmlRegCheckCharacter(atom, codepoint);
4513: exec->transcount++;
4514: } while (ret == 1);
4515: if (exec->transcount < atom->min)
4516: ret = 0;
4517:
4518: /*
4519: * If the last check failed but one transition was found
4520: * possible, rollback
4521: */
4522: if (ret < 0)
4523: ret = 0;
4524: if (ret == 0) {
4525: goto rollback;
4526: }
4527: }
4528: }
4529: if (ret == 1) {
4530: if (exec->state->nbTrans > exec->transno + 1) {
4531: xmlFARegExecSave(exec);
4532: }
4533: /*
4534: * restart count for expressions like this ((abc){2})*
4535: */
4536: if (trans->count >= 0) {
4537: #ifdef DEBUG_REGEXP_EXEC
4538: printf("Reset count %d\n", trans->count);
4539: #endif
4540: exec->counts[trans->count] = 0;
4541: }
4542: if (trans->counter >= 0) {
4543: #ifdef DEBUG_REGEXP_EXEC
4544: printf("Increasing count %d\n", trans->counter);
4545: #endif
4546: exec->counts[trans->counter]++;
4547: }
4548: #ifdef DEBUG_REGEXP_EXEC
4549: printf("entering state %d\n", trans->to);
4550: #endif
4551: exec->state = exec->comp->states[trans->to];
4552: exec->transno = 0;
4553: if (trans->atom != NULL) {
4554: exec->index += len;
4555: }
4556: goto progress;
4557: } else if (ret < 0) {
4558: exec->status = -4;
4559: break;
4560: }
4561: }
4562: if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4563: rollback:
4564: /*
4565: * Failed to find a way out
4566: */
4567: exec->determinist = 0;
4568: xmlFARegExecRollBack(exec);
4569: }
4570: progress:
4571: continue;
4572: }
4573: }
4574: #endif
4575: /************************************************************************
4576: * *
4577: * Parser for the Schemas Datatype Regular Expressions *
4578: * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
4579: * *
4580: ************************************************************************/
4581:
4582: /**
4583: * xmlFAIsChar:
4584: * @ctxt: a regexp parser context
4585: *
4586: * [10] Char ::= [^.\?*+()|#x5B#x5D]
4587: */
4588: static int
4589: xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
4590: int cur;
4591: int len;
4592:
4593: cur = CUR_SCHAR(ctxt->cur, len);
4594: if ((cur == '.') || (cur == '\\') || (cur == '?') ||
4595: (cur == '*') || (cur == '+') || (cur == '(') ||
4596: (cur == ')') || (cur == '|') || (cur == 0x5B) ||
4597: (cur == 0x5D) || (cur == 0))
4598: return(-1);
4599: return(cur);
4600: }
4601:
4602: /**
4603: * xmlFAParseCharProp:
4604: * @ctxt: a regexp parser context
4605: *
4606: * [27] charProp ::= IsCategory | IsBlock
4607: * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
4608: * Separators | Symbols | Others
4609: * [29] Letters ::= 'L' [ultmo]?
4610: * [30] Marks ::= 'M' [nce]?
4611: * [31] Numbers ::= 'N' [dlo]?
4612: * [32] Punctuation ::= 'P' [cdseifo]?
4613: * [33] Separators ::= 'Z' [slp]?
4614: * [34] Symbols ::= 'S' [mcko]?
4615: * [35] Others ::= 'C' [cfon]?
4616: * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
4617: */
4618: static void
4619: xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
4620: int cur;
4621: xmlRegAtomType type = (xmlRegAtomType) 0;
4622: xmlChar *blockName = NULL;
4623:
4624: cur = CUR;
4625: if (cur == 'L') {
4626: NEXT;
4627: cur = CUR;
4628: if (cur == 'u') {
4629: NEXT;
4630: type = XML_REGEXP_LETTER_UPPERCASE;
4631: } else if (cur == 'l') {
4632: NEXT;
4633: type = XML_REGEXP_LETTER_LOWERCASE;
4634: } else if (cur == 't') {
4635: NEXT;
4636: type = XML_REGEXP_LETTER_TITLECASE;
4637: } else if (cur == 'm') {
4638: NEXT;
4639: type = XML_REGEXP_LETTER_MODIFIER;
4640: } else if (cur == 'o') {
4641: NEXT;
4642: type = XML_REGEXP_LETTER_OTHERS;
4643: } else {
4644: type = XML_REGEXP_LETTER;
4645: }
4646: } else if (cur == 'M') {
4647: NEXT;
4648: cur = CUR;
4649: if (cur == 'n') {
4650: NEXT;
4651: /* nonspacing */
4652: type = XML_REGEXP_MARK_NONSPACING;
4653: } else if (cur == 'c') {
4654: NEXT;
4655: /* spacing combining */
4656: type = XML_REGEXP_MARK_SPACECOMBINING;
4657: } else if (cur == 'e') {
4658: NEXT;
4659: /* enclosing */
4660: type = XML_REGEXP_MARK_ENCLOSING;
4661: } else {
4662: /* all marks */
4663: type = XML_REGEXP_MARK;
4664: }
4665: } else if (cur == 'N') {
4666: NEXT;
4667: cur = CUR;
4668: if (cur == 'd') {
4669: NEXT;
4670: /* digital */
4671: type = XML_REGEXP_NUMBER_DECIMAL;
4672: } else if (cur == 'l') {
4673: NEXT;
4674: /* letter */
4675: type = XML_REGEXP_NUMBER_LETTER;
4676: } else if (cur == 'o') {
4677: NEXT;
4678: /* other */
4679: type = XML_REGEXP_NUMBER_OTHERS;
4680: } else {
4681: /* all numbers */
4682: type = XML_REGEXP_NUMBER;
4683: }
4684: } else if (cur == 'P') {
4685: NEXT;
4686: cur = CUR;
4687: if (cur == 'c') {
4688: NEXT;
4689: /* connector */
4690: type = XML_REGEXP_PUNCT_CONNECTOR;
4691: } else if (cur == 'd') {
4692: NEXT;
4693: /* dash */
4694: type = XML_REGEXP_PUNCT_DASH;
4695: } else if (cur == 's') {
4696: NEXT;
4697: /* open */
4698: type = XML_REGEXP_PUNCT_OPEN;
4699: } else if (cur == 'e') {
4700: NEXT;
4701: /* close */
4702: type = XML_REGEXP_PUNCT_CLOSE;
4703: } else if (cur == 'i') {
4704: NEXT;
4705: /* initial quote */
4706: type = XML_REGEXP_PUNCT_INITQUOTE;
4707: } else if (cur == 'f') {
4708: NEXT;
4709: /* final quote */
4710: type = XML_REGEXP_PUNCT_FINQUOTE;
4711: } else if (cur == 'o') {
4712: NEXT;
4713: /* other */
4714: type = XML_REGEXP_PUNCT_OTHERS;
4715: } else {
4716: /* all punctuation */
4717: type = XML_REGEXP_PUNCT;
4718: }
4719: } else if (cur == 'Z') {
4720: NEXT;
4721: cur = CUR;
4722: if (cur == 's') {
4723: NEXT;
4724: /* space */
4725: type = XML_REGEXP_SEPAR_SPACE;
4726: } else if (cur == 'l') {
4727: NEXT;
4728: /* line */
4729: type = XML_REGEXP_SEPAR_LINE;
4730: } else if (cur == 'p') {
4731: NEXT;
4732: /* paragraph */
4733: type = XML_REGEXP_SEPAR_PARA;
4734: } else {
4735: /* all separators */
4736: type = XML_REGEXP_SEPAR;
4737: }
4738: } else if (cur == 'S') {
4739: NEXT;
4740: cur = CUR;
4741: if (cur == 'm') {
4742: NEXT;
4743: type = XML_REGEXP_SYMBOL_MATH;
4744: /* math */
4745: } else if (cur == 'c') {
4746: NEXT;
4747: type = XML_REGEXP_SYMBOL_CURRENCY;
4748: /* currency */
4749: } else if (cur == 'k') {
4750: NEXT;
4751: type = XML_REGEXP_SYMBOL_MODIFIER;
4752: /* modifiers */
4753: } else if (cur == 'o') {
4754: NEXT;
4755: type = XML_REGEXP_SYMBOL_OTHERS;
4756: /* other */
4757: } else {
4758: /* all symbols */
4759: type = XML_REGEXP_SYMBOL;
4760: }
4761: } else if (cur == 'C') {
4762: NEXT;
4763: cur = CUR;
4764: if (cur == 'c') {
4765: NEXT;
4766: /* control */
4767: type = XML_REGEXP_OTHER_CONTROL;
4768: } else if (cur == 'f') {
4769: NEXT;
4770: /* format */
4771: type = XML_REGEXP_OTHER_FORMAT;
4772: } else if (cur == 'o') {
4773: NEXT;
4774: /* private use */
4775: type = XML_REGEXP_OTHER_PRIVATE;
4776: } else if (cur == 'n') {
4777: NEXT;
4778: /* not assigned */
4779: type = XML_REGEXP_OTHER_NA;
4780: } else {
4781: /* all others */
4782: type = XML_REGEXP_OTHER;
4783: }
4784: } else if (cur == 'I') {
4785: const xmlChar *start;
4786: NEXT;
4787: cur = CUR;
4788: if (cur != 's') {
4789: ERROR("IsXXXX expected");
4790: return;
4791: }
4792: NEXT;
4793: start = ctxt->cur;
4794: cur = CUR;
4795: if (((cur >= 'a') && (cur <= 'z')) ||
4796: ((cur >= 'A') && (cur <= 'Z')) ||
4797: ((cur >= '0') && (cur <= '9')) ||
4798: (cur == 0x2D)) {
4799: NEXT;
4800: cur = CUR;
4801: while (((cur >= 'a') && (cur <= 'z')) ||
4802: ((cur >= 'A') && (cur <= 'Z')) ||
4803: ((cur >= '0') && (cur <= '9')) ||
4804: (cur == 0x2D)) {
4805: NEXT;
4806: cur = CUR;
4807: }
4808: }
4809: type = XML_REGEXP_BLOCK_NAME;
4810: blockName = xmlStrndup(start, ctxt->cur - start);
4811: } else {
4812: ERROR("Unknown char property");
4813: return;
4814: }
4815: if (ctxt->atom == NULL) {
4816: ctxt->atom = xmlRegNewAtom(ctxt, type);
4817: if (ctxt->atom != NULL)
4818: ctxt->atom->valuep = blockName;
4819: } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4820: xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4821: type, 0, 0, blockName);
4822: }
4823: }
4824:
4825: /**
4826: * xmlFAParseCharClassEsc:
4827: * @ctxt: a regexp parser context
4828: *
4829: * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
4830: * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
4831: * [25] catEsc ::= '\p{' charProp '}'
4832: * [26] complEsc ::= '\P{' charProp '}'
4833: * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
4834: */
4835: static void
4836: xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
4837: int cur;
4838:
4839: if (CUR == '.') {
4840: if (ctxt->atom == NULL) {
4841: ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
4842: } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4843: xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4844: XML_REGEXP_ANYCHAR, 0, 0, NULL);
4845: }
4846: NEXT;
4847: return;
4848: }
4849: if (CUR != '\\') {
4850: ERROR("Escaped sequence: expecting \\");
4851: return;
4852: }
4853: NEXT;
4854: cur = CUR;
4855: if (cur == 'p') {
4856: NEXT;
4857: if (CUR != '{') {
4858: ERROR("Expecting '{'");
4859: return;
4860: }
4861: NEXT;
4862: xmlFAParseCharProp(ctxt);
4863: if (CUR != '}') {
4864: ERROR("Expecting '}'");
4865: return;
4866: }
4867: NEXT;
4868: } else if (cur == 'P') {
4869: NEXT;
4870: if (CUR != '{') {
4871: ERROR("Expecting '{'");
4872: return;
4873: }
4874: NEXT;
4875: xmlFAParseCharProp(ctxt);
4876: ctxt->atom->neg = 1;
4877: if (CUR != '}') {
4878: ERROR("Expecting '}'");
4879: return;
4880: }
4881: NEXT;
4882: } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
4883: (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
4884: (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
4885: (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
4886: (cur == 0x5E)) {
4887: if (ctxt->atom == NULL) {
4888: ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
4889: if (ctxt->atom != NULL) {
4890: switch (cur) {
4891: case 'n':
4892: ctxt->atom->codepoint = '\n';
4893: break;
4894: case 'r':
4895: ctxt->atom->codepoint = '\r';
4896: break;
4897: case 't':
4898: ctxt->atom->codepoint = '\t';
4899: break;
4900: default:
4901: ctxt->atom->codepoint = cur;
4902: }
4903: }
4904: } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4905: switch (cur) {
4906: case 'n':
4907: cur = '\n';
4908: break;
4909: case 'r':
4910: cur = '\r';
4911: break;
4912: case 't':
4913: cur = '\t';
4914: break;
4915: }
4916: xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4917: XML_REGEXP_CHARVAL, cur, cur, NULL);
4918: }
4919: NEXT;
4920: } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
4921: (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
4922: (cur == 'w') || (cur == 'W')) {
4923: xmlRegAtomType type = XML_REGEXP_ANYSPACE;
4924:
4925: switch (cur) {
4926: case 's':
4927: type = XML_REGEXP_ANYSPACE;
4928: break;
4929: case 'S':
4930: type = XML_REGEXP_NOTSPACE;
4931: break;
4932: case 'i':
4933: type = XML_REGEXP_INITNAME;
4934: break;
4935: case 'I':
4936: type = XML_REGEXP_NOTINITNAME;
4937: break;
4938: case 'c':
4939: type = XML_REGEXP_NAMECHAR;
4940: break;
4941: case 'C':
4942: type = XML_REGEXP_NOTNAMECHAR;
4943: break;
4944: case 'd':
4945: type = XML_REGEXP_DECIMAL;
4946: break;
4947: case 'D':
4948: type = XML_REGEXP_NOTDECIMAL;
4949: break;
4950: case 'w':
4951: type = XML_REGEXP_REALCHAR;
4952: break;
4953: case 'W':
4954: type = XML_REGEXP_NOTREALCHAR;
4955: break;
4956: }
4957: NEXT;
4958: if (ctxt->atom == NULL) {
4959: ctxt->atom = xmlRegNewAtom(ctxt, type);
4960: } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4961: xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4962: type, 0, 0, NULL);
4963: }
4964: } else {
4965: ERROR("Wrong escape sequence, misuse of character '\\'");
4966: }
4967: }
4968:
4969: /**
4970: * xmlFAParseCharRange:
4971: * @ctxt: a regexp parser context
4972: *
4973: * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
4974: * [18] seRange ::= charOrEsc '-' charOrEsc
4975: * [20] charOrEsc ::= XmlChar | SingleCharEsc
4976: * [21] XmlChar ::= [^\#x2D#x5B#x5D]
4977: * [22] XmlCharIncDash ::= [^\#x5B#x5D]
4978: */
4979: static void
4980: xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
4981: int cur, len;
4982: int start = -1;
4983: int end = -1;
4984:
4985: if (CUR == '\0') {
4986: ERROR("Expecting ']'");
4987: return;
4988: }
4989:
4990: cur = CUR;
4991: if (cur == '\\') {
4992: NEXT;
4993: cur = CUR;
4994: switch (cur) {
4995: case 'n': start = 0xA; break;
4996: case 'r': start = 0xD; break;
4997: case 't': start = 0x9; break;
4998: case '\\': case '|': case '.': case '-': case '^': case '?':
4999: case '*': case '+': case '{': case '}': case '(': case ')':
5000: case '[': case ']':
5001: start = cur; break;
5002: default:
5003: ERROR("Invalid escape value");
5004: return;
5005: }
5006: end = start;
5007: len = 1;
5008: } else if ((cur != 0x5B) && (cur != 0x5D)) {
5009: end = start = CUR_SCHAR(ctxt->cur, len);
5010: } else {
5011: ERROR("Expecting a char range");
5012: return;
5013: }
5014: /*
5015: * Since we are "inside" a range, we can assume ctxt->cur is past
5016: * the start of ctxt->string, and PREV should be safe
5017: */
5018: if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) {
5019: NEXTL(len);
5020: return;
5021: }
5022: NEXTL(len);
5023: cur = CUR;
5024: if ((cur != '-') || (NXT(1) == ']')) {
5025: xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5026: XML_REGEXP_CHARVAL, start, end, NULL);
5027: return;
5028: }
5029: NEXT;
5030: cur = CUR;
5031: if (cur == '\\') {
5032: NEXT;
5033: cur = CUR;
5034: switch (cur) {
5035: case 'n': end = 0xA; break;
5036: case 'r': end = 0xD; break;
5037: case 't': end = 0x9; break;
5038: case '\\': case '|': case '.': case '-': case '^': case '?':
5039: case '*': case '+': case '{': case '}': case '(': case ')':
5040: case '[': case ']':
5041: end = cur; break;
5042: default:
5043: ERROR("Invalid escape value");
5044: return;
5045: }
5046: len = 1;
5047: } else if ((cur != 0x5B) && (cur != 0x5D)) {
5048: end = CUR_SCHAR(ctxt->cur, len);
5049: } else {
5050: ERROR("Expecting the end of a char range");
5051: return;
5052: }
5053: NEXTL(len);
5054: /* TODO check that the values are acceptable character ranges for XML */
5055: if (end < start) {
5056: ERROR("End of range is before start of range");
5057: } else {
5058: xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5059: XML_REGEXP_CHARVAL, start, end, NULL);
5060: }
5061: return;
5062: }
5063:
5064: /**
5065: * xmlFAParsePosCharGroup:
5066: * @ctxt: a regexp parser context
5067: *
5068: * [14] posCharGroup ::= ( charRange | charClassEsc )+
5069: */
5070: static void
5071: xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
5072: do {
5073: if (CUR == '\\') {
5074: xmlFAParseCharClassEsc(ctxt);
5075: } else {
5076: xmlFAParseCharRange(ctxt);
5077: }
5078: } while ((CUR != ']') && (CUR != '^') && (CUR != '-') &&
5079: (CUR != 0) && (ctxt->error == 0));
5080: }
5081:
5082: /**
5083: * xmlFAParseCharGroup:
5084: * @ctxt: a regexp parser context
5085: *
5086: * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
5087: * [15] negCharGroup ::= '^' posCharGroup
5088: * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
5089: * [12] charClassExpr ::= '[' charGroup ']'
5090: */
5091: static void
5092: xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
5093: int n = ctxt->neg;
5094: while ((CUR != ']') && (ctxt->error == 0)) {
5095: if (CUR == '^') {
5096: int neg = ctxt->neg;
5097:
5098: NEXT;
5099: ctxt->neg = !ctxt->neg;
5100: xmlFAParsePosCharGroup(ctxt);
5101: ctxt->neg = neg;
5102: } else if ((CUR == '-') && (NXT(1) == '[')) {
5103: int neg = ctxt->neg;
5104: ctxt->neg = 2;
5105: NEXT; /* eat the '-' */
5106: NEXT; /* eat the '[' */
5107: xmlFAParseCharGroup(ctxt);
5108: if (CUR == ']') {
5109: NEXT;
5110: } else {
5111: ERROR("charClassExpr: ']' expected");
5112: break;
5113: }
5114: ctxt->neg = neg;
5115: break;
5116: } else if (CUR != ']') {
5117: xmlFAParsePosCharGroup(ctxt);
5118: }
5119: }
5120: ctxt->neg = n;
5121: }
5122:
5123: /**
5124: * xmlFAParseCharClass:
5125: * @ctxt: a regexp parser context
5126: *
5127: * [11] charClass ::= charClassEsc | charClassExpr
5128: * [12] charClassExpr ::= '[' charGroup ']'
5129: */
5130: static void
5131: xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
5132: if (CUR == '[') {
5133: NEXT;
5134: ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
5135: if (ctxt->atom == NULL)
5136: return;
5137: xmlFAParseCharGroup(ctxt);
5138: if (CUR == ']') {
5139: NEXT;
5140: } else {
5141: ERROR("xmlFAParseCharClass: ']' expected");
5142: }
5143: } else {
5144: xmlFAParseCharClassEsc(ctxt);
5145: }
5146: }
5147:
5148: /**
5149: * xmlFAParseQuantExact:
5150: * @ctxt: a regexp parser context
5151: *
5152: * [8] QuantExact ::= [0-9]+
5153: *
5154: * Returns 0 if success or -1 in case of error
5155: */
5156: static int
5157: xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
5158: int ret = 0;
5159: int ok = 0;
5160:
5161: while ((CUR >= '0') && (CUR <= '9')) {
5162: ret = ret * 10 + (CUR - '0');
5163: ok = 1;
5164: NEXT;
5165: }
5166: if (ok != 1) {
5167: return(-1);
5168: }
5169: return(ret);
5170: }
5171:
5172: /**
5173: * xmlFAParseQuantifier:
5174: * @ctxt: a regexp parser context
5175: *
5176: * [4] quantifier ::= [?*+] | ( '{' quantity '}' )
5177: * [5] quantity ::= quantRange | quantMin | QuantExact
5178: * [6] quantRange ::= QuantExact ',' QuantExact
5179: * [7] quantMin ::= QuantExact ','
5180: * [8] QuantExact ::= [0-9]+
5181: */
5182: static int
5183: xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
5184: int cur;
5185:
5186: cur = CUR;
5187: if ((cur == '?') || (cur == '*') || (cur == '+')) {
5188: if (ctxt->atom != NULL) {
5189: if (cur == '?')
5190: ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
5191: else if (cur == '*')
5192: ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
5193: else if (cur == '+')
5194: ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
5195: }
5196: NEXT;
5197: return(1);
5198: }
5199: if (cur == '{') {
5200: int min = 0, max = 0;
5201:
5202: NEXT;
5203: cur = xmlFAParseQuantExact(ctxt);
5204: if (cur >= 0)
5205: min = cur;
5206: if (CUR == ',') {
5207: NEXT;
5208: if (CUR == '}')
5209: max = INT_MAX;
5210: else {
5211: cur = xmlFAParseQuantExact(ctxt);
5212: if (cur >= 0)
5213: max = cur;
5214: else {
5215: ERROR("Improper quantifier");
5216: }
5217: }
5218: }
5219: if (CUR == '}') {
5220: NEXT;
5221: } else {
5222: ERROR("Unterminated quantifier");
5223: }
5224: if (max == 0)
5225: max = min;
5226: if (ctxt->atom != NULL) {
5227: ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
5228: ctxt->atom->min = min;
5229: ctxt->atom->max = max;
5230: }
5231: return(1);
5232: }
5233: return(0);
5234: }
5235:
5236: /**
5237: * xmlFAParseAtom:
5238: * @ctxt: a regexp parser context
5239: *
5240: * [9] atom ::= Char | charClass | ( '(' regExp ')' )
5241: */
5242: static int
5243: xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
5244: int codepoint, len;
5245:
5246: codepoint = xmlFAIsChar(ctxt);
5247: if (codepoint > 0) {
5248: ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
5249: if (ctxt->atom == NULL)
5250: return(-1);
5251: codepoint = CUR_SCHAR(ctxt->cur, len);
5252: ctxt->atom->codepoint = codepoint;
5253: NEXTL(len);
5254: return(1);
5255: } else if (CUR == '|') {
5256: return(0);
5257: } else if (CUR == 0) {
5258: return(0);
5259: } else if (CUR == ')') {
5260: return(0);
5261: } else if (CUR == '(') {
5262: xmlRegStatePtr start, oldend, start0;
5263:
5264: NEXT;
5265: /*
5266: * this extra Epsilon transition is needed if we count with 0 allowed
5267: * unfortunately this can't be known at that point
5268: */
5269: xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5270: start0 = ctxt->state;
5271: xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5272: start = ctxt->state;
5273: oldend = ctxt->end;
5274: ctxt->end = NULL;
5275: ctxt->atom = NULL;
5276: xmlFAParseRegExp(ctxt, 0);
5277: if (CUR == ')') {
5278: NEXT;
5279: } else {
5280: ERROR("xmlFAParseAtom: expecting ')'");
5281: }
5282: ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
5283: if (ctxt->atom == NULL)
5284: return(-1);
5285: ctxt->atom->start = start;
5286: ctxt->atom->start0 = start0;
5287: ctxt->atom->stop = ctxt->state;
5288: ctxt->end = oldend;
5289: return(1);
5290: } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
5291: xmlFAParseCharClass(ctxt);
5292: return(1);
5293: }
5294: return(0);
5295: }
5296:
5297: /**
5298: * xmlFAParsePiece:
5299: * @ctxt: a regexp parser context
5300: *
5301: * [3] piece ::= atom quantifier?
5302: */
5303: static int
5304: xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
5305: int ret;
5306:
5307: ctxt->atom = NULL;
5308: ret = xmlFAParseAtom(ctxt);
5309: if (ret == 0)
5310: return(0);
5311: if (ctxt->atom == NULL) {
5312: ERROR("internal: no atom generated");
5313: }
5314: xmlFAParseQuantifier(ctxt);
5315: return(1);
5316: }
5317:
5318: /**
5319: * xmlFAParseBranch:
5320: * @ctxt: a regexp parser context
5321: * @to: optional target to the end of the branch
5322: *
5323: * @to is used to optimize by removing duplicate path in automata
5324: * in expressions like (a|b)(c|d)
5325: *
5326: * [2] branch ::= piece*
5327: */
5328: static int
5329: xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) {
5330: xmlRegStatePtr previous;
5331: int ret;
5332:
5333: previous = ctxt->state;
5334: ret = xmlFAParsePiece(ctxt);
5335: if (ret != 0) {
5336: if (xmlFAGenerateTransitions(ctxt, previous,
5337: (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0)
5338: return(-1);
5339: previous = ctxt->state;
5340: ctxt->atom = NULL;
5341: }
5342: while ((ret != 0) && (ctxt->error == 0)) {
5343: ret = xmlFAParsePiece(ctxt);
5344: if (ret != 0) {
5345: if (xmlFAGenerateTransitions(ctxt, previous,
5346: (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0)
5347: return(-1);
5348: previous = ctxt->state;
5349: ctxt->atom = NULL;
5350: }
5351: }
5352: return(0);
5353: }
5354:
5355: /**
5356: * xmlFAParseRegExp:
5357: * @ctxt: a regexp parser context
5358: * @top: is this the top-level expression ?
5359: *
5360: * [1] regExp ::= branch ( '|' branch )*
5361: */
5362: static void
5363: xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
5364: xmlRegStatePtr start, end;
5365:
5366: /* if not top start should have been generated by an epsilon trans */
5367: start = ctxt->state;
5368: ctxt->end = NULL;
5369: xmlFAParseBranch(ctxt, NULL);
5370: if (top) {
5371: #ifdef DEBUG_REGEXP_GRAPH
5372: printf("State %d is final\n", ctxt->state->no);
5373: #endif
5374: ctxt->state->type = XML_REGEXP_FINAL_STATE;
5375: }
5376: if (CUR != '|') {
5377: ctxt->end = ctxt->state;
5378: return;
5379: }
5380: end = ctxt->state;
5381: while ((CUR == '|') && (ctxt->error == 0)) {
5382: NEXT;
5383: if (CUR == 0) {
5384: ERROR("expecting a branch after |")
5385: return;
5386: }
5387: ctxt->state = start;
5388: ctxt->end = NULL;
5389: xmlFAParseBranch(ctxt, end);
5390: }
5391: if (!top) {
5392: ctxt->state = end;
5393: ctxt->end = end;
5394: }
5395: }
5396:
5397: /************************************************************************
5398: * *
5399: * The basic API *
5400: * *
5401: ************************************************************************/
5402:
5403: /**
5404: * xmlRegexpPrint:
5405: * @output: the file for the output debug
5406: * @regexp: the compiled regexp
5407: *
5408: * Print the content of the compiled regular expression
5409: */
5410: void
5411: xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
5412: int i;
5413:
5414: if (output == NULL)
5415: return;
5416: fprintf(output, " regexp: ");
5417: if (regexp == NULL) {
5418: fprintf(output, "NULL\n");
5419: return;
5420: }
5421: fprintf(output, "'%s' ", regexp->string);
5422: fprintf(output, "\n");
5423: fprintf(output, "%d atoms:\n", regexp->nbAtoms);
5424: for (i = 0;i < regexp->nbAtoms; i++) {
5425: fprintf(output, " %02d ", i);
5426: xmlRegPrintAtom(output, regexp->atoms[i]);
5427: }
5428: fprintf(output, "%d states:", regexp->nbStates);
5429: fprintf(output, "\n");
5430: for (i = 0;i < regexp->nbStates; i++) {
5431: xmlRegPrintState(output, regexp->states[i]);
5432: }
5433: fprintf(output, "%d counters:\n", regexp->nbCounters);
5434: for (i = 0;i < regexp->nbCounters; i++) {
5435: fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
5436: regexp->counters[i].max);
5437: }
5438: }
5439:
5440: /**
5441: * xmlRegexpCompile:
5442: * @regexp: a regular expression string
5443: *
5444: * Parses a regular expression conforming to XML Schemas Part 2 Datatype
5445: * Appendix F and builds an automata suitable for testing strings against
5446: * that regular expression
5447: *
5448: * Returns the compiled expression or NULL in case of error
5449: */
5450: xmlRegexpPtr
5451: xmlRegexpCompile(const xmlChar *regexp) {
5452: xmlRegexpPtr ret;
5453: xmlRegParserCtxtPtr ctxt;
5454:
5455: ctxt = xmlRegNewParserCtxt(regexp);
5456: if (ctxt == NULL)
5457: return(NULL);
5458:
5459: /* initialize the parser */
5460: ctxt->end = NULL;
5461: ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5462: xmlRegStatePush(ctxt, ctxt->start);
5463:
5464: /* parse the expression building an automata */
5465: xmlFAParseRegExp(ctxt, 1);
5466: if (CUR != 0) {
5467: ERROR("xmlFAParseRegExp: extra characters");
5468: }
5469: if (ctxt->error != 0) {
5470: xmlRegFreeParserCtxt(ctxt);
5471: return(NULL);
5472: }
5473: ctxt->end = ctxt->state;
5474: ctxt->start->type = XML_REGEXP_START_STATE;
5475: ctxt->end->type = XML_REGEXP_FINAL_STATE;
5476:
5477: /* remove the Epsilon except for counted transitions */
5478: xmlFAEliminateEpsilonTransitions(ctxt);
5479:
5480:
5481: if (ctxt->error != 0) {
5482: xmlRegFreeParserCtxt(ctxt);
5483: return(NULL);
5484: }
5485: ret = xmlRegEpxFromParse(ctxt);
5486: xmlRegFreeParserCtxt(ctxt);
5487: return(ret);
5488: }
5489:
5490: /**
5491: * xmlRegexpExec:
5492: * @comp: the compiled regular expression
5493: * @content: the value to check against the regular expression
5494: *
5495: * Check if the regular expression generates the value
5496: *
5497: * Returns 1 if it matches, 0 if not and a negative value in case of error
5498: */
5499: int
5500: xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
5501: if ((comp == NULL) || (content == NULL))
5502: return(-1);
5503: return(xmlFARegExec(comp, content));
5504: }
5505:
5506: /**
5507: * xmlRegexpIsDeterminist:
5508: * @comp: the compiled regular expression
5509: *
5510: * Check if the regular expression is determinist
5511: *
5512: * Returns 1 if it yes, 0 if not and a negative value in case of error
5513: */
5514: int
5515: xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
5516: xmlAutomataPtr am;
5517: int ret;
5518:
5519: if (comp == NULL)
5520: return(-1);
5521: if (comp->determinist != -1)
5522: return(comp->determinist);
5523:
5524: am = xmlNewAutomata();
5525: if (am->states != NULL) {
5526: int i;
5527:
5528: for (i = 0;i < am->nbStates;i++)
5529: xmlRegFreeState(am->states[i]);
5530: xmlFree(am->states);
5531: }
5532: am->nbAtoms = comp->nbAtoms;
5533: am->atoms = comp->atoms;
5534: am->nbStates = comp->nbStates;
5535: am->states = comp->states;
5536: am->determinist = -1;
5537: am->flags = comp->flags;
5538: ret = xmlFAComputesDeterminism(am);
5539: am->atoms = NULL;
5540: am->states = NULL;
5541: xmlFreeAutomata(am);
5542: comp->determinist = ret;
5543: return(ret);
5544: }
5545:
5546: /**
5547: * xmlRegFreeRegexp:
5548: * @regexp: the regexp
5549: *
5550: * Free a regexp
5551: */
5552: void
5553: xmlRegFreeRegexp(xmlRegexpPtr regexp) {
5554: int i;
5555: if (regexp == NULL)
5556: return;
5557:
5558: if (regexp->string != NULL)
5559: xmlFree(regexp->string);
5560: if (regexp->states != NULL) {
5561: for (i = 0;i < regexp->nbStates;i++)
5562: xmlRegFreeState(regexp->states[i]);
5563: xmlFree(regexp->states);
5564: }
5565: if (regexp->atoms != NULL) {
5566: for (i = 0;i < regexp->nbAtoms;i++)
5567: xmlRegFreeAtom(regexp->atoms[i]);
5568: xmlFree(regexp->atoms);
5569: }
5570: if (regexp->counters != NULL)
5571: xmlFree(regexp->counters);
5572: if (regexp->compact != NULL)
5573: xmlFree(regexp->compact);
5574: if (regexp->transdata != NULL)
5575: xmlFree(regexp->transdata);
5576: if (regexp->stringMap != NULL) {
5577: for (i = 0; i < regexp->nbstrings;i++)
5578: xmlFree(regexp->stringMap[i]);
5579: xmlFree(regexp->stringMap);
5580: }
5581:
5582: xmlFree(regexp);
5583: }
5584:
5585: #ifdef LIBXML_AUTOMATA_ENABLED
5586: /************************************************************************
5587: * *
5588: * The Automata interface *
5589: * *
5590: ************************************************************************/
5591:
5592: /**
5593: * xmlNewAutomata:
5594: *
5595: * Create a new automata
5596: *
5597: * Returns the new object or NULL in case of failure
5598: */
5599: xmlAutomataPtr
5600: xmlNewAutomata(void) {
5601: xmlAutomataPtr ctxt;
5602:
5603: ctxt = xmlRegNewParserCtxt(NULL);
5604: if (ctxt == NULL)
5605: return(NULL);
5606:
5607: /* initialize the parser */
5608: ctxt->end = NULL;
5609: ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5610: if (ctxt->start == NULL) {
5611: xmlFreeAutomata(ctxt);
5612: return(NULL);
5613: }
5614: ctxt->start->type = XML_REGEXP_START_STATE;
5615: if (xmlRegStatePush(ctxt, ctxt->start) < 0) {
5616: xmlRegFreeState(ctxt->start);
5617: xmlFreeAutomata(ctxt);
5618: return(NULL);
5619: }
5620: ctxt->flags = 0;
5621:
5622: return(ctxt);
5623: }
5624:
5625: /**
5626: * xmlFreeAutomata:
5627: * @am: an automata
5628: *
5629: * Free an automata
5630: */
5631: void
5632: xmlFreeAutomata(xmlAutomataPtr am) {
5633: if (am == NULL)
5634: return;
5635: xmlRegFreeParserCtxt(am);
5636: }
5637:
5638: /**
5639: * xmlAutomataSetFlags:
5640: * @am: an automata
5641: * @flags: a set of internal flags
5642: *
5643: * Set some flags on the automata
5644: */
5645: void
5646: xmlAutomataSetFlags(xmlAutomataPtr am, int flags) {
5647: if (am == NULL)
5648: return;
5649: am->flags |= flags;
5650: }
5651:
5652: /**
5653: * xmlAutomataGetInitState:
5654: * @am: an automata
5655: *
5656: * Initial state lookup
5657: *
5658: * Returns the initial state of the automata
5659: */
5660: xmlAutomataStatePtr
5661: xmlAutomataGetInitState(xmlAutomataPtr am) {
5662: if (am == NULL)
5663: return(NULL);
5664: return(am->start);
5665: }
5666:
5667: /**
5668: * xmlAutomataSetFinalState:
5669: * @am: an automata
5670: * @state: a state in this automata
5671: *
5672: * Makes that state a final state
5673: *
5674: * Returns 0 or -1 in case of error
5675: */
5676: int
5677: xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
5678: if ((am == NULL) || (state == NULL))
5679: return(-1);
5680: state->type = XML_REGEXP_FINAL_STATE;
5681: return(0);
5682: }
5683:
5684: /**
5685: * xmlAutomataNewTransition:
5686: * @am: an automata
5687: * @from: the starting point of the transition
5688: * @to: the target point of the transition or NULL
5689: * @token: the input string associated to that transition
5690: * @data: data passed to the callback function if the transition is activated
5691: *
5692: * If @to is NULL, this creates first a new target state in the automata
5693: * and then adds a transition from the @from state to the target state
5694: * activated by the value of @token
5695: *
5696: * Returns the target state or NULL in case of error
5697: */
5698: xmlAutomataStatePtr
5699: xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
5700: xmlAutomataStatePtr to, const xmlChar *token,
5701: void *data) {
5702: xmlRegAtomPtr atom;
5703:
5704: if ((am == NULL) || (from == NULL) || (token == NULL))
5705: return(NULL);
5706: atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5707: if (atom == NULL)
5708: return(NULL);
5709: atom->data = data;
5710: if (atom == NULL)
5711: return(NULL);
5712: atom->valuep = xmlStrdup(token);
5713:
5714: if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5715: xmlRegFreeAtom(atom);
5716: return(NULL);
5717: }
5718: if (to == NULL)
5719: return(am->state);
5720: return(to);
5721: }
5722:
5723: /**
5724: * xmlAutomataNewTransition2:
5725: * @am: an automata
5726: * @from: the starting point of the transition
5727: * @to: the target point of the transition or NULL
5728: * @token: the first input string associated to that transition
5729: * @token2: the second input string associated to that transition
5730: * @data: data passed to the callback function if the transition is activated
5731: *
5732: * If @to is NULL, this creates first a new target state in the automata
5733: * and then adds a transition from the @from state to the target state
5734: * activated by the value of @token
5735: *
5736: * Returns the target state or NULL in case of error
5737: */
5738: xmlAutomataStatePtr
5739: xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5740: xmlAutomataStatePtr to, const xmlChar *token,
5741: const xmlChar *token2, void *data) {
5742: xmlRegAtomPtr atom;
5743:
5744: if ((am == NULL) || (from == NULL) || (token == NULL))
5745: return(NULL);
5746: atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5747: if (atom == NULL)
5748: return(NULL);
5749: atom->data = data;
5750: if ((token2 == NULL) || (*token2 == 0)) {
5751: atom->valuep = xmlStrdup(token);
5752: } else {
5753: int lenn, lenp;
5754: xmlChar *str;
5755:
5756: lenn = strlen((char *) token2);
5757: lenp = strlen((char *) token);
5758:
5759: str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5760: if (str == NULL) {
5761: xmlRegFreeAtom(atom);
5762: return(NULL);
5763: }
5764: memcpy(&str[0], token, lenp);
5765: str[lenp] = '|';
5766: memcpy(&str[lenp + 1], token2, lenn);
5767: str[lenn + lenp + 1] = 0;
5768:
5769: atom->valuep = str;
5770: }
5771:
5772: if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5773: xmlRegFreeAtom(atom);
5774: return(NULL);
5775: }
5776: if (to == NULL)
5777: return(am->state);
5778: return(to);
5779: }
5780:
5781: /**
5782: * xmlAutomataNewNegTrans:
5783: * @am: an automata
5784: * @from: the starting point of the transition
5785: * @to: the target point of the transition or NULL
5786: * @token: the first input string associated to that transition
5787: * @token2: the second input string associated to that transition
5788: * @data: data passed to the callback function if the transition is activated
5789: *
5790: * If @to is NULL, this creates first a new target state in the automata
5791: * and then adds a transition from the @from state to the target state
5792: * activated by any value except (@token,@token2)
5793: * Note that if @token2 is not NULL, then (X, NULL) won't match to follow
5794: # the semantic of XSD ##other
5795: *
5796: * Returns the target state or NULL in case of error
5797: */
5798: xmlAutomataStatePtr
5799: xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5800: xmlAutomataStatePtr to, const xmlChar *token,
5801: const xmlChar *token2, void *data) {
5802: xmlRegAtomPtr atom;
5803: xmlChar err_msg[200];
5804:
5805: if ((am == NULL) || (from == NULL) || (token == NULL))
5806: return(NULL);
5807: atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5808: if (atom == NULL)
5809: return(NULL);
5810: atom->data = data;
5811: atom->neg = 1;
5812: if ((token2 == NULL) || (*token2 == 0)) {
5813: atom->valuep = xmlStrdup(token);
5814: } else {
5815: int lenn, lenp;
5816: xmlChar *str;
5817:
5818: lenn = strlen((char *) token2);
5819: lenp = strlen((char *) token);
5820:
5821: str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5822: if (str == NULL) {
5823: xmlRegFreeAtom(atom);
5824: return(NULL);
5825: }
5826: memcpy(&str[0], token, lenp);
5827: str[lenp] = '|';
5828: memcpy(&str[lenp + 1], token2, lenn);
5829: str[lenn + lenp + 1] = 0;
5830:
5831: atom->valuep = str;
5832: }
5833: snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep);
5834: err_msg[199] = 0;
5835: atom->valuep2 = xmlStrdup(err_msg);
5836:
5837: if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5838: xmlRegFreeAtom(atom);
5839: return(NULL);
5840: }
5841: am->negs++;
5842: if (to == NULL)
5843: return(am->state);
5844: return(to);
5845: }
5846:
5847: /**
5848: * xmlAutomataNewCountTrans2:
5849: * @am: an automata
5850: * @from: the starting point of the transition
5851: * @to: the target point of the transition or NULL
5852: * @token: the input string associated to that transition
5853: * @token2: the second input string associated to that transition
5854: * @min: the minimum successive occurences of token
5855: * @max: the maximum successive occurences of token
5856: * @data: data associated to the transition
5857: *
5858: * If @to is NULL, this creates first a new target state in the automata
5859: * and then adds a transition from the @from state to the target state
5860: * activated by a succession of input of value @token and @token2 and
5861: * whose number is between @min and @max
5862: *
5863: * Returns the target state or NULL in case of error
5864: */
5865: xmlAutomataStatePtr
5866: xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5867: xmlAutomataStatePtr to, const xmlChar *token,
5868: const xmlChar *token2,
5869: int min, int max, void *data) {
5870: xmlRegAtomPtr atom;
5871: int counter;
5872:
5873: if ((am == NULL) || (from == NULL) || (token == NULL))
5874: return(NULL);
5875: if (min < 0)
5876: return(NULL);
5877: if ((max < min) || (max < 1))
5878: return(NULL);
5879: atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5880: if (atom == NULL)
5881: return(NULL);
5882: if ((token2 == NULL) || (*token2 == 0)) {
5883: atom->valuep = xmlStrdup(token);
5884: } else {
5885: int lenn, lenp;
5886: xmlChar *str;
5887:
5888: lenn = strlen((char *) token2);
5889: lenp = strlen((char *) token);
5890:
5891: str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5892: if (str == NULL) {
5893: xmlRegFreeAtom(atom);
5894: return(NULL);
5895: }
5896: memcpy(&str[0], token, lenp);
5897: str[lenp] = '|';
5898: memcpy(&str[lenp + 1], token2, lenn);
5899: str[lenn + lenp + 1] = 0;
5900:
5901: atom->valuep = str;
5902: }
5903: atom->data = data;
5904: if (min == 0)
5905: atom->min = 1;
5906: else
5907: atom->min = min;
5908: atom->max = max;
5909:
5910: /*
5911: * associate a counter to the transition.
5912: */
5913: counter = xmlRegGetCounter(am);
5914: am->counters[counter].min = min;
5915: am->counters[counter].max = max;
5916:
5917: /* xmlFAGenerateTransitions(am, from, to, atom); */
5918: if (to == NULL) {
5919: to = xmlRegNewState(am);
5920: xmlRegStatePush(am, to);
5921: }
5922: xmlRegStateAddTrans(am, from, atom, to, counter, -1);
5923: xmlRegAtomPush(am, atom);
5924: am->state = to;
5925:
5926: if (to == NULL)
5927: to = am->state;
5928: if (to == NULL)
5929: return(NULL);
5930: if (min == 0)
5931: xmlFAGenerateEpsilonTransition(am, from, to);
5932: return(to);
5933: }
5934:
5935: /**
5936: * xmlAutomataNewCountTrans:
5937: * @am: an automata
5938: * @from: the starting point of the transition
5939: * @to: the target point of the transition or NULL
5940: * @token: the input string associated to that transition
5941: * @min: the minimum successive occurences of token
5942: * @max: the maximum successive occurences of token
5943: * @data: data associated to the transition
5944: *
5945: * If @to is NULL, this creates first a new target state in the automata
5946: * and then adds a transition from the @from state to the target state
5947: * activated by a succession of input of value @token and whose number
5948: * is between @min and @max
5949: *
5950: * Returns the target state or NULL in case of error
5951: */
5952: xmlAutomataStatePtr
5953: xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5954: xmlAutomataStatePtr to, const xmlChar *token,
5955: int min, int max, void *data) {
5956: xmlRegAtomPtr atom;
5957: int counter;
5958:
5959: if ((am == NULL) || (from == NULL) || (token == NULL))
5960: return(NULL);
5961: if (min < 0)
5962: return(NULL);
5963: if ((max < min) || (max < 1))
5964: return(NULL);
5965: atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5966: if (atom == NULL)
5967: return(NULL);
5968: atom->valuep = xmlStrdup(token);
5969: atom->data = data;
5970: if (min == 0)
5971: atom->min = 1;
5972: else
5973: atom->min = min;
5974: atom->max = max;
5975:
5976: /*
5977: * associate a counter to the transition.
5978: */
5979: counter = xmlRegGetCounter(am);
5980: am->counters[counter].min = min;
5981: am->counters[counter].max = max;
5982:
5983: /* xmlFAGenerateTransitions(am, from, to, atom); */
5984: if (to == NULL) {
5985: to = xmlRegNewState(am);
5986: xmlRegStatePush(am, to);
5987: }
5988: xmlRegStateAddTrans(am, from, atom, to, counter, -1);
5989: xmlRegAtomPush(am, atom);
5990: am->state = to;
5991:
5992: if (to == NULL)
5993: to = am->state;
5994: if (to == NULL)
5995: return(NULL);
5996: if (min == 0)
5997: xmlFAGenerateEpsilonTransition(am, from, to);
5998: return(to);
5999: }
6000:
6001: /**
6002: * xmlAutomataNewOnceTrans2:
6003: * @am: an automata
6004: * @from: the starting point of the transition
6005: * @to: the target point of the transition or NULL
6006: * @token: the input string associated to that transition
6007: * @token2: the second input string associated to that transition
6008: * @min: the minimum successive occurences of token
6009: * @max: the maximum successive occurences of token
6010: * @data: data associated to the transition
6011: *
6012: * If @to is NULL, this creates first a new target state in the automata
6013: * and then adds a transition from the @from state to the target state
6014: * activated by a succession of input of value @token and @token2 and whose
6015: * number is between @min and @max, moreover that transition can only be
6016: * crossed once.
6017: *
6018: * Returns the target state or NULL in case of error
6019: */
6020: xmlAutomataStatePtr
6021: xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
6022: xmlAutomataStatePtr to, const xmlChar *token,
6023: const xmlChar *token2,
6024: int min, int max, void *data) {
6025: xmlRegAtomPtr atom;
6026: int counter;
6027:
6028: if ((am == NULL) || (from == NULL) || (token == NULL))
6029: return(NULL);
6030: if (min < 1)
6031: return(NULL);
6032: if ((max < min) || (max < 1))
6033: return(NULL);
6034: atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6035: if (atom == NULL)
6036: return(NULL);
6037: if ((token2 == NULL) || (*token2 == 0)) {
6038: atom->valuep = xmlStrdup(token);
6039: } else {
6040: int lenn, lenp;
6041: xmlChar *str;
6042:
6043: lenn = strlen((char *) token2);
6044: lenp = strlen((char *) token);
6045:
6046: str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
6047: if (str == NULL) {
6048: xmlRegFreeAtom(atom);
6049: return(NULL);
6050: }
6051: memcpy(&str[0], token, lenp);
6052: str[lenp] = '|';
6053: memcpy(&str[lenp + 1], token2, lenn);
6054: str[lenn + lenp + 1] = 0;
6055:
6056: atom->valuep = str;
6057: }
6058: atom->data = data;
6059: atom->quant = XML_REGEXP_QUANT_ONCEONLY;
6060: atom->min = min;
6061: atom->max = max;
6062: /*
6063: * associate a counter to the transition.
6064: */
6065: counter = xmlRegGetCounter(am);
6066: am->counters[counter].min = 1;
6067: am->counters[counter].max = 1;
6068:
6069: /* xmlFAGenerateTransitions(am, from, to, atom); */
6070: if (to == NULL) {
6071: to = xmlRegNewState(am);
6072: xmlRegStatePush(am, to);
6073: }
6074: xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6075: xmlRegAtomPush(am, atom);
6076: am->state = to;
6077: return(to);
6078: }
6079:
6080:
6081:
6082: /**
6083: * xmlAutomataNewOnceTrans:
6084: * @am: an automata
6085: * @from: the starting point of the transition
6086: * @to: the target point of the transition or NULL
6087: * @token: the input string associated to that transition
6088: * @min: the minimum successive occurences of token
6089: * @max: the maximum successive occurences of token
6090: * @data: data associated to the transition
6091: *
6092: * If @to is NULL, this creates first a new target state in the automata
6093: * and then adds a transition from the @from state to the target state
6094: * activated by a succession of input of value @token and whose number
6095: * is between @min and @max, moreover that transition can only be crossed
6096: * once.
6097: *
6098: * Returns the target state or NULL in case of error
6099: */
6100: xmlAutomataStatePtr
6101: xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6102: xmlAutomataStatePtr to, const xmlChar *token,
6103: int min, int max, void *data) {
6104: xmlRegAtomPtr atom;
6105: int counter;
6106:
6107: if ((am == NULL) || (from == NULL) || (token == NULL))
6108: return(NULL);
6109: if (min < 1)
6110: return(NULL);
6111: if ((max < min) || (max < 1))
6112: return(NULL);
6113: atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6114: if (atom == NULL)
6115: return(NULL);
6116: atom->valuep = xmlStrdup(token);
6117: atom->data = data;
6118: atom->quant = XML_REGEXP_QUANT_ONCEONLY;
6119: atom->min = min;
6120: atom->max = max;
6121: /*
6122: * associate a counter to the transition.
6123: */
6124: counter = xmlRegGetCounter(am);
6125: am->counters[counter].min = 1;
6126: am->counters[counter].max = 1;
6127:
6128: /* xmlFAGenerateTransitions(am, from, to, atom); */
6129: if (to == NULL) {
6130: to = xmlRegNewState(am);
6131: xmlRegStatePush(am, to);
6132: }
6133: xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6134: xmlRegAtomPush(am, atom);
6135: am->state = to;
6136: return(to);
6137: }
6138:
6139: /**
6140: * xmlAutomataNewState:
6141: * @am: an automata
6142: *
6143: * Create a new disconnected state in the automata
6144: *
6145: * Returns the new state or NULL in case of error
6146: */
6147: xmlAutomataStatePtr
6148: xmlAutomataNewState(xmlAutomataPtr am) {
6149: xmlAutomataStatePtr to;
6150:
6151: if (am == NULL)
6152: return(NULL);
6153: to = xmlRegNewState(am);
6154: xmlRegStatePush(am, to);
6155: return(to);
6156: }
6157:
6158: /**
6159: * xmlAutomataNewEpsilon:
6160: * @am: an automata
6161: * @from: the starting point of the transition
6162: * @to: the target point of the transition or NULL
6163: *
6164: * If @to is NULL, this creates first a new target state in the automata
6165: * and then adds an epsilon transition from the @from state to the
6166: * target state
6167: *
6168: * Returns the target state or NULL in case of error
6169: */
6170: xmlAutomataStatePtr
6171: xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
6172: xmlAutomataStatePtr to) {
6173: if ((am == NULL) || (from == NULL))
6174: return(NULL);
6175: xmlFAGenerateEpsilonTransition(am, from, to);
6176: if (to == NULL)
6177: return(am->state);
6178: return(to);
6179: }
6180:
6181: /**
6182: * xmlAutomataNewAllTrans:
6183: * @am: an automata
6184: * @from: the starting point of the transition
6185: * @to: the target point of the transition or NULL
6186: * @lax: allow to transition if not all all transitions have been activated
6187: *
6188: * If @to is NULL, this creates first a new target state in the automata
6189: * and then adds a an ALL transition from the @from state to the
6190: * target state. That transition is an epsilon transition allowed only when
6191: * all transitions from the @from node have been activated.
6192: *
6193: * Returns the target state or NULL in case of error
6194: */
6195: xmlAutomataStatePtr
6196: xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6197: xmlAutomataStatePtr to, int lax) {
6198: if ((am == NULL) || (from == NULL))
6199: return(NULL);
6200: xmlFAGenerateAllTransition(am, from, to, lax);
6201: if (to == NULL)
6202: return(am->state);
6203: return(to);
6204: }
6205:
6206: /**
6207: * xmlAutomataNewCounter:
6208: * @am: an automata
6209: * @min: the minimal value on the counter
6210: * @max: the maximal value on the counter
6211: *
6212: * Create a new counter
6213: *
6214: * Returns the counter number or -1 in case of error
6215: */
6216: int
6217: xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
6218: int ret;
6219:
6220: if (am == NULL)
6221: return(-1);
6222:
6223: ret = xmlRegGetCounter(am);
6224: if (ret < 0)
6225: return(-1);
6226: am->counters[ret].min = min;
6227: am->counters[ret].max = max;
6228: return(ret);
6229: }
6230:
6231: /**
6232: * xmlAutomataNewCountedTrans:
6233: * @am: an automata
6234: * @from: the starting point of the transition
6235: * @to: the target point of the transition or NULL
6236: * @counter: the counter associated to that transition
6237: *
6238: * If @to is NULL, this creates first a new target state in the automata
6239: * and then adds an epsilon transition from the @from state to the target state
6240: * which will increment the counter provided
6241: *
6242: * Returns the target state or NULL in case of error
6243: */
6244: xmlAutomataStatePtr
6245: xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6246: xmlAutomataStatePtr to, int counter) {
6247: if ((am == NULL) || (from == NULL) || (counter < 0))
6248: return(NULL);
6249: xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
6250: if (to == NULL)
6251: return(am->state);
6252: return(to);
6253: }
6254:
6255: /**
6256: * xmlAutomataNewCounterTrans:
6257: * @am: an automata
6258: * @from: the starting point of the transition
6259: * @to: the target point of the transition or NULL
6260: * @counter: the counter associated to that transition
6261: *
6262: * If @to is NULL, this creates first a new target state in the automata
6263: * and then adds an epsilon transition from the @from state to the target state
6264: * which will be allowed only if the counter is within the right range.
6265: *
6266: * Returns the target state or NULL in case of error
6267: */
6268: xmlAutomataStatePtr
6269: xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6270: xmlAutomataStatePtr to, int counter) {
6271: if ((am == NULL) || (from == NULL) || (counter < 0))
6272: return(NULL);
6273: xmlFAGenerateCountedTransition(am, from, to, counter);
6274: if (to == NULL)
6275: return(am->state);
6276: return(to);
6277: }
6278:
6279: /**
6280: * xmlAutomataCompile:
6281: * @am: an automata
6282: *
6283: * Compile the automata into a Reg Exp ready for being executed.
6284: * The automata should be free after this point.
6285: *
6286: * Returns the compiled regexp or NULL in case of error
6287: */
6288: xmlRegexpPtr
6289: xmlAutomataCompile(xmlAutomataPtr am) {
6290: xmlRegexpPtr ret;
6291:
6292: if ((am == NULL) || (am->error != 0)) return(NULL);
6293: xmlFAEliminateEpsilonTransitions(am);
6294: /* xmlFAComputesDeterminism(am); */
6295: ret = xmlRegEpxFromParse(am);
6296:
6297: return(ret);
6298: }
6299:
6300: /**
6301: * xmlAutomataIsDeterminist:
6302: * @am: an automata
6303: *
6304: * Checks if an automata is determinist.
6305: *
6306: * Returns 1 if true, 0 if not, and -1 in case of error
6307: */
6308: int
6309: xmlAutomataIsDeterminist(xmlAutomataPtr am) {
6310: int ret;
6311:
6312: if (am == NULL)
6313: return(-1);
6314:
6315: ret = xmlFAComputesDeterminism(am);
6316: return(ret);
6317: }
6318: #endif /* LIBXML_AUTOMATA_ENABLED */
6319:
6320: #ifdef LIBXML_EXPR_ENABLED
6321: /************************************************************************
6322: * *
6323: * Formal Expression handling code *
6324: * *
6325: ************************************************************************/
6326: /************************************************************************
6327: * *
6328: * Expression handling context *
6329: * *
6330: ************************************************************************/
6331:
6332: struct _xmlExpCtxt {
6333: xmlDictPtr dict;
6334: xmlExpNodePtr *table;
6335: int size;
6336: int nbElems;
6337: int nb_nodes;
6338: int maxNodes;
6339: const char *expr;
6340: const char *cur;
6341: int nb_cons;
6342: int tabSize;
6343: };
6344:
6345: /**
6346: * xmlExpNewCtxt:
6347: * @maxNodes: the maximum number of nodes
6348: * @dict: optional dictionnary to use internally
6349: *
6350: * Creates a new context for manipulating expressions
6351: *
6352: * Returns the context or NULL in case of error
6353: */
6354: xmlExpCtxtPtr
6355: xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) {
6356: xmlExpCtxtPtr ret;
6357: int size = 256;
6358:
6359: if (maxNodes <= 4096)
6360: maxNodes = 4096;
6361:
6362: ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt));
6363: if (ret == NULL)
6364: return(NULL);
6365: memset(ret, 0, sizeof(xmlExpCtxt));
6366: ret->size = size;
6367: ret->nbElems = 0;
6368: ret->maxNodes = maxNodes;
6369: ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr));
6370: if (ret->table == NULL) {
6371: xmlFree(ret);
6372: return(NULL);
6373: }
6374: memset(ret->table, 0, size * sizeof(xmlExpNodePtr));
6375: if (dict == NULL) {
6376: ret->dict = xmlDictCreate();
6377: if (ret->dict == NULL) {
6378: xmlFree(ret->table);
6379: xmlFree(ret);
6380: return(NULL);
6381: }
6382: } else {
6383: ret->dict = dict;
6384: xmlDictReference(ret->dict);
6385: }
6386: return(ret);
6387: }
6388:
6389: /**
6390: * xmlExpFreeCtxt:
6391: * @ctxt: an expression context
6392: *
6393: * Free an expression context
6394: */
6395: void
6396: xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) {
6397: if (ctxt == NULL)
6398: return;
6399: xmlDictFree(ctxt->dict);
6400: if (ctxt->table != NULL)
6401: xmlFree(ctxt->table);
6402: xmlFree(ctxt);
6403: }
6404:
6405: /************************************************************************
6406: * *
6407: * Structure associated to an expression node *
6408: * *
6409: ************************************************************************/
6410: #define MAX_NODES 10000
6411:
6412: /* #define DEBUG_DERIV */
6413:
6414: /*
6415: * TODO:
6416: * - Wildcards
6417: * - public API for creation
6418: *
6419: * Started
6420: * - regression testing
6421: *
6422: * Done
6423: * - split into module and test tool
6424: * - memleaks
6425: */
6426:
6427: typedef enum {
6428: XML_EXP_NILABLE = (1 << 0)
6429: } xmlExpNodeInfo;
6430:
6431: #define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE)
6432:
6433: struct _xmlExpNode {
6434: unsigned char type;/* xmlExpNodeType */
6435: unsigned char info;/* OR of xmlExpNodeInfo */
6436: unsigned short key; /* the hash key */
6437: unsigned int ref; /* The number of references */
6438: int c_max; /* the maximum length it can consume */
6439: xmlExpNodePtr exp_left;
6440: xmlExpNodePtr next;/* the next node in the hash table or free list */
6441: union {
6442: struct {
6443: int f_min;
6444: int f_max;
6445: } count;
6446: struct {
6447: xmlExpNodePtr f_right;
6448: } children;
6449: const xmlChar *f_str;
6450: } field;
6451: };
6452:
6453: #define exp_min field.count.f_min
6454: #define exp_max field.count.f_max
6455: /* #define exp_left field.children.f_left */
6456: #define exp_right field.children.f_right
6457: #define exp_str field.f_str
6458:
6459: static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type);
6460: static xmlExpNode forbiddenExpNode = {
6461: XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6462: };
6463: xmlExpNodePtr forbiddenExp = &forbiddenExpNode;
6464: static xmlExpNode emptyExpNode = {
6465: XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6466: };
6467: xmlExpNodePtr emptyExp = &emptyExpNode;
6468:
6469: /************************************************************************
6470: * *
6471: * The custom hash table for unicity and canonicalization *
6472: * of sub-expressions pointers *
6473: * *
6474: ************************************************************************/
6475: /*
6476: * xmlExpHashNameComputeKey:
6477: * Calculate the hash key for a token
6478: */
6479: static unsigned short
6480: xmlExpHashNameComputeKey(const xmlChar *name) {
6481: unsigned short value = 0L;
6482: char ch;
6483:
6484: if (name != NULL) {
6485: value += 30 * (*name);
6486: while ((ch = *name++) != 0) {
6487: value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch);
6488: }
6489: }
6490: return (value);
6491: }
6492:
6493: /*
6494: * xmlExpHashComputeKey:
6495: * Calculate the hash key for a compound expression
6496: */
6497: static unsigned short
6498: xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left,
6499: xmlExpNodePtr right) {
6500: unsigned long value;
6501: unsigned short ret;
6502:
6503: switch (type) {
6504: case XML_EXP_SEQ:
6505: value = left->key;
6506: value += right->key;
6507: value *= 3;
6508: ret = (unsigned short) value;
6509: break;
6510: case XML_EXP_OR:
6511: value = left->key;
6512: value += right->key;
6513: value *= 7;
6514: ret = (unsigned short) value;
6515: break;
6516: case XML_EXP_COUNT:
6517: value = left->key;
6518: value += right->key;
6519: ret = (unsigned short) value;
6520: break;
6521: default:
6522: ret = 0;
6523: }
6524: return(ret);
6525: }
6526:
6527:
6528: static xmlExpNodePtr
6529: xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) {
6530: xmlExpNodePtr ret;
6531:
6532: if (ctxt->nb_nodes >= MAX_NODES)
6533: return(NULL);
6534: ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode));
6535: if (ret == NULL)
6536: return(NULL);
6537: memset(ret, 0, sizeof(xmlExpNode));
6538: ret->type = type;
6539: ret->next = NULL;
6540: ctxt->nb_nodes++;
6541: ctxt->nb_cons++;
6542: return(ret);
6543: }
6544:
6545: /**
6546: * xmlExpHashGetEntry:
6547: * @table: the hash table
6548: *
6549: * Get the unique entry from the hash table. The entry is created if
6550: * needed. @left and @right are consumed, i.e. their ref count will
6551: * be decremented by the operation.
6552: *
6553: * Returns the pointer or NULL in case of error
6554: */
6555: static xmlExpNodePtr
6556: xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type,
6557: xmlExpNodePtr left, xmlExpNodePtr right,
6558: const xmlChar *name, int min, int max) {
6559: unsigned short kbase, key;
6560: xmlExpNodePtr entry;
6561: xmlExpNodePtr insert;
6562:
6563: if (ctxt == NULL)
6564: return(NULL);
6565:
6566: /*
6567: * Check for duplicate and insertion location.
6568: */
6569: if (type == XML_EXP_ATOM) {
6570: kbase = xmlExpHashNameComputeKey(name);
6571: } else if (type == XML_EXP_COUNT) {
6572: /* COUNT reduction rule 1 */
6573: /* a{1} -> a */
6574: if (min == max) {
6575: if (min == 1) {
6576: return(left);
6577: }
6578: if (min == 0) {
6579: xmlExpFree(ctxt, left);
6580: return(emptyExp);
6581: }
6582: }
6583: if (min < 0) {
6584: xmlExpFree(ctxt, left);
6585: return(forbiddenExp);
6586: }
6587: if (max == -1)
6588: kbase = min + 79;
6589: else
6590: kbase = max - min;
6591: kbase += left->key;
6592: } else if (type == XML_EXP_OR) {
6593: /* Forbid reduction rules */
6594: if (left->type == XML_EXP_FORBID) {
6595: xmlExpFree(ctxt, left);
6596: return(right);
6597: }
6598: if (right->type == XML_EXP_FORBID) {
6599: xmlExpFree(ctxt, right);
6600: return(left);
6601: }
6602:
6603: /* OR reduction rule 1 */
6604: /* a | a reduced to a */
6605: if (left == right) {
6606: left->ref--;
6607: return(left);
6608: }
6609: /* OR canonicalization rule 1 */
6610: /* linearize (a | b) | c into a | (b | c) */
6611: if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) {
6612: xmlExpNodePtr tmp = left;
6613: left = right;
6614: right = tmp;
6615: }
6616: /* OR reduction rule 2 */
6617: /* a | (a | b) and b | (a | b) are reduced to a | b */
6618: if (right->type == XML_EXP_OR) {
6619: if ((left == right->exp_left) ||
6620: (left == right->exp_right)) {
6621: xmlExpFree(ctxt, left);
6622: return(right);
6623: }
6624: }
6625: /* OR canonicalization rule 2 */
6626: /* linearize (a | b) | c into a | (b | c) */
6627: if (left->type == XML_EXP_OR) {
6628: xmlExpNodePtr tmp;
6629:
6630: /* OR canonicalization rule 2 */
6631: if ((left->exp_right->type != XML_EXP_OR) &&
6632: (left->exp_right->key < left->exp_left->key)) {
6633: tmp = left->exp_right;
6634: left->exp_right = left->exp_left;
6635: left->exp_left = tmp;
6636: }
6637: left->exp_right->ref++;
6638: tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right,
6639: NULL, 0, 0);
6640: left->exp_left->ref++;
6641: tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp,
6642: NULL, 0, 0);
6643:
6644: xmlExpFree(ctxt, left);
6645: return(tmp);
6646: }
6647: if (right->type == XML_EXP_OR) {
6648: /* Ordering in the tree */
6649: /* C | (A | B) -> A | (B | C) */
6650: if (left->key > right->exp_right->key) {
6651: xmlExpNodePtr tmp;
6652: right->exp_right->ref++;
6653: tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right,
6654: left, NULL, 0, 0);
6655: right->exp_left->ref++;
6656: tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6657: tmp, NULL, 0, 0);
6658: xmlExpFree(ctxt, right);
6659: return(tmp);
6660: }
6661: /* Ordering in the tree */
6662: /* B | (A | C) -> A | (B | C) */
6663: if (left->key > right->exp_left->key) {
6664: xmlExpNodePtr tmp;
6665: right->exp_right->ref++;
6666: tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left,
6667: right->exp_right, NULL, 0, 0);
6668: right->exp_left->ref++;
6669: tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6670: tmp, NULL, 0, 0);
6671: xmlExpFree(ctxt, right);
6672: return(tmp);
6673: }
6674: }
6675: /* we know both types are != XML_EXP_OR here */
6676: else if (left->key > right->key) {
6677: xmlExpNodePtr tmp = left;
6678: left = right;
6679: right = tmp;
6680: }
6681: kbase = xmlExpHashComputeKey(type, left, right);
6682: } else if (type == XML_EXP_SEQ) {
6683: /* Forbid reduction rules */
6684: if (left->type == XML_EXP_FORBID) {
6685: xmlExpFree(ctxt, right);
6686: return(left);
6687: }
6688: if (right->type == XML_EXP_FORBID) {
6689: xmlExpFree(ctxt, left);
6690: return(right);
6691: }
6692: /* Empty reduction rules */
6693: if (right->type == XML_EXP_EMPTY) {
6694: return(left);
6695: }
6696: if (left->type == XML_EXP_EMPTY) {
6697: return(right);
6698: }
6699: kbase = xmlExpHashComputeKey(type, left, right);
6700: } else
6701: return(NULL);
6702:
6703: key = kbase % ctxt->size;
6704: if (ctxt->table[key] != NULL) {
6705: for (insert = ctxt->table[key]; insert != NULL;
6706: insert = insert->next) {
6707: if ((insert->key == kbase) &&
6708: (insert->type == type)) {
6709: if (type == XML_EXP_ATOM) {
6710: if (name == insert->exp_str) {
6711: insert->ref++;
6712: return(insert);
6713: }
6714: } else if (type == XML_EXP_COUNT) {
6715: if ((insert->exp_min == min) && (insert->exp_max == max) &&
6716: (insert->exp_left == left)) {
6717: insert->ref++;
6718: left->ref--;
6719: return(insert);
6720: }
6721: } else if ((insert->exp_left == left) &&
6722: (insert->exp_right == right)) {
6723: insert->ref++;
6724: left->ref--;
6725: right->ref--;
6726: return(insert);
6727: }
6728: }
6729: }
6730: }
6731:
6732: entry = xmlExpNewNode(ctxt, type);
6733: if (entry == NULL)
6734: return(NULL);
6735: entry->key = kbase;
6736: if (type == XML_EXP_ATOM) {
6737: entry->exp_str = name;
6738: entry->c_max = 1;
6739: } else if (type == XML_EXP_COUNT) {
6740: entry->exp_min = min;
6741: entry->exp_max = max;
6742: entry->exp_left = left;
6743: if ((min == 0) || (IS_NILLABLE(left)))
6744: entry->info |= XML_EXP_NILABLE;
6745: if (max < 0)
6746: entry->c_max = -1;
6747: else
6748: entry->c_max = max * entry->exp_left->c_max;
6749: } else {
6750: entry->exp_left = left;
6751: entry->exp_right = right;
6752: if (type == XML_EXP_OR) {
6753: if ((IS_NILLABLE(left)) || (IS_NILLABLE(right)))
6754: entry->info |= XML_EXP_NILABLE;
6755: if ((entry->exp_left->c_max == -1) ||
6756: (entry->exp_right->c_max == -1))
6757: entry->c_max = -1;
6758: else if (entry->exp_left->c_max > entry->exp_right->c_max)
6759: entry->c_max = entry->exp_left->c_max;
6760: else
6761: entry->c_max = entry->exp_right->c_max;
6762: } else {
6763: if ((IS_NILLABLE(left)) && (IS_NILLABLE(right)))
6764: entry->info |= XML_EXP_NILABLE;
6765: if ((entry->exp_left->c_max == -1) ||
6766: (entry->exp_right->c_max == -1))
6767: entry->c_max = -1;
6768: else
6769: entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max;
6770: }
6771: }
6772: entry->ref = 1;
6773: if (ctxt->table[key] != NULL)
6774: entry->next = ctxt->table[key];
6775:
6776: ctxt->table[key] = entry;
6777: ctxt->nbElems++;
6778:
6779: return(entry);
6780: }
6781:
6782: /**
6783: * xmlExpFree:
6784: * @ctxt: the expression context
6785: * @exp: the expression
6786: *
6787: * Dereference the expression
6788: */
6789: void
6790: xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) {
6791: if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp))
6792: return;
6793: exp->ref--;
6794: if (exp->ref == 0) {
6795: unsigned short key;
6796:
6797: /* Unlink it first from the hash table */
6798: key = exp->key % ctxt->size;
6799: if (ctxt->table[key] == exp) {
6800: ctxt->table[key] = exp->next;
6801: } else {
6802: xmlExpNodePtr tmp;
6803:
6804: tmp = ctxt->table[key];
6805: while (tmp != NULL) {
6806: if (tmp->next == exp) {
6807: tmp->next = exp->next;
6808: break;
6809: }
6810: tmp = tmp->next;
6811: }
6812: }
6813:
6814: if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) {
6815: xmlExpFree(ctxt, exp->exp_left);
6816: xmlExpFree(ctxt, exp->exp_right);
6817: } else if (exp->type == XML_EXP_COUNT) {
6818: xmlExpFree(ctxt, exp->exp_left);
6819: }
6820: xmlFree(exp);
6821: ctxt->nb_nodes--;
6822: }
6823: }
6824:
6825: /**
6826: * xmlExpRef:
6827: * @exp: the expression
6828: *
6829: * Increase the reference count of the expression
6830: */
6831: void
6832: xmlExpRef(xmlExpNodePtr exp) {
6833: if (exp != NULL)
6834: exp->ref++;
6835: }
6836:
6837: /**
6838: * xmlExpNewAtom:
6839: * @ctxt: the expression context
6840: * @name: the atom name
6841: * @len: the atom name length in byte (or -1);
6842: *
6843: * Get the atom associated to this name from that context
6844: *
6845: * Returns the node or NULL in case of error
6846: */
6847: xmlExpNodePtr
6848: xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) {
6849: if ((ctxt == NULL) || (name == NULL))
6850: return(NULL);
6851: name = xmlDictLookup(ctxt->dict, name, len);
6852: if (name == NULL)
6853: return(NULL);
6854: return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0));
6855: }
6856:
6857: /**
6858: * xmlExpNewOr:
6859: * @ctxt: the expression context
6860: * @left: left expression
6861: * @right: right expression
6862: *
6863: * Get the atom associated to the choice @left | @right
6864: * Note that @left and @right are consumed in the operation, to keep
6865: * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6866: * this is true even in case of failure (unless ctxt == NULL).
6867: *
6868: * Returns the node or NULL in case of error
6869: */
6870: xmlExpNodePtr
6871: xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
6872: if (ctxt == NULL)
6873: return(NULL);
6874: if ((left == NULL) || (right == NULL)) {
6875: xmlExpFree(ctxt, left);
6876: xmlExpFree(ctxt, right);
6877: return(NULL);
6878: }
6879: return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0));
6880: }
6881:
6882: /**
6883: * xmlExpNewSeq:
6884: * @ctxt: the expression context
6885: * @left: left expression
6886: * @right: right expression
6887: *
6888: * Get the atom associated to the sequence @left , @right
6889: * Note that @left and @right are consumed in the operation, to keep
6890: * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6891: * this is true even in case of failure (unless ctxt == NULL).
6892: *
6893: * Returns the node or NULL in case of error
6894: */
6895: xmlExpNodePtr
6896: xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
6897: if (ctxt == NULL)
6898: return(NULL);
6899: if ((left == NULL) || (right == NULL)) {
6900: xmlExpFree(ctxt, left);
6901: xmlExpFree(ctxt, right);
6902: return(NULL);
6903: }
6904: return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0));
6905: }
6906:
6907: /**
6908: * xmlExpNewRange:
6909: * @ctxt: the expression context
6910: * @subset: the expression to be repeated
6911: * @min: the lower bound for the repetition
6912: * @max: the upper bound for the repetition, -1 means infinite
6913: *
6914: * Get the atom associated to the range (@subset){@min, @max}
6915: * Note that @subset is consumed in the operation, to keep
6916: * an handle on it use xmlExpRef() and use xmlExpFree() to release it,
6917: * this is true even in case of failure (unless ctxt == NULL).
6918: *
6919: * Returns the node or NULL in case of error
6920: */
6921: xmlExpNodePtr
6922: xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) {
6923: if (ctxt == NULL)
6924: return(NULL);
6925: if ((subset == NULL) || (min < 0) || (max < -1) ||
6926: ((max >= 0) && (min > max))) {
6927: xmlExpFree(ctxt, subset);
6928: return(NULL);
6929: }
6930: return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset,
6931: NULL, NULL, min, max));
6932: }
6933:
6934: /************************************************************************
6935: * *
6936: * Public API for operations on expressions *
6937: * *
6938: ************************************************************************/
6939:
6940: static int
6941: xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6942: const xmlChar**list, int len, int nb) {
6943: int tmp, tmp2;
6944: tail:
6945: switch (exp->type) {
6946: case XML_EXP_EMPTY:
6947: return(0);
6948: case XML_EXP_ATOM:
6949: for (tmp = 0;tmp < nb;tmp++)
6950: if (list[tmp] == exp->exp_str)
6951: return(0);
6952: if (nb >= len)
6953: return(-2);
6954: list[nb] = exp->exp_str;
6955: return(1);
6956: case XML_EXP_COUNT:
6957: exp = exp->exp_left;
6958: goto tail;
6959: case XML_EXP_SEQ:
6960: case XML_EXP_OR:
6961: tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb);
6962: if (tmp < 0)
6963: return(tmp);
6964: tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len,
6965: nb + tmp);
6966: if (tmp2 < 0)
6967: return(tmp2);
6968: return(tmp + tmp2);
6969: }
6970: return(-1);
6971: }
6972:
6973: /**
6974: * xmlExpGetLanguage:
6975: * @ctxt: the expression context
6976: * @exp: the expression
6977: * @langList: where to store the tokens
6978: * @len: the allocated length of @list
6979: *
6980: * Find all the strings used in @exp and store them in @list
6981: *
6982: * Returns the number of unique strings found, -1 in case of errors and
6983: * -2 if there is more than @len strings
6984: */
6985: int
6986: xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6987: const xmlChar**langList, int len) {
6988: if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0))
6989: return(-1);
6990: return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0));
6991: }
6992:
6993: static int
6994: xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6995: const xmlChar**list, int len, int nb) {
6996: int tmp, tmp2;
6997: tail:
6998: switch (exp->type) {
6999: case XML_EXP_FORBID:
7000: return(0);
7001: case XML_EXP_EMPTY:
7002: return(0);
7003: case XML_EXP_ATOM:
7004: for (tmp = 0;tmp < nb;tmp++)
7005: if (list[tmp] == exp->exp_str)
7006: return(0);
7007: if (nb >= len)
7008: return(-2);
7009: list[nb] = exp->exp_str;
7010: return(1);
7011: case XML_EXP_COUNT:
7012: exp = exp->exp_left;
7013: goto tail;
7014: case XML_EXP_SEQ:
7015: tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7016: if (tmp < 0)
7017: return(tmp);
7018: if (IS_NILLABLE(exp->exp_left)) {
7019: tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7020: nb + tmp);
7021: if (tmp2 < 0)
7022: return(tmp2);
7023: tmp += tmp2;
7024: }
7025: return(tmp);
7026: case XML_EXP_OR:
7027: tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7028: if (tmp < 0)
7029: return(tmp);
7030: tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7031: nb + tmp);
7032: if (tmp2 < 0)
7033: return(tmp2);
7034: return(tmp + tmp2);
7035: }
7036: return(-1);
7037: }
7038:
7039: /**
7040: * xmlExpGetStart:
7041: * @ctxt: the expression context
7042: * @exp: the expression
7043: * @tokList: where to store the tokens
7044: * @len: the allocated length of @list
7045: *
7046: * Find all the strings that appears at the start of the languages
7047: * accepted by @exp and store them in @list. E.g. for (a, b) | c
7048: * it will return the list [a, c]
7049: *
7050: * Returns the number of unique strings found, -1 in case of errors and
7051: * -2 if there is more than @len strings
7052: */
7053: int
7054: xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7055: const xmlChar**tokList, int len) {
7056: if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0))
7057: return(-1);
7058: return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0));
7059: }
7060:
7061: /**
7062: * xmlExpIsNillable:
7063: * @exp: the expression
7064: *
7065: * Finds if the expression is nillable, i.e. if it accepts the empty sequqnce
7066: *
7067: * Returns 1 if nillable, 0 if not and -1 in case of error
7068: */
7069: int
7070: xmlExpIsNillable(xmlExpNodePtr exp) {
7071: if (exp == NULL)
7072: return(-1);
7073: return(IS_NILLABLE(exp) != 0);
7074: }
7075:
7076: static xmlExpNodePtr
7077: xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str)
7078: {
7079: xmlExpNodePtr ret;
7080:
7081: switch (exp->type) {
7082: case XML_EXP_EMPTY:
7083: return(forbiddenExp);
7084: case XML_EXP_FORBID:
7085: return(forbiddenExp);
7086: case XML_EXP_ATOM:
7087: if (exp->exp_str == str) {
7088: #ifdef DEBUG_DERIV
7089: printf("deriv atom: equal => Empty\n");
7090: #endif
7091: ret = emptyExp;
7092: } else {
7093: #ifdef DEBUG_DERIV
7094: printf("deriv atom: mismatch => forbid\n");
7095: #endif
7096: /* TODO wildcards here */
7097: ret = forbiddenExp;
7098: }
7099: return(ret);
7100: case XML_EXP_OR: {
7101: xmlExpNodePtr tmp;
7102:
7103: #ifdef DEBUG_DERIV
7104: printf("deriv or: => or(derivs)\n");
7105: #endif
7106: tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7107: if (tmp == NULL) {
7108: return(NULL);
7109: }
7110: ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7111: if (ret == NULL) {
7112: xmlExpFree(ctxt, tmp);
7113: return(NULL);
7114: }
7115: ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret,
7116: NULL, 0, 0);
7117: return(ret);
7118: }
7119: case XML_EXP_SEQ:
7120: #ifdef DEBUG_DERIV
7121: printf("deriv seq: starting with left\n");
7122: #endif
7123: ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7124: if (ret == NULL) {
7125: return(NULL);
7126: } else if (ret == forbiddenExp) {
7127: if (IS_NILLABLE(exp->exp_left)) {
7128: #ifdef DEBUG_DERIV
7129: printf("deriv seq: left failed but nillable\n");
7130: #endif
7131: ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7132: }
7133: } else {
7134: #ifdef DEBUG_DERIV
7135: printf("deriv seq: left match => sequence\n");
7136: #endif
7137: exp->exp_right->ref++;
7138: ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right,
7139: NULL, 0, 0);
7140: }
7141: return(ret);
7142: case XML_EXP_COUNT: {
7143: int min, max;
7144: xmlExpNodePtr tmp;
7145:
7146: if (exp->exp_max == 0)
7147: return(forbiddenExp);
7148: ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7149: if (ret == NULL)
7150: return(NULL);
7151: if (ret == forbiddenExp) {
7152: #ifdef DEBUG_DERIV
7153: printf("deriv count: pattern mismatch => forbid\n");
7154: #endif
7155: return(ret);
7156: }
7157: if (exp->exp_max == 1)
7158: return(ret);
7159: if (exp->exp_max < 0) /* unbounded */
7160: max = -1;
7161: else
7162: max = exp->exp_max - 1;
7163: if (exp->exp_min > 0)
7164: min = exp->exp_min - 1;
7165: else
7166: min = 0;
7167: exp->exp_left->ref++;
7168: tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL,
7169: NULL, min, max);
7170: if (ret == emptyExp) {
7171: #ifdef DEBUG_DERIV
7172: printf("deriv count: match to empty => new count\n");
7173: #endif
7174: return(tmp);
7175: }
7176: #ifdef DEBUG_DERIV
7177: printf("deriv count: match => sequence with new count\n");
7178: #endif
7179: return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp,
7180: NULL, 0, 0));
7181: }
7182: }
7183: return(NULL);
7184: }
7185:
7186: /**
7187: * xmlExpStringDerive:
7188: * @ctxt: the expression context
7189: * @exp: the expression
7190: * @str: the string
7191: * @len: the string len in bytes if available
7192: *
7193: * Do one step of Brzozowski derivation of the expression @exp with
7194: * respect to the input string
7195: *
7196: * Returns the resulting expression or NULL in case of internal error
7197: */
7198: xmlExpNodePtr
7199: xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7200: const xmlChar *str, int len) {
7201: const xmlChar *input;
7202:
7203: if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) {
7204: return(NULL);
7205: }
7206: /*
7207: * check the string is in the dictionnary, if yes use an interned
7208: * copy, otherwise we know it's not an acceptable input
7209: */
7210: input = xmlDictExists(ctxt->dict, str, len);
7211: if (input == NULL) {
7212: return(forbiddenExp);
7213: }
7214: return(xmlExpStringDeriveInt(ctxt, exp, input));
7215: }
7216:
7217: static int
7218: xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) {
7219: int ret = 1;
7220:
7221: if (sub->c_max == -1) {
7222: if (exp->c_max != -1)
7223: ret = 0;
7224: } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) {
7225: ret = 0;
7226: }
7227: #if 0
7228: if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp)))
7229: ret = 0;
7230: #endif
7231: return(ret);
7232: }
7233:
7234: static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7235: xmlExpNodePtr sub);
7236: /**
7237: * xmlExpDivide:
7238: * @ctxt: the expressions context
7239: * @exp: the englobing expression
7240: * @sub: the subexpression
7241: * @mult: the multiple expression
7242: * @remain: the remain from the derivation of the multiple
7243: *
7244: * Check if exp is a multiple of sub, i.e. if there is a finite number n
7245: * so that sub{n} subsume exp
7246: *
7247: * Returns the multiple value if successful, 0 if it is not a multiple
7248: * and -1 in case of internel error.
7249: */
7250:
7251: static int
7252: xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub,
7253: xmlExpNodePtr *mult, xmlExpNodePtr *remain) {
7254: int i;
7255: xmlExpNodePtr tmp, tmp2;
7256:
7257: if (mult != NULL) *mult = NULL;
7258: if (remain != NULL) *remain = NULL;
7259: if (exp->c_max == -1) return(0);
7260: if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0);
7261:
7262: for (i = 1;i <= exp->c_max;i++) {
7263: sub->ref++;
7264: tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7265: sub, NULL, NULL, i, i);
7266: if (tmp == NULL) {
7267: return(-1);
7268: }
7269: if (!xmlExpCheckCard(tmp, exp)) {
7270: xmlExpFree(ctxt, tmp);
7271: continue;
7272: }
7273: tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp);
7274: if (tmp2 == NULL) {
7275: xmlExpFree(ctxt, tmp);
7276: return(-1);
7277: }
7278: if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) {
7279: if (remain != NULL)
7280: *remain = tmp2;
7281: else
7282: xmlExpFree(ctxt, tmp2);
7283: if (mult != NULL)
7284: *mult = tmp;
7285: else
7286: xmlExpFree(ctxt, tmp);
7287: #ifdef DEBUG_DERIV
7288: printf("Divide succeeded %d\n", i);
7289: #endif
7290: return(i);
7291: }
7292: xmlExpFree(ctxt, tmp);
7293: xmlExpFree(ctxt, tmp2);
7294: }
7295: #ifdef DEBUG_DERIV
7296: printf("Divide failed\n");
7297: #endif
7298: return(0);
7299: }
7300:
7301: /**
7302: * xmlExpExpDeriveInt:
7303: * @ctxt: the expressions context
7304: * @exp: the englobing expression
7305: * @sub: the subexpression
7306: *
7307: * Try to do a step of Brzozowski derivation but at a higher level
7308: * the input being a subexpression.
7309: *
7310: * Returns the resulting expression or NULL in case of internal error
7311: */
7312: static xmlExpNodePtr
7313: xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7314: xmlExpNodePtr ret, tmp, tmp2, tmp3;
7315: const xmlChar **tab;
7316: int len, i;
7317:
7318: /*
7319: * In case of equality and if the expression can only consume a finite
7320: * amount, then the derivation is empty
7321: */
7322: if ((exp == sub) && (exp->c_max >= 0)) {
7323: #ifdef DEBUG_DERIV
7324: printf("Equal(exp, sub) and finite -> Empty\n");
7325: #endif
7326: return(emptyExp);
7327: }
7328: /*
7329: * decompose sub sequence first
7330: */
7331: if (sub->type == XML_EXP_EMPTY) {
7332: #ifdef DEBUG_DERIV
7333: printf("Empty(sub) -> Empty\n");
7334: #endif
7335: exp->ref++;
7336: return(exp);
7337: }
7338: if (sub->type == XML_EXP_SEQ) {
7339: #ifdef DEBUG_DERIV
7340: printf("Seq(sub) -> decompose\n");
7341: #endif
7342: tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7343: if (tmp == NULL)
7344: return(NULL);
7345: if (tmp == forbiddenExp)
7346: return(tmp);
7347: ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right);
7348: xmlExpFree(ctxt, tmp);
7349: return(ret);
7350: }
7351: if (sub->type == XML_EXP_OR) {
7352: #ifdef DEBUG_DERIV
7353: printf("Or(sub) -> decompose\n");
7354: #endif
7355: tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7356: if (tmp == forbiddenExp)
7357: return(tmp);
7358: if (tmp == NULL)
7359: return(NULL);
7360: ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right);
7361: if ((ret == NULL) || (ret == forbiddenExp)) {
7362: xmlExpFree(ctxt, tmp);
7363: return(ret);
7364: }
7365: return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0));
7366: }
7367: if (!xmlExpCheckCard(exp, sub)) {
7368: #ifdef DEBUG_DERIV
7369: printf("CheckCard(exp, sub) failed -> Forbid\n");
7370: #endif
7371: return(forbiddenExp);
7372: }
7373: switch (exp->type) {
7374: case XML_EXP_EMPTY:
7375: if (sub == emptyExp)
7376: return(emptyExp);
7377: #ifdef DEBUG_DERIV
7378: printf("Empty(exp) -> Forbid\n");
7379: #endif
7380: return(forbiddenExp);
7381: case XML_EXP_FORBID:
7382: #ifdef DEBUG_DERIV
7383: printf("Forbid(exp) -> Forbid\n");
7384: #endif
7385: return(forbiddenExp);
7386: case XML_EXP_ATOM:
7387: if (sub->type == XML_EXP_ATOM) {
7388: /* TODO: handle wildcards */
7389: if (exp->exp_str == sub->exp_str) {
7390: #ifdef DEBUG_DERIV
7391: printf("Atom match -> Empty\n");
7392: #endif
7393: return(emptyExp);
7394: }
7395: #ifdef DEBUG_DERIV
7396: printf("Atom mismatch -> Forbid\n");
7397: #endif
7398: return(forbiddenExp);
7399: }
7400: if ((sub->type == XML_EXP_COUNT) &&
7401: (sub->exp_max == 1) &&
7402: (sub->exp_left->type == XML_EXP_ATOM)) {
7403: /* TODO: handle wildcards */
7404: if (exp->exp_str == sub->exp_left->exp_str) {
7405: #ifdef DEBUG_DERIV
7406: printf("Atom match -> Empty\n");
7407: #endif
7408: return(emptyExp);
7409: }
7410: #ifdef DEBUG_DERIV
7411: printf("Atom mismatch -> Forbid\n");
7412: #endif
7413: return(forbiddenExp);
7414: }
7415: #ifdef DEBUG_DERIV
7416: printf("Compex exp vs Atom -> Forbid\n");
7417: #endif
7418: return(forbiddenExp);
7419: case XML_EXP_SEQ:
7420: /* try to get the sequence consumed only if possible */
7421: if (xmlExpCheckCard(exp->exp_left, sub)) {
7422: /* See if the sequence can be consumed directly */
7423: #ifdef DEBUG_DERIV
7424: printf("Seq trying left only\n");
7425: #endif
7426: ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7427: if ((ret != forbiddenExp) && (ret != NULL)) {
7428: #ifdef DEBUG_DERIV
7429: printf("Seq trying left only worked\n");
7430: #endif
7431: /*
7432: * TODO: assumption here that we are determinist
7433: * i.e. we won't get to a nillable exp left
7434: * subset which could be matched by the right
7435: * part too.
7436: * e.g.: (a | b)+,(a | c) and 'a+,a'
7437: */
7438: exp->exp_right->ref++;
7439: return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7440: exp->exp_right, NULL, 0, 0));
7441: }
7442: #ifdef DEBUG_DERIV
7443: } else {
7444: printf("Seq: left too short\n");
7445: #endif
7446: }
7447: /* Try instead to decompose */
7448: if (sub->type == XML_EXP_COUNT) {
7449: int min, max;
7450:
7451: #ifdef DEBUG_DERIV
7452: printf("Seq: sub is a count\n");
7453: #endif
7454: ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7455: if (ret == NULL)
7456: return(NULL);
7457: if (ret != forbiddenExp) {
7458: #ifdef DEBUG_DERIV
7459: printf("Seq , Count match on left\n");
7460: #endif
7461: if (sub->exp_max < 0)
7462: max = -1;
7463: else
7464: max = sub->exp_max -1;
7465: if (sub->exp_min > 0)
7466: min = sub->exp_min -1;
7467: else
7468: min = 0;
7469: exp->exp_right->ref++;
7470: tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7471: exp->exp_right, NULL, 0, 0);
7472: if (tmp == NULL)
7473: return(NULL);
7474:
7475: sub->exp_left->ref++;
7476: tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7477: sub->exp_left, NULL, NULL, min, max);
7478: if (tmp2 == NULL) {
7479: xmlExpFree(ctxt, tmp);
7480: return(NULL);
7481: }
7482: ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7483: xmlExpFree(ctxt, tmp);
7484: xmlExpFree(ctxt, tmp2);
7485: return(ret);
7486: }
7487: }
7488: /* we made no progress on structured operations */
7489: break;
7490: case XML_EXP_OR:
7491: #ifdef DEBUG_DERIV
7492: printf("Or , trying both side\n");
7493: #endif
7494: ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7495: if (ret == NULL)
7496: return(NULL);
7497: tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub);
7498: if (tmp == NULL) {
7499: xmlExpFree(ctxt, ret);
7500: return(NULL);
7501: }
7502: return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0));
7503: case XML_EXP_COUNT: {
7504: int min, max;
7505:
7506: if (sub->type == XML_EXP_COUNT) {
7507: /*
7508: * Try to see if the loop is completely subsumed
7509: */
7510: tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7511: if (tmp == NULL)
7512: return(NULL);
7513: if (tmp == forbiddenExp) {
7514: int mult;
7515:
7516: #ifdef DEBUG_DERIV
7517: printf("Count, Count inner don't subsume\n");
7518: #endif
7519: mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left,
7520: NULL, &tmp);
7521: if (mult <= 0) {
7522: #ifdef DEBUG_DERIV
7523: printf("Count, Count not multiple => forbidden\n");
7524: #endif
7525: return(forbiddenExp);
7526: }
7527: if (sub->exp_max == -1) {
7528: max = -1;
7529: if (exp->exp_max == -1) {
7530: if (exp->exp_min <= sub->exp_min * mult)
7531: min = 0;
7532: else
7533: min = exp->exp_min - sub->exp_min * mult;
7534: } else {
7535: #ifdef DEBUG_DERIV
7536: printf("Count, Count finite can't subsume infinite\n");
7537: #endif
7538: xmlExpFree(ctxt, tmp);
7539: return(forbiddenExp);
7540: }
7541: } else {
7542: if (exp->exp_max == -1) {
7543: #ifdef DEBUG_DERIV
7544: printf("Infinite loop consume mult finite loop\n");
7545: #endif
7546: if (exp->exp_min > sub->exp_min * mult) {
7547: max = -1;
7548: min = exp->exp_min - sub->exp_min * mult;
7549: } else {
7550: max = -1;
7551: min = 0;
7552: }
7553: } else {
7554: if (exp->exp_max < sub->exp_max * mult) {
7555: #ifdef DEBUG_DERIV
7556: printf("loops max mult mismatch => forbidden\n");
7557: #endif
7558: xmlExpFree(ctxt, tmp);
7559: return(forbiddenExp);
7560: }
7561: if (sub->exp_max * mult > exp->exp_min)
7562: min = 0;
7563: else
7564: min = exp->exp_min - sub->exp_max * mult;
7565: max = exp->exp_max - sub->exp_max * mult;
7566: }
7567: }
7568: } else if (!IS_NILLABLE(tmp)) {
7569: /*
7570: * TODO: loop here to try to grow if working on finite
7571: * blocks.
7572: */
7573: #ifdef DEBUG_DERIV
7574: printf("Count, Count remain not nillable => forbidden\n");
7575: #endif
7576: xmlExpFree(ctxt, tmp);
7577: return(forbiddenExp);
7578: } else if (sub->exp_max == -1) {
7579: if (exp->exp_max == -1) {
7580: if (exp->exp_min <= sub->exp_min) {
7581: #ifdef DEBUG_DERIV
7582: printf("Infinite loops Okay => COUNT(0,Inf)\n");
7583: #endif
7584: max = -1;
7585: min = 0;
7586: } else {
7587: #ifdef DEBUG_DERIV
7588: printf("Infinite loops min => Count(X,Inf)\n");
7589: #endif
7590: max = -1;
7591: min = exp->exp_min - sub->exp_min;
7592: }
7593: } else if (exp->exp_min > sub->exp_min) {
7594: #ifdef DEBUG_DERIV
7595: printf("loops min mismatch 1 => forbidden ???\n");
7596: #endif
7597: xmlExpFree(ctxt, tmp);
7598: return(forbiddenExp);
7599: } else {
7600: max = -1;
7601: min = 0;
7602: }
7603: } else {
7604: if (exp->exp_max == -1) {
7605: #ifdef DEBUG_DERIV
7606: printf("Infinite loop consume finite loop\n");
7607: #endif
7608: if (exp->exp_min > sub->exp_min) {
7609: max = -1;
7610: min = exp->exp_min - sub->exp_min;
7611: } else {
7612: max = -1;
7613: min = 0;
7614: }
7615: } else {
7616: if (exp->exp_max < sub->exp_max) {
7617: #ifdef DEBUG_DERIV
7618: printf("loops max mismatch => forbidden\n");
7619: #endif
7620: xmlExpFree(ctxt, tmp);
7621: return(forbiddenExp);
7622: }
7623: if (sub->exp_max > exp->exp_min)
7624: min = 0;
7625: else
7626: min = exp->exp_min - sub->exp_max;
7627: max = exp->exp_max - sub->exp_max;
7628: }
7629: }
7630: #ifdef DEBUG_DERIV
7631: printf("loops match => SEQ(COUNT())\n");
7632: #endif
7633: exp->exp_left->ref++;
7634: tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7635: NULL, NULL, min, max);
7636: if (tmp2 == NULL) {
7637: return(NULL);
7638: }
7639: ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7640: NULL, 0, 0);
7641: return(ret);
7642: }
7643: tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7644: if (tmp == NULL)
7645: return(NULL);
7646: if (tmp == forbiddenExp) {
7647: #ifdef DEBUG_DERIV
7648: printf("loop mismatch => forbidden\n");
7649: #endif
7650: return(forbiddenExp);
7651: }
7652: if (exp->exp_min > 0)
7653: min = exp->exp_min - 1;
7654: else
7655: min = 0;
7656: if (exp->exp_max < 0)
7657: max = -1;
7658: else
7659: max = exp->exp_max - 1;
7660:
7661: #ifdef DEBUG_DERIV
7662: printf("loop match => SEQ(COUNT())\n");
7663: #endif
7664: exp->exp_left->ref++;
7665: tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7666: NULL, NULL, min, max);
7667: if (tmp2 == NULL)
7668: return(NULL);
7669: ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7670: NULL, 0, 0);
7671: return(ret);
7672: }
7673: }
7674:
7675: #ifdef DEBUG_DERIV
7676: printf("Fallback to derivative\n");
7677: #endif
7678: if (IS_NILLABLE(sub)) {
7679: if (!(IS_NILLABLE(exp)))
7680: return(forbiddenExp);
7681: else
7682: ret = emptyExp;
7683: } else
7684: ret = NULL;
7685: /*
7686: * here the structured derivation made no progress so
7687: * we use the default token based derivation to force one more step
7688: */
7689: if (ctxt->tabSize == 0)
7690: ctxt->tabSize = 40;
7691:
7692: tab = (const xmlChar **) xmlMalloc(ctxt->tabSize *
7693: sizeof(const xmlChar *));
7694: if (tab == NULL) {
7695: return(NULL);
7696: }
7697:
7698: /*
7699: * collect all the strings accepted by the subexpression on input
7700: */
7701: len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7702: while (len < 0) {
7703: const xmlChar **temp;
7704: temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 *
7705: sizeof(const xmlChar *));
7706: if (temp == NULL) {
7707: xmlFree((xmlChar **) tab);
7708: return(NULL);
7709: }
7710: tab = temp;
7711: ctxt->tabSize *= 2;
7712: len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7713: }
7714: for (i = 0;i < len;i++) {
7715: tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]);
7716: if ((tmp == NULL) || (tmp == forbiddenExp)) {
7717: xmlExpFree(ctxt, ret);
7718: xmlFree((xmlChar **) tab);
7719: return(tmp);
7720: }
7721: tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]);
7722: if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) {
7723: xmlExpFree(ctxt, tmp);
7724: xmlExpFree(ctxt, ret);
7725: xmlFree((xmlChar **) tab);
7726: return(tmp);
7727: }
7728: tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7729: xmlExpFree(ctxt, tmp);
7730: xmlExpFree(ctxt, tmp2);
7731:
7732: if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) {
7733: xmlExpFree(ctxt, ret);
7734: xmlFree((xmlChar **) tab);
7735: return(tmp3);
7736: }
7737:
7738: if (ret == NULL)
7739: ret = tmp3;
7740: else {
7741: ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0);
7742: if (ret == NULL) {
7743: xmlFree((xmlChar **) tab);
7744: return(NULL);
7745: }
7746: }
7747: }
7748: xmlFree((xmlChar **) tab);
7749: return(ret);
7750: }
7751:
7752: /**
7753: * xmlExpExpDerive:
7754: * @ctxt: the expressions context
7755: * @exp: the englobing expression
7756: * @sub: the subexpression
7757: *
7758: * Evaluates the expression resulting from @exp consuming a sub expression @sub
7759: * Based on algebraic derivation and sometimes direct Brzozowski derivation
7760: * it usually tatkes less than linear time and can handle expressions generating
7761: * infinite languages.
7762: *
7763: * Returns the resulting expression or NULL in case of internal error, the
7764: * result must be freed
7765: */
7766: xmlExpNodePtr
7767: xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7768: if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7769: return(NULL);
7770:
7771: /*
7772: * O(1) speedups
7773: */
7774: if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7775: #ifdef DEBUG_DERIV
7776: printf("Sub nillable and not exp : can't subsume\n");
7777: #endif
7778: return(forbiddenExp);
7779: }
7780: if (xmlExpCheckCard(exp, sub) == 0) {
7781: #ifdef DEBUG_DERIV
7782: printf("sub generate longuer sequances than exp : can't subsume\n");
7783: #endif
7784: return(forbiddenExp);
7785: }
7786: return(xmlExpExpDeriveInt(ctxt, exp, sub));
7787: }
7788:
7789: /**
7790: * xmlExpSubsume:
7791: * @ctxt: the expressions context
7792: * @exp: the englobing expression
7793: * @sub: the subexpression
7794: *
7795: * Check whether @exp accepts all the languages accexpted by @sub
7796: * the input being a subexpression.
7797: *
7798: * Returns 1 if true 0 if false and -1 in case of failure.
7799: */
7800: int
7801: xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7802: xmlExpNodePtr tmp;
7803:
7804: if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7805: return(-1);
7806:
7807: /*
7808: * TODO: speedup by checking the language of sub is a subset of the
7809: * language of exp
7810: */
7811: /*
7812: * O(1) speedups
7813: */
7814: if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7815: #ifdef DEBUG_DERIV
7816: printf("Sub nillable and not exp : can't subsume\n");
7817: #endif
7818: return(0);
7819: }
7820: if (xmlExpCheckCard(exp, sub) == 0) {
7821: #ifdef DEBUG_DERIV
7822: printf("sub generate longuer sequances than exp : can't subsume\n");
7823: #endif
7824: return(0);
7825: }
7826: tmp = xmlExpExpDeriveInt(ctxt, exp, sub);
7827: #ifdef DEBUG_DERIV
7828: printf("Result derivation :\n");
7829: PRINT_EXP(tmp);
7830: #endif
7831: if (tmp == NULL)
7832: return(-1);
7833: if (tmp == forbiddenExp)
7834: return(0);
7835: if (tmp == emptyExp)
7836: return(1);
7837: if ((tmp != NULL) && (IS_NILLABLE(tmp))) {
7838: xmlExpFree(ctxt, tmp);
7839: return(1);
7840: }
7841: xmlExpFree(ctxt, tmp);
7842: return(0);
7843: }
7844:
7845: /************************************************************************
7846: * *
7847: * Parsing expression *
7848: * *
7849: ************************************************************************/
7850:
7851: static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt);
7852:
7853: #undef CUR
7854: #define CUR (*ctxt->cur)
7855: #undef NEXT
7856: #define NEXT ctxt->cur++;
7857: #undef IS_BLANK
7858: #define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t'))
7859: #define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++;
7860:
7861: static int
7862: xmlExpParseNumber(xmlExpCtxtPtr ctxt) {
7863: int ret = 0;
7864:
7865: SKIP_BLANKS
7866: if (CUR == '*') {
7867: NEXT
7868: return(-1);
7869: }
7870: if ((CUR < '0') || (CUR > '9'))
7871: return(-1);
7872: while ((CUR >= '0') && (CUR <= '9')) {
7873: ret = ret * 10 + (CUR - '0');
7874: NEXT
7875: }
7876: return(ret);
7877: }
7878:
7879: static xmlExpNodePtr
7880: xmlExpParseOr(xmlExpCtxtPtr ctxt) {
7881: const char *base;
7882: xmlExpNodePtr ret;
7883: const xmlChar *val;
7884:
7885: SKIP_BLANKS
7886: base = ctxt->cur;
7887: if (*ctxt->cur == '(') {
7888: NEXT
7889: ret = xmlExpParseExpr(ctxt);
7890: SKIP_BLANKS
7891: if (*ctxt->cur != ')') {
7892: fprintf(stderr, "unbalanced '(' : %s\n", base);
7893: xmlExpFree(ctxt, ret);
7894: return(NULL);
7895: }
7896: NEXT;
7897: SKIP_BLANKS
7898: goto parse_quantifier;
7899: }
7900: while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') &&
7901: (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') &&
7902: (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}'))
7903: NEXT;
7904: val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base);
7905: if (val == NULL)
7906: return(NULL);
7907: ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0);
7908: if (ret == NULL)
7909: return(NULL);
7910: SKIP_BLANKS
7911: parse_quantifier:
7912: if (CUR == '{') {
7913: int min, max;
7914:
7915: NEXT
7916: min = xmlExpParseNumber(ctxt);
7917: if (min < 0) {
7918: xmlExpFree(ctxt, ret);
7919: return(NULL);
7920: }
7921: SKIP_BLANKS
7922: if (CUR == ',') {
7923: NEXT
7924: max = xmlExpParseNumber(ctxt);
7925: SKIP_BLANKS
7926: } else
7927: max = min;
7928: if (CUR != '}') {
7929: xmlExpFree(ctxt, ret);
7930: return(NULL);
7931: }
7932: NEXT
7933: ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7934: min, max);
7935: SKIP_BLANKS
7936: } else if (CUR == '?') {
7937: NEXT
7938: ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7939: 0, 1);
7940: SKIP_BLANKS
7941: } else if (CUR == '+') {
7942: NEXT
7943: ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7944: 1, -1);
7945: SKIP_BLANKS
7946: } else if (CUR == '*') {
7947: NEXT
7948: ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7949: 0, -1);
7950: SKIP_BLANKS
7951: }
7952: return(ret);
7953: }
7954:
7955:
7956: static xmlExpNodePtr
7957: xmlExpParseSeq(xmlExpCtxtPtr ctxt) {
7958: xmlExpNodePtr ret, right;
7959:
7960: ret = xmlExpParseOr(ctxt);
7961: SKIP_BLANKS
7962: while (CUR == '|') {
7963: NEXT
7964: right = xmlExpParseOr(ctxt);
7965: if (right == NULL) {
7966: xmlExpFree(ctxt, ret);
7967: return(NULL);
7968: }
7969: ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0);
7970: if (ret == NULL)
7971: return(NULL);
7972: }
7973: return(ret);
7974: }
7975:
7976: static xmlExpNodePtr
7977: xmlExpParseExpr(xmlExpCtxtPtr ctxt) {
7978: xmlExpNodePtr ret, right;
7979:
7980: ret = xmlExpParseSeq(ctxt);
7981: SKIP_BLANKS
7982: while (CUR == ',') {
7983: NEXT
7984: right = xmlExpParseSeq(ctxt);
7985: if (right == NULL) {
7986: xmlExpFree(ctxt, ret);
7987: return(NULL);
7988: }
7989: ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0);
7990: if (ret == NULL)
7991: return(NULL);
7992: }
7993: return(ret);
7994: }
7995:
7996: /**
7997: * xmlExpParse:
7998: * @ctxt: the expressions context
7999: * @expr: the 0 terminated string
8000: *
8001: * Minimal parser for regexps, it understand the following constructs
8002: * - string terminals
8003: * - choice operator |
8004: * - sequence operator ,
8005: * - subexpressions (...)
8006: * - usual cardinality operators + * and ?
8007: * - finite sequences { min, max }
8008: * - infinite sequences { min, * }
8009: * There is minimal checkings made especially no checking on strings values
8010: *
8011: * Returns a new expression or NULL in case of failure
8012: */
8013: xmlExpNodePtr
8014: xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) {
8015: xmlExpNodePtr ret;
8016:
8017: ctxt->expr = expr;
8018: ctxt->cur = expr;
8019:
8020: ret = xmlExpParseExpr(ctxt);
8021: SKIP_BLANKS
8022: if (*ctxt->cur != 0) {
8023: xmlExpFree(ctxt, ret);
8024: return(NULL);
8025: }
8026: return(ret);
8027: }
8028:
8029: static void
8030: xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) {
8031: xmlExpNodePtr c;
8032:
8033: if (expr == NULL) return;
8034: if (glob) xmlBufferWriteChar(buf, "(");
8035: switch (expr->type) {
8036: case XML_EXP_EMPTY:
8037: xmlBufferWriteChar(buf, "empty");
8038: break;
8039: case XML_EXP_FORBID:
8040: xmlBufferWriteChar(buf, "forbidden");
8041: break;
8042: case XML_EXP_ATOM:
8043: xmlBufferWriteCHAR(buf, expr->exp_str);
8044: break;
8045: case XML_EXP_SEQ:
8046: c = expr->exp_left;
8047: if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8048: xmlExpDumpInt(buf, c, 1);
8049: else
8050: xmlExpDumpInt(buf, c, 0);
8051: xmlBufferWriteChar(buf, " , ");
8052: c = expr->exp_right;
8053: if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8054: xmlExpDumpInt(buf, c, 1);
8055: else
8056: xmlExpDumpInt(buf, c, 0);
8057: break;
8058: case XML_EXP_OR:
8059: c = expr->exp_left;
8060: if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8061: xmlExpDumpInt(buf, c, 1);
8062: else
8063: xmlExpDumpInt(buf, c, 0);
8064: xmlBufferWriteChar(buf, " | ");
8065: c = expr->exp_right;
8066: if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8067: xmlExpDumpInt(buf, c, 1);
8068: else
8069: xmlExpDumpInt(buf, c, 0);
8070: break;
8071: case XML_EXP_COUNT: {
8072: char rep[40];
8073:
8074: c = expr->exp_left;
8075: if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8076: xmlExpDumpInt(buf, c, 1);
8077: else
8078: xmlExpDumpInt(buf, c, 0);
8079: if ((expr->exp_min == 0) && (expr->exp_max == 1)) {
8080: rep[0] = '?';
8081: rep[1] = 0;
8082: } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) {
8083: rep[0] = '*';
8084: rep[1] = 0;
8085: } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) {
8086: rep[0] = '+';
8087: rep[1] = 0;
8088: } else if (expr->exp_max == expr->exp_min) {
8089: snprintf(rep, 39, "{%d}", expr->exp_min);
8090: } else if (expr->exp_max < 0) {
8091: snprintf(rep, 39, "{%d,inf}", expr->exp_min);
8092: } else {
8093: snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max);
8094: }
8095: rep[39] = 0;
8096: xmlBufferWriteChar(buf, rep);
8097: break;
8098: }
8099: default:
8100: fprintf(stderr, "Error in tree\n");
8101: }
8102: if (glob)
8103: xmlBufferWriteChar(buf, ")");
8104: }
8105: /**
8106: * xmlExpDump:
8107: * @buf: a buffer to receive the output
8108: * @expr: the compiled expression
8109: *
8110: * Serialize the expression as compiled to the buffer
8111: */
8112: void
8113: xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) {
8114: if ((buf == NULL) || (expr == NULL))
8115: return;
8116: xmlExpDumpInt(buf, expr, 0);
8117: }
8118:
8119: /**
8120: * xmlExpMaxToken:
8121: * @expr: a compiled expression
8122: *
8123: * Indicate the maximum number of input a expression can accept
8124: *
8125: * Returns the maximum length or -1 in case of error
8126: */
8127: int
8128: xmlExpMaxToken(xmlExpNodePtr expr) {
8129: if (expr == NULL)
8130: return(-1);
8131: return(expr->c_max);
8132: }
8133:
8134: /**
8135: * xmlExpCtxtNbNodes:
8136: * @ctxt: an expression context
8137: *
8138: * Debugging facility provides the number of allocated nodes at a that point
8139: *
8140: * Returns the number of nodes in use or -1 in case of error
8141: */
8142: int
8143: xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) {
8144: if (ctxt == NULL)
8145: return(-1);
8146: return(ctxt->nb_nodes);
8147: }
8148:
8149: /**
8150: * xmlExpCtxtNbCons:
8151: * @ctxt: an expression context
8152: *
8153: * Debugging facility provides the number of allocated nodes over lifetime
8154: *
8155: * Returns the number of nodes ever allocated or -1 in case of error
8156: */
8157: int
8158: xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) {
8159: if (ctxt == NULL)
8160: return(-1);
8161: return(ctxt->nb_cons);
8162: }
8163:
8164: #endif /* LIBXML_EXPR_ENABLED */
8165: #define bottom_xmlregexp
8166: #include "elfgcchack.h"
8167: #endif /* LIBXML_REGEXP_ENABLED */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>